mirror of
				https://github.com/f4exb/sdrangel.git
				synced 2025-10-31 13:00:26 -04:00 
			
		
		
		
	Use more precise SIMD flags and detect actual x86_64 SIMD features
This commit is contained in:
		
							parent
							
								
									dbbbfa12ee
								
							
						
					
					
						commit
						63d6eea066
					
				| @ -61,10 +61,6 @@ if (NOT BUILD_DEBIAN) | ||||
|     find_package(SerialDV) | ||||
| endif() | ||||
| 
 | ||||
| IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|x86") | ||||
|  SET(USE_SSE "SSE4_1" CACHE STRING "Use SSE 4.1 SIMD instructions") | ||||
| ENDIF() | ||||
| 
 | ||||
| # MacOS Compatibility | ||||
| if(APPLE) | ||||
|         find_package(ICONV) | ||||
| @ -411,17 +407,87 @@ include_directories( | ||||
|     ${OPENGL_INCLUDE_DIR} | ||||
| ) | ||||
| 
 | ||||
| if(USE_SSE MATCHES SSE4_1) | ||||
|     if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse4.1" ) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse4.1" ) | ||||
|         add_definitions(-DUSE_SSE) | ||||
|     elseif(MSVC) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSE4_1" ) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSE4_1" ) | ||||
|         set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) | ||||
|         add_definitions (/D "_CRT_SECURE_NO_WARNINGS") | ||||
|         add_definitions(-DUSE_SSE) | ||||
| ############################################################################## | ||||
| 
 | ||||
| # Probe the build host for SIMD support. The machine type is read from | ||||
| # `uname -m` and the CPU feature list is grepped out of /proc/cpuinfo. | ||||
| # Each feature found records a HAS_<feature> cache entry and, for the | ||||
| # compilers handled below, appends the matching switch and USE_<feature> | ||||
| # definition. | ||||
| EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) | ||||
| message( STATUS "Architecture: ${ARCHITECTURE}" ) | ||||
| 
 | ||||
| # Expansions are quoted on purpose: if a probe produced no output, an | ||||
| # unquoted empty expansion disappears from the argument list and if() | ||||
| # fails to parse. | ||||
| # NOTE(review): /arch:SSE4_1 and /arch:SSSE3 do not appear among the | ||||
| # documented MSVC /arch values - confirm against the compiler manual. | ||||
| if ("${ARCHITECTURE}" MATCHES "x86_64|AMD64|x86") | ||||
|     EXECUTE_PROCESS( COMMAND grep flags /proc/cpuinfo OUTPUT_VARIABLE CPU_FLAGS ) | ||||
|     if ("${CPU_FLAGS}" MATCHES "avx2") | ||||
|         set(HAS_AVX2 ON CACHE BOOL "Architecture has AVX2 SIMD enabled") | ||||
|         if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mavx2" ) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mavx2" ) | ||||
|             message(STATUS "Use AVX2 SIMD instructions") | ||||
|             add_definitions(-DUSE_AVX2) | ||||
|         endif() | ||||
|     else() | ||||
|         # This branch pairs with the CPU-flag test, not the compiler test. | ||||
|         set(HAS_AVX2 OFF CACHE BOOL "Architecture does not have AVX2 SIMD enabled") | ||||
|     endif() | ||||
|     if ("${CPU_FLAGS}" MATCHES "sse4_1") | ||||
|         set(HAS_SSE4_1 ON CACHE BOOL "Architecture has SSE 4.1 SIMD enabled") | ||||
|         if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse4.1" ) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse4.1" ) | ||||
|             message(STATUS "Use SSE 4.1 SIMD instructions") | ||||
|             add_definitions(-DUSE_SSE4_1) | ||||
|         elseif(MSVC) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSE4_1" ) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSE4_1" ) | ||||
|             set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) | ||||
|             add_definitions (/D "_CRT_SECURE_NO_WARNINGS") | ||||
|             add_definitions(-DUSE_SSE4_1) | ||||
|         endif() | ||||
|     else() | ||||
|         set(HAS_SSE4_1 OFF CACHE BOOL "Architecture does not have SSE 4.1 SIMD enabled") | ||||
|     endif() | ||||
|     if ("${CPU_FLAGS}" MATCHES "ssse3") | ||||
|         set(HAS_SSSE3 ON CACHE BOOL "Architecture has SSSE3 SIMD enabled") | ||||
|         if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mssse3" ) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mssse3" ) | ||||
|             message(STATUS "Use SSSE3 SIMD instructions") | ||||
|             add_definitions(-DUSE_SSSE3) | ||||
|         elseif(MSVC) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSSE3" ) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSSE3" ) | ||||
|             set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) | ||||
|             add_definitions (/D "_CRT_SECURE_NO_WARNINGS") | ||||
|             add_definitions(-DUSE_SSSE3) | ||||
|         endif() | ||||
|     else() | ||||
|         set(HAS_SSSE3 OFF CACHE BOOL "Architecture does not have SSSE3 SIMD enabled") | ||||
|     endif() | ||||
|     if ("${CPU_FLAGS}" MATCHES "sse2") | ||||
|         set(HAS_SSE2 ON CACHE BOOL "Architecture has SSE2 SIMD enabled") | ||||
|         if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse2" ) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse2" ) | ||||
|             message(STATUS "Use SSE2 SIMD instructions") | ||||
|             add_definitions(-DUSE_SSE2) | ||||
|         elseif(MSVC) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSE2" ) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSE2" ) | ||||
|             set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) | ||||
|             add_definitions (/D "_CRT_SECURE_NO_WARNINGS") | ||||
|             add_definitions(-DUSE_SSE2) | ||||
|         endif() | ||||
|     else() | ||||
|         set(HAS_SSE2 OFF CACHE BOOL "Architecture does not have SSE2 SIMD enabled") | ||||
|     endif() | ||||
| elseif ("${ARCHITECTURE}" MATCHES "armv7l") | ||||
|     EXECUTE_PROCESS( COMMAND grep Features /proc/cpuinfo OUTPUT_VARIABLE CPU_FLAGS ) | ||||
|     if ("${CPU_FLAGS}" MATCHES "neon") | ||||
|         set(HAS_NEON ON CACHE BOOL "Architecture has NEON SIMD enabled") | ||||
|         if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|             set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon" ) | ||||
|             set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon" ) | ||||
|             message(STATUS "Use NEON SIMD instructions") | ||||
|             add_definitions(-DUSE_NEON) | ||||
|         endif() | ||||
|     else() | ||||
|         set(HAS_NEON OFF CACHE BOOL "Architecture does not have NEON SIMD enabled") | ||||
|     endif() | ||||
| endif() | ||||
| 
 | ||||
|  | ||||
| @ -1,39 +1,11 @@ | ||||
| project(cm256cc) | ||||
| 
 | ||||
| EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) | ||||
| message( STATUS "CM256cc: Architecture: ${ARCHITECTURE}" ) | ||||
| 
 | ||||
| if(${ARCHITECTURE} MATCHES "x86_64|AMD64|x86") | ||||
|     SET(USE_SIMD "SSSE3") | ||||
| elseif(${ARCHITECTURE} MATCHES "armv7l") | ||||
|     SET(USE_SIMD "NEON") | ||||
| endif() | ||||
| 
 | ||||
| message( STATUS "CM256cc: use SIMD: ${USE_SIMD}" ) | ||||
| 
 | ||||
| if(USE_SIMD MATCHES SSSE3) | ||||
|     if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mssse3" ) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mssse3" ) | ||||
|         message(STATUS "g++ SSSE3") | ||||
|         add_definitions(-DUSE_SIMD) | ||||
|     elseif(MSVC) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSSE3" ) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSSE3" ) | ||||
|         set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) | ||||
|         message(STATUS "MSVC SSSE3") | ||||
|         add_definitions (/D "_CRT_SECURE_NO_WARNINGS") | ||||
|         add_definitions(-DUSE_SIMD) | ||||
|     endif() | ||||
| elseif(USE_SIMD MATCHES NEON) | ||||
|     if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon" ) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon" ) | ||||
|         message(STATUS "g++ NEON") | ||||
|         add_definitions(-DUSE_NEON) | ||||
|     endif() | ||||
| # Report which SIMD variant is used for cm256cc, keyed on the HAS_SSSE3 / | ||||
| # HAS_NEON variables; with neither set, processing of this file stops | ||||
| # at return(). Messages carry this project's own prefix. | ||||
| if (HAS_SSSE3) | ||||
|     message(STATUS "CM256cc: use SSSE3 SIMD" ) | ||||
| elseif (HAS_NEON) | ||||
|     message(STATUS "CM256cc: use Neon SIMD" ) | ||||
| else() | ||||
|     message(STATUS "CM256cc: Unsupported architecture") | ||||
|     return() | ||||
| endif() | ||||
| 
 | ||||
|  | ||||
| @ -15,7 +15,7 @@ CONFIG(MINGW64):LIBCM256CCSRC = "D:\softs\cm256cc" | ||||
| INCLUDEPATH += $$LIBCM256CCSRC | ||||
| 
 | ||||
| DEFINES += __WINDOWS__=1 | ||||
| DEFINES += USE_SIMD=1 | ||||
| DEFINES += USE_SSSE3=1 | ||||
| QMAKE_CXXFLAGS += -msse4.1 | ||||
| 
 | ||||
| CONFIG(Release):build_subdir = release | ||||
|  | ||||
| @ -13,7 +13,7 @@ TARGET = modam | ||||
| INCLUDEPATH += $$PWD | ||||
| INCLUDEPATH += ../../../sdrbase | ||||
| 
 | ||||
| DEFINES += USE_SSE=1 | ||||
| DEFINES += USE_SSE4_1=1 | ||||
| QMAKE_CXXFLAGS += -msse4.1 | ||||
| 
 | ||||
| CONFIG(Release):build_subdir = release | ||||
|  | ||||
| @ -2,40 +2,10 @@ project(sdrdaemonfec) | ||||
| 
 | ||||
| find_package(LibNANOMSG) | ||||
| 
 | ||||
| EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) | ||||
| message( STATUS "SDRdaemonFEC: Architecture: ${ARCHITECTURE}" ) | ||||
| 
 | ||||
| if(${ARCHITECTURE} MATCHES "x86_64|AMD64|x86") | ||||
|     SET(USE_SIMD "SSSE3") | ||||
| elseif(${ARCHITECTURE} MATCHES "armv7l") | ||||
|     SET(USE_SIMD "NEON") | ||||
| endif() | ||||
| 
 | ||||
| message( STATUS "SDRdaemonFEC: use SIMD: ${USE_SIMD}" ) | ||||
| 
 | ||||
| if(USE_SIMD MATCHES SSSE3) | ||||
|     if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mssse3" ) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mssse3" ) | ||||
|         message(STATUS "SDRdaemonFEC: g++ SSSE3") | ||||
|         add_definitions(-DUSE_SIMD) | ||||
|         add_definitions(-DUSE_SSE) | ||||
|     elseif(MSVC) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSSE3" ) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSSE3" ) | ||||
|         set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) | ||||
|         message(STATUS "SDRdaemonFEC: MSVC SSSE3") | ||||
|         add_definitions (/D "_CRT_SECURE_NO_WARNINGS") | ||||
|         add_definitions(-DUSE_SIMD) | ||||
|         add_definitions(-DUSE_SSE) | ||||
|     endif() | ||||
| elseif(USE_SIMD MATCHES NEON) | ||||
|     if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) | ||||
|         set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon" ) | ||||
|         set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon" ) | ||||
|         message(STATUS "SDRdaemonFEC: g++ NEON") | ||||
|         add_definitions(-DUSE_NEON) | ||||
|     endif() | ||||
| if (HAS_SSSE3) | ||||
|     message(STATUS "SDRdaemonFEC: use SSSE3 SIMD" ) | ||||
| elseif (HAS_NEON) | ||||
|     message(STATUS "SDRdaemonFEC: use Neon SIMD" ) | ||||
| else() | ||||
|     message(STATUS "SDRdaemonFEC: Unsupported architecture") | ||||
|     return() | ||||
|  | ||||
| @ -23,9 +23,8 @@ INCLUDEPATH += ../../../lz4 | ||||
| INCLUDEPATH += $$LIBNANOMSGSRC/src | ||||
| INCLUDEPATH += $$LIBCM256CCSRC | ||||
| 
 | ||||
| DEFINES += USE_SIMD=1 | ||||
| DEFINES += USE_SSE=1 | ||||
| QMAKE_CXXFLAGS += -msse4.1 | ||||
| DEFINES += USE_SSSE3=1 | ||||
| QMAKE_CXXFLAGS += -mssse3 | ||||
| 
 | ||||
| CONFIG(Release):build_subdir = release | ||||
| CONFIG(Debug):build_subdir = debug | ||||
|  | ||||
| @ -18,7 +18,7 @@ | ||||
| #define INCLUDE_GPL_DSP_DECIMATORS_H_ | ||||
| 
 | ||||
| #include "dsp/dsptypes.h" | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
| #include "dsp/inthalfbandfiltereo1.h" | ||||
| #else | ||||
| #include "dsp/inthalfbandfilterdb.h" | ||||
| @ -124,7 +124,7 @@ public: | ||||
| 	void decimate64_cen(SampleVector::iterator* it, const T* buf, qint32 len); | ||||
| 
 | ||||
| private: | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
|     IntHalfbandFilterEO1<DECIMATORS_HB_FILTER_ORDER> m_decimator2;  // 1st stages
 | ||||
|     IntHalfbandFilterEO1<DECIMATORS_HB_FILTER_ORDER> m_decimator4;  // 2nd stages
 | ||||
|     IntHalfbandFilterEO1<DECIMATORS_HB_FILTER_ORDER> m_decimator8;  // 3rd stages
 | ||||
|  | ||||
| @ -1,8 +1,8 @@ | ||||
| #ifndef INCLUDE_INTERPOLATOR_H | ||||
| #define INCLUDE_INTERPOLATOR_H | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #include <immintrin.h> | ||||
| #ifdef USE_SSE2 | ||||
| #include <emmintrin.h> | ||||
| #endif | ||||
| #include "dsp/dsptypes.h" | ||||
| #include "util/export.h" | ||||
| @ -125,7 +125,7 @@ private: | ||||
| 	{ | ||||
| 		if (phase < 0) | ||||
| 			phase = 0; | ||||
| #if USE_SSE | ||||
| #if USE_SSE2 | ||||
| 		// beware of the ringbuffer
 | ||||
| 		if(m_ptr == 0) { | ||||
| 			// only one straight block
 | ||||
|  | ||||
| @ -22,7 +22,7 @@ | ||||
| #ifndef SDRBASE_DSP_INTHALFBANDFILTEREO_H_ | ||||
| #define SDRBASE_DSP_INTHALFBANDFILTEREO_H_ | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
| #include <smmintrin.h> | ||||
| #endif | ||||
| 
 | ||||
| @ -464,7 +464,7 @@ protected: | ||||
|         qint32 iAcc = 0; | ||||
|         qint32 qAcc = 0; | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
| //#warning "IntHalfbandFiler SIMD"
 | ||||
|         const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs; | ||||
|         __m128i sumI = _mm_setzero_si128(); | ||||
| @ -551,7 +551,7 @@ protected: | ||||
|         qint32 iAcc = 0; | ||||
|         qint32 qAcc = 0; | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
|         const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs; | ||||
|         __m128i sumI = _mm_setzero_si128(); | ||||
|         __m128i sumQ = _mm_setzero_si128(); | ||||
|  | ||||
| @ -22,10 +22,14 @@ | ||||
| #ifndef SDRBASE_DSP_INTHALFBANDFILTEREO2_H_ | ||||
| #define SDRBASE_DSP_INTHALFBANDFILTEREO2_H_ | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
| #include <smmintrin.h> | ||||
| #endif | ||||
| 
 | ||||
| #ifdef USE_NEON | ||||
| #include <arm_neon.h> | ||||
| #endif | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include "dsp/dsptypes.h" | ||||
| #include "dsp/hbfiltertraits.h" | ||||
| @ -484,8 +488,7 @@ protected: | ||||
|         qint32 iAcc = 0; | ||||
|         qint32 qAcc = 0; | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| //#warning "IntHalfbandFiler SIMD"
 | ||||
| #if defined(USE_SSE4_1) | ||||
|         const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs; | ||||
|         __m128i sumI = _mm_setzero_si128(); | ||||
|         __m128i sumQ = _mm_setzero_si128(); | ||||
| @ -528,6 +531,47 @@ protected: | ||||
|         sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8)); | ||||
|         sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4)); | ||||
|         qAcc = _mm_cvtsi128_si32(sumQ); | ||||
| #elif defined(USE_NEON) | ||||
|         int32x4_t sumI = vdupq_n_s32(0); | ||||
|         int32x4_t sumQ = vdupq_n_s32(0); | ||||
|         int32x4_t sa, sb, sh; | ||||
| 
 | ||||
|         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++) | ||||
|         { | ||||
|             sh = vld1_s32(&h[4*i]); | ||||
| 
 | ||||
|             if ((m_ptrB % 2) == 0) | ||||
|             { | ||||
|                 sa = vld1q_s32(&(m_evenA[0][a])); | ||||
|                 sb = vld1q_s32(&(m_evenB[0][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
| 
 | ||||
|                 sa = vld1q_s32(&(m_evenA[1][a])); | ||||
|                 sb = vld1q_s32(&(m_evenB[1][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 sa = vld1q_s32(&(m_oddA[0][a])); | ||||
|                 sb = vld1q_s32(&(m_oddB[0][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
| 
 | ||||
|                 sa = vld1q_s32(&(m_oddA[1][a])); | ||||
|                 sb = vld1q_s32(&(m_oddB[1][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
|             } | ||||
| 
 | ||||
|             a += 4; | ||||
|             b += 4; | ||||
|         } | ||||
| 
 | ||||
|         int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI)); | ||||
|         int32x2_t sumI2 = vpadd_s32(sumI1, sumI1); | ||||
|         iAcc = vget_lane_s32(sumI2, 0); | ||||
| 
 | ||||
|         int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ)); | ||||
|         int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1); | ||||
|         qAcc = vget_lane_s32(sumQ2, 0); | ||||
| #else | ||||
|         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 4; i++) | ||||
|         { | ||||
| @ -570,7 +614,7 @@ protected: | ||||
|         qint32 iAcc = 0; | ||||
|         qint32 qAcc = 0; | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #if defined(USE_SSE4_1) | ||||
| //#warning "IntHalfbandFiler SIMD"
 | ||||
|         const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs; | ||||
|         __m128i sumI = _mm_setzero_si128(); | ||||
| @ -614,6 +658,47 @@ protected: | ||||
|         sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8)); | ||||
|         sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4)); | ||||
|         qAcc = _mm_cvtsi128_si32(sumQ); | ||||
| #elif defined(USE_NEON) | ||||
|         int32x4_t sumI = vdupq_n_s32(0); | ||||
|         int32x4_t sumQ = vdupq_n_s32(0); | ||||
|         int32x4_t sa, sb, sh; | ||||
| 
 | ||||
|         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++) | ||||
|         { | ||||
|             sh = vld1_s32(&h[4*i]); | ||||
| 
 | ||||
|             if ((m_ptrB % 2) == 0) | ||||
|             { | ||||
|                 sa = vld1q_s32(&(m_evenA[0][a])); | ||||
|                 sb = vld1q_s32(&(m_evenB[0][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
| 
 | ||||
|                 sa = vld1q_s32(&(m_evenA[1][a])); | ||||
|                 sb = vld1q_s32(&(m_evenB[1][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 sa = vld1q_s32(&(m_oddA[0][a])); | ||||
|                 sb = vld1q_s32(&(m_oddB[0][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
| 
 | ||||
|                 sa = vld1q_s32(&(m_oddA[1][a])); | ||||
|                 sb = vld1q_s32(&(m_oddB[1][b])); | ||||
|                 sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); | ||||
|             } | ||||
| 
 | ||||
|             a += 4; | ||||
|             b += 4; | ||||
|         } | ||||
| 
 | ||||
|         int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI)); | ||||
|         int32x2_t sumI2 = vpadd_s32(sumI1, sumI1); | ||||
|         iAcc = vget_lane_s32(sumI2, 0); | ||||
| 
 | ||||
|         int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ)); | ||||
|         int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1); | ||||
|         qAcc = vget_lane_s32(sumQ2, 0); | ||||
| #else | ||||
|         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 4; i++) | ||||
|         { | ||||
|  | ||||
| @ -201,7 +201,7 @@ void UpChannelizer::applyConfiguration() | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
| UpChannelizer::FilterStage::FilterStage(Mode mode) : | ||||
|     m_filter(new IntHalfbandFilterEO2<UPCHANNELIZER_HB_FILTER_ORDER>), | ||||
|     m_workFunction(0) | ||||
|  | ||||
| @ -23,7 +23,7 @@ | ||||
| #include <QMutex> | ||||
| #include "util/export.h" | ||||
| #include "util/message.h" | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
| #include "dsp/inthalfbandfiltereo2.h" | ||||
| #else | ||||
| #include "dsp/inthalfbandfilterdb.h" | ||||
| @ -73,7 +73,7 @@ protected: | ||||
|             ModeUpperHalf | ||||
|         }; | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #ifdef USE_SSE4_1 | ||||
|         typedef bool (IntHalfbandFilterEO2<UPCHANNELIZER_HB_FILTER_ORDER>::*WorkFunction)(Sample* sIn, Sample *sOut); | ||||
|         IntHalfbandFilterEO2<UPCHANNELIZER_HB_FILTER_ORDER>* m_filter; | ||||
| #else | ||||
|  | ||||
| @ -15,8 +15,8 @@ | ||||
| // along with this program. If not, see <http://www.gnu.org/licenses/>.          //
 | ||||
| ///////////////////////////////////////////////////////////////////////////////////
 | ||||
| 
 | ||||
| #ifdef USE_SSE | ||||
| #include <immintrin.h> | ||||
| #ifdef USE_SSE2 | ||||
| #include <emmintrin.h> | ||||
| #endif | ||||
| 
 | ||||
| #include <QMouseEvent> | ||||
| @ -381,83 +381,83 @@ void GLSpectrum::updateHistogram(const std::vector<Real>& spectrum) | ||||
| 
 | ||||
| 	m_currentSpectrum = &spectrum; // Store spectrum for current spectrum line display
 | ||||
| 
 | ||||
| #ifndef USE_SSE | ||||
| 	for(int i = 0; i < m_fftSize; i++) { | ||||
| 		int v = (int)((spectrum[i] - m_referenceLevel) * 100.0 / m_powerRange + 100.0); | ||||
| #ifdef USE_SSE2 | ||||
|     if(m_decay >= 0) { // normal
 | ||||
|         const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; | ||||
|         const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; | ||||
|         const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; | ||||
| 
 | ||||
| 		if ((v >= 0) && (v <= 99)) { | ||||
| 			b = m_histogram + i * 100 + v; | ||||
| 			if(*b < 220) | ||||
| 				*b += m_histogramStroke; // was 4
 | ||||
| 			else if(*b < 239) | ||||
| 				*b += 1; | ||||
| 		} | ||||
| 	} | ||||
|         for(int i = 0; i < m_fftSize; i += 4) { | ||||
|             __m128 abc = _mm_loadu_ps (&spectrum[i]); | ||||
|             abc = _mm_sub_ps(abc, refl); | ||||
|             abc = _mm_mul_ps(abc, mul); | ||||
|             abc = _mm_div_ps(abc, power); | ||||
|             abc =  _mm_add_ps(abc, mul); | ||||
|             __m128i result = _mm_cvtps_epi32(abc); | ||||
| 
 | ||||
|             for(int j = 0; j < 4; j++) { | ||||
|                 int v = ((int*)&result)[j]; | ||||
|                 if((v >= 0) && (v <= 99)) { | ||||
|                     b = m_histogram + (i + j) * 100 + v; | ||||
|                     if(*b < 220) | ||||
|                         *b += m_histogramStroke; // was 4
 | ||||
|                     else if(*b < 239) | ||||
|                         *b += 1; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } else { // draw double pixels
 | ||||
|         int add = -m_decay * 4; | ||||
|         const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; | ||||
|         const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; | ||||
|         const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; | ||||
| 
 | ||||
|         for(int i = 0; i < m_fftSize; i += 4) { | ||||
|             __m128 abc = _mm_loadu_ps (&spectrum[i]); | ||||
|             abc = _mm_sub_ps(abc, refl); | ||||
|             abc = _mm_mul_ps(abc, mul); | ||||
|             abc = _mm_div_ps(abc, power); | ||||
|             abc =  _mm_add_ps(abc, mul); | ||||
|             __m128i result = _mm_cvtps_epi32(abc); | ||||
| 
 | ||||
|             for(int j = 0; j < 4; j++) { | ||||
|                 int v = ((int*)&result)[j]; | ||||
|                 if((v >= 1) && (v <= 98)) { | ||||
|                     b = m_histogram + (i + j) * 100 + v; | ||||
|                     if(b[-1] < 220) | ||||
|                         b[-1] += add; | ||||
|                     else if(b[-1] < 239) | ||||
|                         b[-1] += 1; | ||||
|                     if(b[0] < 220) | ||||
|                         b[0] += add; | ||||
|                     else if(b[0] < 239) | ||||
|                         b[0] += 1; | ||||
|                     if(b[1] < 220) | ||||
|                         b[1] += add; | ||||
|                     else if(b[1] < 239) | ||||
|                         b[1] += 1; | ||||
|                 } else if((v >= 0) && (v <= 99)) { | ||||
|                     b = m_histogram + (i + j) * 100 + v; | ||||
|                     if(*b < 220) | ||||
|                         *b += add; | ||||
|                     else if(*b < 239) | ||||
|                         *b += 1; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| #else | ||||
| 	if(m_decay >= 0) { // normal
 | ||||
| 		const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; | ||||
| 		const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; | ||||
| 		const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; | ||||
|     for(int i = 0; i < m_fftSize; i++) { | ||||
|         int v = (int)((spectrum[i] - m_referenceLevel) * 100.0 / m_powerRange + 100.0); | ||||
| 
 | ||||
| 		for(int i = 0; i < m_fftSize; i += 4) { | ||||
| 			__m128 abc = _mm_loadu_ps (&spectrum[i]); | ||||
| 			abc = _mm_sub_ps(abc, refl); | ||||
| 			abc = _mm_mul_ps(abc, mul); | ||||
| 			abc = _mm_div_ps(abc, power); | ||||
| 			abc =  _mm_add_ps(abc, mul); | ||||
| 			__m128i result = _mm_cvtps_epi32(abc); | ||||
| 
 | ||||
| 			for(int j = 0; j < 4; j++) { | ||||
| 				int v = ((int*)&result)[j]; | ||||
| 				if((v >= 0) && (v <= 99)) { | ||||
| 					b = m_histogram + (i + j) * 100 + v; | ||||
| 					if(*b < 220) | ||||
| 						*b += m_histogramStroke; // was 4
 | ||||
| 					else if(*b < 239) | ||||
| 						*b += 1; | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} else { // draw double pixels
 | ||||
| 		int add = -m_decay * 4; | ||||
| 		const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; | ||||
| 		const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; | ||||
| 		const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; | ||||
| 
 | ||||
| 		for(int i = 0; i < m_fftSize; i += 4) { | ||||
| 			__m128 abc = _mm_loadu_ps (&spectrum[i]); | ||||
| 			abc = _mm_sub_ps(abc, refl); | ||||
| 			abc = _mm_mul_ps(abc, mul); | ||||
| 			abc = _mm_div_ps(abc, power); | ||||
| 			abc =  _mm_add_ps(abc, mul); | ||||
| 			__m128i result = _mm_cvtps_epi32(abc); | ||||
| 
 | ||||
| 			for(int j = 0; j < 4; j++) { | ||||
| 				int v = ((int*)&result)[j]; | ||||
| 				if((v >= 1) && (v <= 98)) { | ||||
| 					b = m_histogram + (i + j) * 100 + v; | ||||
| 					if(b[-1] < 220) | ||||
| 						b[-1] += add; | ||||
| 					else if(b[-1] < 239) | ||||
| 						b[-1] += 1; | ||||
| 					if(b[0] < 220) | ||||
| 						b[0] += add; | ||||
| 					else if(b[0] < 239) | ||||
| 						b[0] += 1; | ||||
| 					if(b[1] < 220) | ||||
| 						b[1] += add; | ||||
| 					else if(b[1] < 239) | ||||
| 						b[1] += 1; | ||||
| 				} else if((v >= 0) && (v <= 99)) { | ||||
| 					b = m_histogram + (i + j) * 100 + v; | ||||
| 					if(*b < 220) | ||||
| 						*b += add; | ||||
| 					else if(*b < 239) | ||||
| 						*b += 1; | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
|         if ((v >= 0) && (v <= 99)) { | ||||
|             b = m_histogram + i * 100 + v; | ||||
|             if(*b < 220) | ||||
|                 *b += m_histogramStroke; // was 4
 | ||||
|             else if(*b < 239) | ||||
|                 *b += 1; | ||||
|         } | ||||
|     } | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -14,8 +14,8 @@ INCLUDEPATH += $$PWD | ||||
| DEFINES += USE_KISSFFT=1 | ||||
| DEFINES += __WINDOWS__=1 | ||||
| DEFINES += DSD_USE_SERIALDV=1 | ||||
| DEFINES += USE_SSE=1 | ||||
| QMAKE_CXXFLAGS += -msse4.1 | ||||
| DEFINES += USE_SSE2=1 | ||||
| QMAKE_CXXFLAGS += -msse2 | ||||
| 
 | ||||
| CONFIG(Release):build_subdir = release | ||||
| CONFIG(Debug):build_subdir = debug | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user