From 63d6eea0665e6beeab6329d166fa0032829607b4 Mon Sep 17 00:00:00 2001 From: f4exb Date: Mon, 7 Nov 2016 00:42:57 +0100 Subject: [PATCH] Use more precise SIMD flags and detect actual x86_64 SIMD features --- CMakeLists.txt | 96 +++++++++-- cm256cc/CMakeLists.txt | 38 +---- cm256cc/cm256cc.pro | 2 +- plugins/channeltx/modam/modam.pro | 2 +- .../samplesource/sdrdaemonfec/CMakeLists.txt | 38 +---- .../sdrdaemonfec/sdrdaemonfec.pro | 5 +- sdrbase/dsp/decimators.h | 4 +- sdrbase/dsp/interpolator.h | 6 +- sdrbase/dsp/inthalfbandfiltereo1.h | 6 +- sdrbase/dsp/inthalfbandfiltereo2.h | 93 ++++++++++- sdrbase/dsp/upchannelizer.cpp | 2 +- sdrbase/dsp/upchannelizer.h | 4 +- sdrbase/gui/glspectrum.cpp | 152 +++++++++--------- sdrbase/sdrbase.pro | 4 +- 14 files changed, 272 insertions(+), 180 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a5afa16c..f3defc825 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,10 +61,6 @@ if (NOT BUILD_DEBIAN) find_package(SerialDV) endif() -IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|x86") - SET(USE_SSE "SSE4_1" CACHE STRING "Use SSE 4.1 SIMD instructions") -ENDIF() - # MacOS Compatibility if(APPLE) find_package(ICONV) @@ -411,17 +407,87 @@ include_directories( ${OPENGL_INCLUDE_DIR} ) -if(USE_SSE MATCHES SSE4_1) - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse4.1" ) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse4.1" ) - add_definitions(-DUSE_SSE) - elseif(MSVC) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSE4_1" ) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSE4_1" ) - set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) - add_definitions (/D "_CRT_SECURE_NO_WARNINGS") - add_definitions(-DUSE_SSE) +############################################################################## + +EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) +message( STATUS "Architecture: ${ARCHITECTURE}" ) + +if (${ARCHITECTURE} MATCHES "x86_64|AMD64|x86") + EXECUTE_PROCESS( COMMAND grep flags /proc/cpuinfo OUTPUT_VARIABLE CPU_FLAGS ) + if (${CPU_FLAGS} MATCHES "avx2") + set(HAS_AVX2 ON CACHE BOOL "Architecture has AVX2 SIMD enabled") + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mavx2" ) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mavx2" ) + message(STATUS "Use AVX2 SIMD instructions") + add_definitions(-DUSE_AVX2) + else() + set(HAS_AVX2 OFF CACHE BOOL "Architecture does not have AVX2 SIMD enabled") + endif() + endif() + if (${CPU_FLAGS} MATCHES "sse4_1") + set(HAS_SSE4_1 ON CACHE BOOL "Architecture has SSE 4.1 SIMD enabled") + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse4.1" ) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse4.1" ) + message(STATUS "Use SSE 4.1 SIMD instructions") + add_definitions(-DUSE_SSE4_1) + elseif(MSVC) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSE4_1" ) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSE4_1" ) + set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) + add_definitions (/D "_CRT_SECURE_NO_WARNINGS") + add_definitions(-DUSE_SSE4_1) + endif() + else() + set(HAS_SSE4_1 OFF CACHE BOOL "Architecture does not have SSE 4.1 SIMD enabled") + endif() + if (${CPU_FLAGS} MATCHES "ssse3") + set(HAS_SSSE3 ON CACHE BOOL "Architecture has SSSE3 SIMD enabled") + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mssse3" ) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mssse3" ) + message(STATUS "Use SSSE3 SIMD instructions") + add_definitions(-DUSE_SSSE3) + elseif(MSVC) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSSE3" ) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSSE3" ) + set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) + add_definitions (/D "_CRT_SECURE_NO_WARNINGS") + add_definitions(-DUSE_SSSE3) + endif() + else() + set(HAS_SSSE3 OFF CACHE BOOL "Architecture does not have SSSE3 SIMD enabled") + endif() + if (${CPU_FLAGS} MATCHES "sse2") + set(HAS_SSE2 ON CACHE BOOL "Architecture has SSE2 SIMD enabled") + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse2" ) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse2" ) + message(STATUS "Use SSE2 SIMD instructions") + add_definitions(-DUSE_SSE2) + elseif(MSVC) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSE2" ) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSE2" ) + set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) + add_definitions (/D "_CRT_SECURE_NO_WARNINGS") + add_definitions(-DUSE_SSE2) + endif() + else() + set(HAS_SSE2 OFF CACHE BOOL "Architecture does not have SSE2 SIMD enabled") + endif() +elseif (${ARCHITECTURE} MATCHES "armv7l") + EXECUTE_PROCESS( COMMAND grep Features /proc/cpuinfo OUTPUT_VARIABLE CPU_FLAGS ) + if (${CPU_FLAGS} MATCHES "neon") + set(HAS_NEON ON CACHE BOOL "Architecture has NEON SIMD enabled") + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon" ) + set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon" ) + message(STATUS "Use NEON SIMD instructions") + add_definitions(-DUSE_NEON) + endif() + else() + set(HAS_NEON OFF CACHE BOOL "Architecture does not have NEON SIMD enabled") endif() endif() diff --git a/cm256cc/CMakeLists.txt b/cm256cc/CMakeLists.txt index 660b4309d..6405b9322 100644 --- a/cm256cc/CMakeLists.txt +++ b/cm256cc/CMakeLists.txt @@ -1,39 +1,11 @@ project(cm256cc) -EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) -message( STATUS "CM256cc: Architecture: ${ARCHITECTURE}" ) - -if(${ARCHITECTURE} MATCHES "x86_64|AMD64|x86") - SET(USE_SIMD "SSSE3") -elseif(${ARCHITECTURE} MATCHES "armv7l") - SET(USE_SIMD "NEON") -endif() - -message( STATUS "CM256cc: use SIMD: ${USE_SIMD}" ) - -if(USE_SIMD MATCHES SSSE3) - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mssse3" ) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mssse3" ) - message(STATUS "g++ SSSE3") - add_definitions(-DUSE_SIMD) - elseif(MSVC) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSSE3" ) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSSE3" ) - set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) - message(STATUS "MSVC SSSE3") - add_definitions (/D "_CRT_SECURE_NO_WARNINGS") - add_definitions(-DUSE_SIMD) - endif() -elseif(USE_SIMD MATCHES NEON) - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon" ) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon" ) - message(STATUS "g++ NEON") - add_definitions(-DUSE_NEON) - endif() +if (HAS_SSSE3) + message(STATUS "SDRdaemonFEC: use SSSE3 SIMD" ) +elseif (HAS_NEON) + message(STATUS "SDRdaemonFEC: use Neon SIMD" ) else() - message(STATUS "CM256cc: Unsupported architecture") + message(STATUS "SDRdaemonFEC: Unsupported architecture") return() endif() diff --git a/cm256cc/cm256cc.pro b/cm256cc/cm256cc.pro index 0896f38b2..0b50b7a39 100644 --- a/cm256cc/cm256cc.pro +++ b/cm256cc/cm256cc.pro @@ -15,7 +15,7 @@ CONFIG(MINGW64):LIBCM256CCSRC = "D:\softs\cm256cc" INCLUDEPATH += $$LIBCM256CCSRC DEFINES += __WINDOWS__=1 -DEFINES += USE_SIMD=1 +DEFINES += USE_SSSE3=1 QMAKE_CXXFLAGS += -msse4.1 CONFIG(Release):build_subdir = release diff --git a/plugins/channeltx/modam/modam.pro b/plugins/channeltx/modam/modam.pro index 7fd04a043..e33564037 100644 --- a/plugins/channeltx/modam/modam.pro +++ b/plugins/channeltx/modam/modam.pro @@ -13,7 +13,7 @@ TARGET = modam INCLUDEPATH += $$PWD INCLUDEPATH += ../../../sdrbase -DEFINES += USE_SSE=1 +DEFINES += USE_SSE4_1=1 QMAKE_CXXFLAGS += -msse4.1 CONFIG(Release):build_subdir = release diff --git a/plugins/samplesource/sdrdaemonfec/CMakeLists.txt b/plugins/samplesource/sdrdaemonfec/CMakeLists.txt index 69ae74ff2..a010b0e45 100644 --- a/plugins/samplesource/sdrdaemonfec/CMakeLists.txt +++ b/plugins/samplesource/sdrdaemonfec/CMakeLists.txt @@ -2,40 +2,10 @@ project(sdrdaemonfec) find_package(LibNANOMSG) -EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) -message( STATUS "SDRdaemonFEC: Architecture: ${ARCHITECTURE}" ) - -if(${ARCHITECTURE} MATCHES "x86_64|AMD64|x86") - SET(USE_SIMD "SSSE3") -elseif(${ARCHITECTURE} MATCHES "armv7l") - SET(USE_SIMD "NEON") -endif() - -message( STATUS "SDRdaemonFEC: use SIMD: ${USE_SIMD}" ) - -if(USE_SIMD MATCHES SSSE3) - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mssse3" ) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mssse3" ) - message(STATUS "SDRdaemonFEC: g++ SSSE3") - add_definitions(-DUSE_SIMD) - add_definitions(-DUSE_SSE) - elseif(MSVC) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /arch:SSSE3" ) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oi /GL /Ot /Ox /arch:SSSE3" ) - set( CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG" ) - message(STATUS "SDRdaemonFEC: MSVC SSSE3") - add_definitions (/D "_CRT_SECURE_NO_WARNINGS") - add_definitions(-DUSE_SIMD) - add_definitions(-DUSE_SSE) - endif() -elseif(USE_SIMD MATCHES NEON) - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfpu=neon" ) - set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mfpu=neon" ) - message(STATUS "SDRdaemonFEC: g++ NEON") - add_definitions(-DUSE_NEON) - endif() +if (HAS_SSSE3) + message(STATUS "SDRdaemonFEC: use SSSE3 SIMD" ) +elseif (HAS_NEON) + message(STATUS "SDRdaemonFEC: use Neon SIMD" ) else() message(STATUS "SDRdaemonFEC: Unsupported architecture") return() diff --git a/plugins/samplesource/sdrdaemonfec/sdrdaemonfec.pro b/plugins/samplesource/sdrdaemonfec/sdrdaemonfec.pro index cfe657edf..6da09f089 100644 --- a/plugins/samplesource/sdrdaemonfec/sdrdaemonfec.pro +++ b/plugins/samplesource/sdrdaemonfec/sdrdaemonfec.pro @@ -23,9 +23,8 @@ INCLUDEPATH += ../../../lz4 INCLUDEPATH += $$LIBNANOMSGSRC/src INCLUDEPATH += $$LIBCM256CCSRC -DEFINES += USE_SIMD=1 -DEFINES += USE_SSE=1 -QMAKE_CXXFLAGS += -msse4.1 +DEFINES += USE_SSSE3=1 +QMAKE_CXXFLAGS += -mssse3 CONFIG(Release):build_subdir = release CONFIG(Debug):build_subdir = debug diff --git a/sdrbase/dsp/decimators.h b/sdrbase/dsp/decimators.h index fdee166b4..4aa41c3e2 100644 --- a/sdrbase/dsp/decimators.h +++ b/sdrbase/dsp/decimators.h @@ -18,7 +18,7 @@ #define INCLUDE_GPL_DSP_DECIMATORS_H_ #include "dsp/dsptypes.h" -#ifdef USE_SSE +#ifdef USE_SSE4_1 #include "dsp/inthalfbandfiltereo1.h" #else #include "dsp/inthalfbandfilterdb.h" @@ -124,7 +124,7 @@ public: void decimate64_cen(SampleVector::iterator* it, const T* buf, qint32 len); private: -#ifdef USE_SSE +#ifdef USE_SSE4_1 IntHalfbandFilterEO1 m_decimator2; // 1st stages IntHalfbandFilterEO1 m_decimator4; // 2nd stages IntHalfbandFilterEO1 m_decimator8; // 3rd stages diff --git a/sdrbase/dsp/interpolator.h b/sdrbase/dsp/interpolator.h index 8cf3f29a0..b504987af 100644 --- a/sdrbase/dsp/interpolator.h +++ b/sdrbase/dsp/interpolator.h @@ -1,8 +1,8 @@ #ifndef INCLUDE_INTERPOLATOR_H #define INCLUDE_INTERPOLATOR_H -#ifdef USE_SSE -#include +#ifdef USE_SSE2 +#include #endif #include "dsp/dsptypes.h" #include "util/export.h" @@ -125,7 +125,7 @@ private: { if (phase < 0) phase = 0; -#if USE_SSE +#if USE_SSE2 // beware of the ringbuffer if(m_ptr == 0) { // only one straight block diff --git a/sdrbase/dsp/inthalfbandfiltereo1.h b/sdrbase/dsp/inthalfbandfiltereo1.h index 7eb77db07..db196e060 100644 --- a/sdrbase/dsp/inthalfbandfiltereo1.h +++ b/sdrbase/dsp/inthalfbandfiltereo1.h @@ -22,7 +22,7 @@ #ifndef SDRBASE_DSP_INTHALFBANDFILTEREO_H_ #define SDRBASE_DSP_INTHALFBANDFILTEREO_H_ -#ifdef USE_SSE +#ifdef USE_SSE4_1 #include #endif @@ -464,7 +464,7 @@ protected: qint32 iAcc = 0; qint32 qAcc = 0; -#ifdef USE_SSE +#ifdef USE_SSE4_1 //#warning "IntHalfbandFiler SIMD" const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; __m128i sumI = _mm_setzero_si128(); @@ -551,7 +551,7 @@ protected: qint32 iAcc = 0; qint32 qAcc = 0; -#ifdef USE_SSE +#ifdef USE_SSE4_1 const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; __m128i sumI = _mm_setzero_si128(); __m128i sumQ = _mm_setzero_si128(); diff --git a/sdrbase/dsp/inthalfbandfiltereo2.h b/sdrbase/dsp/inthalfbandfiltereo2.h index e58d7cf88..79761d9a1 100644 --- a/sdrbase/dsp/inthalfbandfiltereo2.h +++ b/sdrbase/dsp/inthalfbandfiltereo2.h @@ -22,10 +22,14 @@ #ifndef SDRBASE_DSP_INTHALFBANDFILTEREO2_H_ #define SDRBASE_DSP_INTHALFBANDFILTEREO2_H_ -#ifdef USE_SSE +#ifdef USE_SSE4_1 #include #endif +#ifdef USE_NEON +#include +#endif + #include #include "dsp/dsptypes.h" #include "dsp/hbfiltertraits.h" @@ -484,8 +488,7 @@ protected: qint32 iAcc = 0; qint32 qAcc = 0; -#ifdef USE_SSE -//#warning "IntHalfbandFiler SIMD" +#if defined(USE_SSE4_1) const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; __m128i sumI = _mm_setzero_si128(); __m128i sumQ = _mm_setzero_si128(); @@ -528,6 +531,47 @@ protected: sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8)); sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4)); qAcc = _mm_cvtsi128_si32(sumQ); +#elif defined(USE_NEON) + int32x4_t sumI = vdupq_n_s32(0); + int32x4_t sumQ = vdupq_n_s32(0); + int32x4_t sa, sb, sh; + + for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) + { + sh = vld1_s32(&h[4*i]); + + if ((m_ptrB % 2) == 0) + { + sa = vld1q_s32(&(m_evenA[0][a])); + sb = vld1q_s32(&(m_evenB[0][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + + sa = vld1q_s32(&(m_evenA[1][a])); + sb = vld1q_s32(&(m_evenB[1][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + } + else + { + sa = vld1q_s32(&(m_oddA[0][a])); + sb = vld1q_s32(&(m_oddB[0][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + + sa = vld1q_s32(&(m_oddA[1][a])); + sb = vld1q_s32(&(m_oddB[1][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + } + + a += 4; + b += 4; + } + + int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI)); + int32x2_t sumI2 = vpadd_s32(sumI1, sumI1); + iAcc = vget_lane_s32(sumI2, 0); + + int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ)); + int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1); + qAcc = vget_lane_s32(sumQ2, 0); #else for (int i = 0; i < HBFIRFilterTraits::hbOrder / 4; i++) { @@ -570,7 +614,7 @@ protected: qint32 iAcc = 0; qint32 qAcc = 0; -#ifdef USE_SSE +#if defined(USE_SSE4_1) //#warning "IntHalfbandFiler SIMD" const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; __m128i sumI = _mm_setzero_si128(); @@ -614,6 +658,47 @@ protected: sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8)); sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4)); qAcc = _mm_cvtsi128_si32(sumQ); +#elif defined(USE_NEON) + int32x4_t sumI = vdupq_n_s32(0); + int32x4_t sumQ = vdupq_n_s32(0); + int32x4_t sa, sb, sh; + + for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) + { + sh = vld1_s32(&h[4*i]); + + if ((m_ptrB % 2) == 0) + { + sa = vld1q_s32(&(m_evenA[0][a])); + sb = vld1q_s32(&(m_evenB[0][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + + sa = vld1q_s32(&(m_evenA[1][a])); + sb = vld1q_s32(&(m_evenB[1][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + } + else + { + sa = vld1q_s32(&(m_oddA[0][a])); + sb = vld1q_s32(&(m_oddB[0][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + + sa = vld1q_s32(&(m_oddA[1][a])); + sb = vld1q_s32(&(m_oddB[1][b])); + sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh); + } + + a += 4; + b += 4; + } + + int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI)); + int32x2_t sumI2 = vpadd_s32(sumI1, sumI1); + iAcc = vget_lane_s32(sumI2, 0); + + int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ)); + int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1); + qAcc = vget_lane_s32(sumQ2, 0); #else for (int i = 0; i < HBFIRFilterTraits::hbOrder / 4; i++) { diff --git a/sdrbase/dsp/upchannelizer.cpp b/sdrbase/dsp/upchannelizer.cpp index a7eeb6d47..dedf9eb57 100644 --- a/sdrbase/dsp/upchannelizer.cpp +++ b/sdrbase/dsp/upchannelizer.cpp @@ -201,7 +201,7 @@ void UpChannelizer::applyConfiguration() } } -#ifdef USE_SSE +#ifdef USE_SSE4_1 UpChannelizer::FilterStage::FilterStage(Mode mode) : m_filter(new IntHalfbandFilterEO2), m_workFunction(0) diff --git a/sdrbase/dsp/upchannelizer.h b/sdrbase/dsp/upchannelizer.h index 80d290d35..f5c372896 100644 --- a/sdrbase/dsp/upchannelizer.h +++ b/sdrbase/dsp/upchannelizer.h @@ -23,7 +23,7 @@ #include #include "util/export.h" #include "util/message.h" -#ifdef USE_SSE +#ifdef USE_SSE4_1 #include "dsp/inthalfbandfiltereo2.h" #else #include "dsp/inthalfbandfilterdb.h" @@ -73,7 +73,7 @@ protected: ModeUpperHalf }; -#ifdef USE_SSE +#ifdef USE_SSE4_1 typedef bool (IntHalfbandFilterEO2::*WorkFunction)(Sample* sIn, Sample *sOut); IntHalfbandFilterEO2* m_filter; #else diff --git a/sdrbase/gui/glspectrum.cpp b/sdrbase/gui/glspectrum.cpp index 27381cbf7..aaee2094a 100644 --- a/sdrbase/gui/glspectrum.cpp +++ b/sdrbase/gui/glspectrum.cpp @@ -15,8 +15,8 @@ // along with this program. If not, see . // /////////////////////////////////////////////////////////////////////////////////// -#ifdef USE_SSE -#include +#ifdef USE_SSE2 +#include #endif #include @@ -381,83 +381,83 @@ void GLSpectrum::updateHistogram(const std::vector& spectrum) m_currentSpectrum = &spectrum; // Store spectrum for current spectrum line display -#ifndef USE_SSE - for(int i = 0; i < m_fftSize; i++) { - int v = (int)((spectrum[i] - m_referenceLevel) * 100.0 / m_powerRange + 100.0); +#ifdef USE_SSE2 + if(m_decay >= 0) { // normal + const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; + const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; + const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; - if ((v >= 0) && (v <= 99)) { - b = m_histogram + i * 100 + v; - if(*b < 220) - *b += m_histogramStroke; // was 4 - else if(*b < 239) - *b += 1; - } - } + for(int i = 0; i < m_fftSize; i += 4) { + __m128 abc = _mm_loadu_ps (&spectrum[i]); + abc = _mm_sub_ps(abc, refl); + abc = _mm_mul_ps(abc, mul); + abc = _mm_div_ps(abc, power); + abc = _mm_add_ps(abc, mul); + __m128i result = _mm_cvtps_epi32(abc); + + for(int j = 0; j < 4; j++) { + int v = ((int*)&result)[j]; + if((v >= 0) && (v <= 99)) { + b = m_histogram + (i + j) * 100 + v; + if(*b < 220) + *b += m_histogramStroke; // was 4 + else if(*b < 239) + *b += 1; + } + } + } + } else { // draw double pixels + int add = -m_decay * 4; + const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; + const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; + const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; + + for(int i = 0; i < m_fftSize; i += 4) { + __m128 abc = _mm_loadu_ps (&spectrum[i]); + abc = _mm_sub_ps(abc, refl); + abc = _mm_mul_ps(abc, mul); + abc = _mm_div_ps(abc, power); + abc = _mm_add_ps(abc, mul); + __m128i result = _mm_cvtps_epi32(abc); + + for(int j = 0; j < 4; j++) { + int v = ((int*)&result)[j]; + if((v >= 1) && (v <= 98)) { + b = m_histogram + (i + j) * 100 + v; + if(b[-1] < 220) + b[-1] += add; + else if(b[-1] < 239) + b[-1] += 1; + if(b[0] < 220) + b[0] += add; + else if(b[0] < 239) + b[0] += 1; + if(b[1] < 220) + b[1] += add; + else if(b[1] < 239) + b[1] += 1; + } else if((v >= 0) && (v <= 99)) { + b = m_histogram + (i + j) * 100 + v; + if(*b < 220) + *b += add; + else if(*b < 239) + *b += 1; + } + } + } + } #else - if(m_decay >= 0) { // normal - const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; - const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; - const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; + for(int i = 0; i < m_fftSize; i++) { + int v = (int)((spectrum[i] - m_referenceLevel) * 100.0 / m_powerRange + 100.0); - for(int i = 0; i < m_fftSize; i += 4) { - __m128 abc = _mm_loadu_ps (&spectrum[i]); - abc = _mm_sub_ps(abc, refl); - abc = _mm_mul_ps(abc, mul); - abc = _mm_div_ps(abc, power); - abc = _mm_add_ps(abc, mul); - __m128i result = _mm_cvtps_epi32(abc); - - for(int j = 0; j < 4; j++) { - int v = ((int*)&result)[j]; - if((v >= 0) && (v <= 99)) { - b = m_histogram + (i + j) * 100 + v; - if(*b < 220) - *b += m_histogramStroke; // was 4 - else if(*b < 239) - *b += 1; - } - } - } - } else { // draw double pixels - int add = -m_decay * 4; - const __m128 refl = {m_referenceLevel, m_referenceLevel, m_referenceLevel, m_referenceLevel}; - const __m128 power = {m_powerRange, m_powerRange, m_powerRange, m_powerRange}; - const __m128 mul = {100.0f, 100.0f, 100.0f, 100.0f}; - - for(int i = 0; i < m_fftSize; i += 4) { - __m128 abc = _mm_loadu_ps (&spectrum[i]); - abc = _mm_sub_ps(abc, refl); - abc = _mm_mul_ps(abc, mul); - abc = _mm_div_ps(abc, power); - abc = _mm_add_ps(abc, mul); - __m128i result = _mm_cvtps_epi32(abc); - - for(int j = 0; j < 4; j++) { - int v = ((int*)&result)[j]; - if((v >= 1) && (v <= 98)) { - b = m_histogram + (i + j) * 100 + v; - if(b[-1] < 220) - b[-1] += add; - else if(b[-1] < 239) - b[-1] += 1; - if(b[0] < 220) - b[0] += add; - else if(b[0] < 239) - b[0] += 1; - if(b[1] < 220) - b[1] += add; - else if(b[1] < 239) - b[1] += 1; - } else if((v >= 0) && (v <= 99)) { - b = m_histogram + (i + j) * 100 + v; - if(*b < 220) - *b += add; - else if(*b < 239) - *b += 1; - } - } - } - } + if ((v >= 0) && (v <= 99)) { + b = m_histogram + i * 100 + v; + if(*b < 220) + *b += m_histogramStroke; // was 4 + else if(*b < 239) + *b += 1; + } + } #endif } diff --git a/sdrbase/sdrbase.pro b/sdrbase/sdrbase.pro index 6efd0895d..456f3bfbf 100644 --- a/sdrbase/sdrbase.pro +++ b/sdrbase/sdrbase.pro @@ -14,8 +14,8 @@ INCLUDEPATH += $$PWD DEFINES += USE_KISSFFT=1 DEFINES += __WINDOWS__=1 DEFINES += DSD_USE_SERIALDV=1 -DEFINES += USE_SSE=1 -QMAKE_CXXFLAGS += -msse4.1 +DEFINES += USE_SSE2=1 +QMAKE_CXXFLAGS += -msse2 CONFIG(Release):build_subdir = release CONFIG(Debug):build_subdir = debug