diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3defc825..d28581f15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,7 +210,9 @@ set(sdrbase_HEADERS
     sdrbase/dsp/inthalfbandfilter.h
     sdrbase/dsp/inthalfbandfilterdb.h
     sdrbase/dsp/inthalfbandfiltereo1.h
+    sdrbase/dsp/inthalfbandfiltereo1i.h
     sdrbase/dsp/inthalfbandfiltereo2.h
+    sdrbase/dsp/inthalfbandfiltereo2i.h
     sdrbase/dsp/kissfft.h
     sdrbase/dsp/kissengine.h
     sdrbase/dsp/lowpass.h
diff --git a/sdrbase/dsp/hbfiltertraits.h b/sdrbase/dsp/hbfiltertraits.h
index ccac695cc..14bbb266c 100644
--- a/sdrbase/dsp/hbfiltertraits.h
+++ b/sdrbase/dsp/hbfiltertraits.h
@@ -37,7 +37,7 @@ struct HBFIRFilterTraits<32>
     static const int32_t hbOrder = 32;
     static const int32_t hbShift = 14;
     static const int16_t hbMod[32+6];
-    static const int32_t hbCoeffs[8] __attribute__ ((aligned (16)));
+    static const int32_t hbCoeffs[8] __attribute__ ((aligned (32)));
 };
 
 template<>
@@ -55,7 +55,7 @@ struct HBFIRFilterTraits<64>
     static const int32_t hbOrder = 64;
     static const int32_t hbShift = 14;
     static const int16_t hbMod[64+6];
-    static const int32_t hbCoeffs[16] __attribute__ ((aligned (16)));
+    static const int32_t hbCoeffs[16] __attribute__ ((aligned (32)));
 };
 
 template<>
@@ -73,7 +73,7 @@ struct HBFIRFilterTraits<96>
     static const int32_t hbOrder = 96;
     static const int32_t hbShift = 16;
     static const int16_t hbMod[96+6];
-    static const int32_t hbCoeffs[24] __attribute__ ((aligned (16)));
+    static const int32_t hbCoeffs[24] __attribute__ ((aligned (32)));
 };
 
 template<>
diff --git a/sdrbase/dsp/inthalfbandfiltereo1.h b/sdrbase/dsp/inthalfbandfiltereo1.h
index db196e060..dbd116bf7 100644
--- a/sdrbase/dsp/inthalfbandfiltereo1.h
+++ b/sdrbase/dsp/inthalfbandfiltereo1.h
@@ -29,6 +29,7 @@
 #include <stdint.h>
 #include "dsp/dsptypes.h"
 #include "dsp/hbfiltertraits.h"
+#include "dsp/inthalfbandfiltereo1i.h"
 #include "util/export.h"
 
 template<uint32_t HBFilterOrder>
@@ -465,50 +466,13 @@ protected:
         qint32 qAcc = 0;
 
 #ifdef USE_SSE4_1
-//#warning "IntHalfbandFiler SIMD"
-        const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs;
-        __m128i sumI = _mm_setzero_si128();
-        __m128i sumQ = _mm_setzero_si128();
-        __m128i sa, sb;
-        a -= 3;
-
-        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
-        {
-            if ((m_ptr % 2) == 0)
-            {
-                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(m_even[0][a])), _MM_SHUFFLE(0,1,2,3));
-                sb = _mm_loadu_si128((__m128i*) &(m_even[0][b]));
-                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-
-                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(m_even[1][a])), _MM_SHUFFLE(0,1,2,3));
-                sb = _mm_loadu_si128((__m128i*) &(m_even[1][b]));
-                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-            }
-            else
-            {
-                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(m_odd[0][a])), _MM_SHUFFLE(0,1,2,3));
-                sb = _mm_loadu_si128((__m128i*) &(m_odd[0][b]));
-                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-
-                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(m_odd[1][a])), _MM_SHUFFLE(0,1,2,3));
-                sb = _mm_loadu_si128((__m128i*) &(m_odd[1][b]));
-                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-            }
-
-            a -= 4;
-            b += 4;
-            ++h;
-        }
-
-        // horizontal add of four 32 bit partial sums
-
-        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
-        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
-        iAcc = _mm_cvtsi128_si32(sumI);
-
-        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
-        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
-        qAcc = _mm_cvtsi128_si32(sumQ);
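+        // The SSE4.1 kernel has moved to IntHalfbandFilterEO1Intrisics::work()
+        // (dsp/inthalfbandfiltereo1i.h) so the architecture-specific intrinsics
+        // live in one place instead of being repeated inline here.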
+        IntHalfbandFilterEO1Intrisics<HBFilterOrder>::work(
+            m_ptr,
+            m_even,
+            m_odd,
+            iAcc,
+            qAcc
+        );
 #else
         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 4; i++)
         {
diff --git a/sdrbase/dsp/inthalfbandfiltereo1i.h b/sdrbase/dsp/inthalfbandfiltereo1i.h
new file mode 100644
index 000000000..c03ec032c
--- /dev/null
+++ b/sdrbase/dsp/inthalfbandfiltereo1i.h
@@ -0,0 +1,99 @@
+///////////////////////////////////////////////////////////////////////////////////
+// Copyright (C) 2016 F4EXB                                                      //
+// written by Edouard Griffiths                                                  //
+//                                                                               //
+// Integer half-band FIR based interpolator and decimator                        //
+// This is the even/odd double buffer variant. Really useful only when SIMD is   //
+// used                                                                          //
+//                                                                               //
+// This program is free software; you can redistribute it and/or modify          //
+// it under the terms of the GNU General Public License as published by          //
+// the Free Software Foundation as version 3 of the License, or                  //
+//                                                                               //
+// This program is distributed in the hope that it will be useful,               //
+// but WITHOUT ANY WARRANTY; without even the implied warranty of                //
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                  //
+// GNU General Public License V3 for more details.                               //
+//                                                                               //
+// You should have received a copy of the GNU General Public License             //
+// along with this program. If not, see <http://www.gnu.org/licenses/>.          //
+///////////////////////////////////////////////////////////////////////////////////
+
+#ifndef SDRBASE_DSP_INTHALFBANDFILTEREO1I_H_
+#define SDRBASE_DSP_INTHALFBANDFILTEREO1I_H_
+
+#include <stdint.h>
+
+#if defined(USE_AVX2)
+#include <immintrin.h>
+#elif defined(USE_SSE4_1)
+#include <smmintrin.h>
+#elif defined(USE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "hbfiltertraits.h"
+
+template<uint32_t HBFilterOrder>
+class IntHalfbandFilterEO1Intrisics
+{
+public:
+    static void work(
+            int ptr,
+            int32_t even[2][HBFilterOrder],
+            int32_t odd[2][HBFilterOrder],
+            int32_t& iAcc, int32_t& qAcc)
+    {
+#if defined(USE_SSE4_1)
+        int a = ptr/2 + HBFIRFilterTraits<HBFilterOrder>::hbOrder/2; // tip pointer
+        int b = ptr/2 + 1; // tail pointer
+        const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs;
+        __m128i sumI = _mm_setzero_si128();
+        __m128i sumQ = _mm_setzero_si128();
+        __m128i sa, sb;
+        a -= 3;
+
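+        // Symmetric FIR: each tap k contributes h[k] * (x[tip - k] + x[tail + k]).
+        // The tip-side samples run backwards through memory, so each 4-sample load
+        // is reversed with _MM_SHUFFLE(0,1,2,3) to line it up with the ascending
+        // coefficient vector *h before the multiply-accumulate.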
+        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
+        {
+            if ((ptr % 2) == 0)
+            {
+                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(even[0][a])), _MM_SHUFFLE(0,1,2,3));
+                sb = _mm_loadu_si128((__m128i*) &(even[0][b]));
+                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+
+                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(even[1][a])), _MM_SHUFFLE(0,1,2,3));
+                sb = _mm_loadu_si128((__m128i*) &(even[1][b]));
+                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            }
+            else
+            {
+                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(odd[0][a])), _MM_SHUFFLE(0,1,2,3));
+                sb = _mm_loadu_si128((__m128i*) &(odd[0][b]));
+                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+
+                sa = _mm_shuffle_epi32(_mm_loadu_si128((__m128i*) &(odd[1][a])), _MM_SHUFFLE(0,1,2,3));
+                sb = _mm_loadu_si128((__m128i*) &(odd[1][b]));
+                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            }
+
+            a -= 4;
+            b += 4;
+            ++h;
+        }
+
+        // horizontal add of four 32 bit partial sums
+
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
+        iAcc = _mm_cvtsi128_si32(sumI);
+
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
+        qAcc = _mm_cvtsi128_si32(sumQ);
+#endif
+    }
+};
+
+
+
+#endif /* SDRBASE_DSP_INTHALFBANDFILTEREO1I_H_ */
diff --git a/sdrbase/dsp/inthalfbandfiltereo2.h b/sdrbase/dsp/inthalfbandfiltereo2.h
index 79761d9a1..16ebab6fb 100644
--- a/sdrbase/dsp/inthalfbandfiltereo2.h
+++ b/sdrbase/dsp/inthalfbandfiltereo2.h
@@ -33,6 +33,7 @@
 #include <stdint.h>
 #include "dsp/dsptypes.h"
 #include "dsp/hbfiltertraits.h"
+#include "dsp/inthalfbandfiltereo2i.h"
 #include "util/export.h"
 
 template<uint32_t HBFilterOrder>
@@ -488,90 +489,17 @@ protected:
         qint32 iAcc = 0;
         qint32 qAcc = 0;
 
-#if defined(USE_SSE4_1)
-        const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs;
-        __m128i sumI = _mm_setzero_si128();
-        __m128i sumQ = _mm_setzero_si128();
-        __m128i sa, sb;
-
-        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
-        {
-            if ((m_ptrB % 2) == 0)
-            {
-                sa = _mm_loadu_si128((__m128i*) &(m_evenA[0][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_evenB[0][b]));
-                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-
-                sa = _mm_loadu_si128((__m128i*) &(m_evenA[1][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_evenB[1][b]));
-                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-            }
-            else
-            {
-                sa = _mm_loadu_si128((__m128i*) &(m_oddA[0][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_oddB[0][b]));
-                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-
-                sa = _mm_loadu_si128((__m128i*) &(m_oddA[1][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_oddB[1][b]));
-                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-            }
-
-            a += 4;
-            b += 4;
-            ++h;
-        }
-
-        // horizontal add of four 32 bit partial sums
-
-        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
-        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
-        iAcc = _mm_cvtsi128_si32(sumI);
-
-        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
-        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
-        qAcc = _mm_cvtsi128_si32(sumQ);
-#elif defined(USE_NEON)
-        int32x4_t sumI = vdupq_n_s32(0);
-        int32x4_t sumQ = vdupq_n_s32(0);
-        int32x4_t sa, sb, sh;
-
-        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
-        {
-            sh = vld1_s32(&h[4*i]);
-
-            if ((m_ptrB % 2) == 0)
-            {
-                sa = vld1q_s32(&(m_evenA[0][a]));
-                sb = vld1q_s32(&(m_evenB[0][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-
-                sa = vld1q_s32(&(m_evenA[1][a]));
-                sb = vld1q_s32(&(m_evenB[1][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-            }
-            else
-            {
-                sa = vld1q_s32(&(m_oddA[0][a]));
-                sb = vld1q_s32(&(m_oddB[0][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-
-                sa = vld1q_s32(&(m_oddA[1][a]));
-                sb = vld1q_s32(&(m_oddB[1][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-            }
-
-            a += 4;
-            b += 4;
-        }
-
-        int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI));
-        int32x2_t sumI2 = vpadd_s32(sumI1, sumI1);
-        iAcc = vget_lane_s32(sumI2, 0);
-
-        int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ));
-        int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1);
-        qAcc = vget_lane_s32(sumQ2, 0);
+#if defined(USE_AVX2) || defined(USE_SSE4_1) || defined(USE_NEON)
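+        // The AVX2, SSE4.1 and NEON kernels are selected inside the helper
+        // (dsp/inthalfbandfiltereo2i.h); only the scalar fallback remains below.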
+        IntHalfbandFilterEO2Intrisics<HBFilterOrder>::work(
+            m_ptrA,
+            m_ptrB,
+            m_evenA,
+            m_evenB,
+            m_oddA,
+            m_oddB,
+            iAcc,
+            qAcc
+        );
 #else
         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 4; i++)
         {
@@ -614,91 +542,17 @@ protected:
         qint32 iAcc = 0;
         qint32 qAcc = 0;
 
-#if defined(USE_SSE4_1)
-//#warning "IntHalfbandFiler SIMD"
-        const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs;
-        __m128i sumI = _mm_setzero_si128();
-        __m128i sumQ = _mm_setzero_si128();
-        __m128i sa, sb;
-
-        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
-        {
-            if ((m_ptrB % 2) == 0)
-            {
-                sa = _mm_loadu_si128((__m128i*) &(m_evenA[0][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_evenB[0][b]));
-                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-
-                sa = _mm_loadu_si128((__m128i*) &(m_evenA[1][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_evenB[1][b]));
-                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-            }
-            else
-            {
-                sa = _mm_loadu_si128((__m128i*) &(m_oddA[0][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_oddB[0][b]));
-                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-
-                sa = _mm_loadu_si128((__m128i*) &(m_oddA[1][a]));
-                sb = _mm_loadu_si128((__m128i*) &(m_oddB[1][b]));
-                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
-            }
-
-            a += 4;
-            b += 4;
-            ++h;
-        }
-
-        // horizontal add of four 32 bit partial sums
-
-        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
-        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
-        iAcc = _mm_cvtsi128_si32(sumI);
-
-        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
-        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
-        qAcc = _mm_cvtsi128_si32(sumQ);
-#elif defined(USE_NEON)
-        int32x4_t sumI = vdupq_n_s32(0);
-        int32x4_t sumQ = vdupq_n_s32(0);
-        int32x4_t sa, sb, sh;
-
-        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
-        {
-            sh = vld1_s32(&h[4*i]);
-
-            if ((m_ptrB % 2) == 0)
-            {
-                sa = vld1q_s32(&(m_evenA[0][a]));
-                sb = vld1q_s32(&(m_evenB[0][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-
-                sa = vld1q_s32(&(m_evenA[1][a]));
-                sb = vld1q_s32(&(m_evenB[1][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-            }
-            else
-            {
-                sa = vld1q_s32(&(m_oddA[0][a]));
-                sb = vld1q_s32(&(m_oddB[0][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-
-                sa = vld1q_s32(&(m_oddA[1][a]));
-                sb = vld1q_s32(&(m_oddB[1][b]));
-                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
-            }
-
-            a += 4;
-            b += 4;
-        }
-
-        int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI));
-        int32x2_t sumI2 = vpadd_s32(sumI1, sumI1);
-        iAcc = vget_lane_s32(sumI2, 0);
-
-        int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ));
-        int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1);
-        qAcc = vget_lane_s32(sumQ2, 0);
+#if defined(USE_AVX2) || defined(USE_SSE4_1) || defined(USE_NEON)
+        IntHalfbandFilterEO2Intrisics<HBFilterOrder>::work(
+            m_ptrA,
+            m_ptrB,
+            m_evenA,
+            m_evenB,
+            m_oddA,
+            m_oddB,
+            iAcc,
+            qAcc
+        );
 #else
         for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 4; i++)
         {
diff --git a/sdrbase/dsp/inthalfbandfiltereo2i.h b/sdrbase/dsp/inthalfbandfiltereo2i.h
new file mode 100644
index 000000000..e3b1df4d4
--- /dev/null
+++ b/sdrbase/dsp/inthalfbandfiltereo2i.h
@@ -0,0 +1,547 @@
+///////////////////////////////////////////////////////////////////////////////////
+// Copyright (C) 2016 F4EXB                                                      //
+// written by Edouard Griffiths                                                  //
+//                                                                               //
+// Integer half-band FIR based interpolator and decimator                        //
+// This is the even/odd double buffer variant. Really useful only when SIMD is   //
+// used                                                                          //
+//                                                                               //
+// This program is free software; you can redistribute it and/or modify          //
+// it under the terms of the GNU General Public License as published by          //
+// the Free Software Foundation as version 3 of the License, or                  //
+//                                                                               //
+// This program is distributed in the hope that it will be useful,               //
+// but WITHOUT ANY WARRANTY; without even the implied warranty of                //
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                  //
+// GNU General Public License V3 for more details.                               //
+//                                                                               //
+// You should have received a copy of the GNU General Public License             //
+// along with this program. If not, see <http://www.gnu.org/licenses/>.          //
+///////////////////////////////////////////////////////////////////////////////////
+
+#ifndef INCLUDE_INTHALFBANDFILTEREO2I_H_
+#define INCLUDE_INTHALFBANDFILTEREO2I_H_
+
+#include <stdint.h>
+
+#if defined(USE_AVX2)
+#include <immintrin.h>
+#elif defined(USE_SSE4_1)
+#include <smmintrin.h>
+#elif defined(USE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "hbfiltertraits.h"
+
+template<uint32_t HBFilterOrder>
+class IntHalfbandFilterEO2Intrisics
+{
+public:
+    static void work(
+            int ptrA,
+            int ptrB,
+            int32_t evenA[2][HBFilterOrder],
+            int32_t evenB[2][HBFilterOrder],
+            int32_t oddA[2][HBFilterOrder],
+            int32_t oddB[2][HBFilterOrder],
+            int32_t& iAcc, int32_t& qAcc)
+    {
+#if defined(USE_SSE4_1)
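+        // The EO2 variant keeps two copies of the delay line (the A and B buffers),
+        // so tip and tail samples can both be read with ascending unaligned loads;
+        // unlike the EO1 variant, no reversal shuffle is needed.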
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
+        const __m128i* h = (const __m128i*) HBFIRFilterTraits<HBFilterOrder>::hbCoeffs;
+        __m128i sumI = _mm_setzero_si128();
+        __m128i sumQ = _mm_setzero_si128();
+        __m128i sa, sb;
+
+        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
+        {
+            if ((ptrB % 2) == 0)
+            {
+                sa = _mm_loadu_si128((__m128i*) &(evenA[0][a]));
+                sb = _mm_loadu_si128((__m128i*) &(evenB[0][b]));
+                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+
+                sa = _mm_loadu_si128((__m128i*) &(evenA[1][a]));
+                sb = _mm_loadu_si128((__m128i*) &(evenB[1][b]));
+                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            }
+            else
+            {
+                sa = _mm_loadu_si128((__m128i*) &(oddA[0][a]));
+                sb = _mm_loadu_si128((__m128i*) &(oddB[0][b]));
+                sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+
+                sa = _mm_loadu_si128((__m128i*) &(oddA[1][a]));
+                sb = _mm_loadu_si128((__m128i*) &(oddB[1][b]));
+                sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            }
+
+            a += 4;
+            b += 4;
+            ++h;
+        }
+
+        // horizontal add of four 32 bit partial sums
+
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
+        iAcc = _mm_cvtsi128_si32(sumI);
+
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
+        qAcc = _mm_cvtsi128_si32(sumQ);
+
+#elif defined(USE_NEON)
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
+        int32x4_t sumI = vdupq_n_s32(0);
+        int32x4_t sumQ = vdupq_n_s32(0);
+        int32x4_t sa, sb, sh;
+
+        for (int i = 0; i < HBFIRFilterTraits<HBFilterOrder>::hbOrder / 16; i++)
+        {
+            sh = vld1q_s32(&HBFIRFilterTraits<HBFilterOrder>::hbCoeffs[4*i]);
+
+            if ((ptrB % 2) == 0)
+            {
+                sa = vld1q_s32(&(evenA[0][a]));
+                sb = vld1q_s32(&(evenB[0][b]));
+                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
+
+                sa = vld1q_s32(&(evenA[1][a]));
+                sb = vld1q_s32(&(evenB[1][b]));
+                sumQ = vmlaq_s32(sumQ, vaddq_s32(sa, sb), sh);
+            }
+            else
+            {
+                sa = vld1q_s32(&(oddA[0][a]));
+                sb = vld1q_s32(&(oddB[0][b]));
+                sumI = vmlaq_s32(sumI, vaddq_s32(sa, sb), sh);
+
+                sa = vld1q_s32(&(oddA[1][a]));
+                sb = vld1q_s32(&(oddB[1][b]));
+                sumQ = vmlaq_s32(sumQ, vaddq_s32(sa, sb), sh);
+            }
+
+            a += 4;
+            b += 4;
+        }
+
+        int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI));
+        int32x2_t sumI2 = vpadd_s32(sumI1, sumI1);
+        iAcc = vget_lane_s32(sumI2, 0);
+
+        int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ));
+        int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1);
+        qAcc = vget_lane_s32(sumQ2, 0);
+#endif
+    }
+};
+
+template<>
+class IntHalfbandFilterEO2Intrisics<48>
+{
+public:
+    static void work(
+            int ptrA,
+            int ptrB,
+            int32_t evenA[2][48],
+            int32_t evenB[2][48],
+            int32_t oddA[2][48],
+            int32_t oddB[2][48],
+            int32_t& iAcc, int32_t& qAcc)
+    {
+#if defined(USE_SSE4_1)
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
+        const __m128i* h = (const __m128i*) HBFIRFilterTraits<48>::hbCoeffs;
+        __m128i sumI = _mm_setzero_si128();
+        __m128i sumQ = _mm_setzero_si128();
+        __m128i sa, sb;
+
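+        // HBFIRFilterTraits<48>::hbCoeffs holds 12 coefficients, so the loop is
+        // fully unrolled into three 128-bit blocks per branch (offsets 0, +4, +8).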
+        if ((ptrB % 2) == 0)
+        {
+            // 0
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            h++;
+            // 1
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+4]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+4]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            h++;
+            // 2
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+8]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+8]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+        }
+        else
+        {
+            // 0
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            h++;
+            // 1
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+4]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+4]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            h++;
+            // 2
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+8]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+8]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), *h));
+        }
+
+        // horizontal add of four 32 bit partial sums
+
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
+        iAcc = _mm_cvtsi128_si32(sumI);
+
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
+        qAcc = _mm_cvtsi128_si32(sumQ);
+
+#elif defined(USE_NEON)
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
+        int32x4_t sumI = vdupq_n_s32(0);
+        int32x4_t sumQ = vdupq_n_s32(0);
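+        // vld3q_s32 loads 12 consecutive int32 values into three vectors,
+        // de-interleaved with stride 3. Samples and coefficients are
+        // de-interleaved identically, so each coefficient still multiplies its
+        // own sample pair and the dot product is unchanged.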
+        int32x4x3_t sh = vld3q_s32((int32_t const *) &HBFIRFilterTraits<48>::hbCoeffs[0]);
+        int32x4x3_t sa, sb;
+
+        if ((ptrB % 2) == 0)
+        {
+            sa = vld3q_s32((int32_t const *) &(evenA[0][a]));
+            sb = vld3q_s32((int32_t const *) &(evenB[0][b]));
+            sumI = vmlaq_s32(sumI, vaddq_s32(sa.val[0], sb.val[0]), sh.val[0]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(sa.val[1], sb.val[1]), sh.val[1]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(sa.val[2], sb.val[2]), sh.val[2]);
+            sa = vld3q_s32((int32_t const *) &(evenA[1][a]));
+            sb = vld3q_s32((int32_t const *) &(evenB[1][b]));
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(sa.val[0], sb.val[0]), sh.val[0]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(sa.val[1], sb.val[1]), sh.val[1]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(sa.val[2], sb.val[2]), sh.val[2]);
+        }
+        else
+        {
+            sa = vld3q_s32((int32_t const *) &(oddA[0][a]));
+            sb = vld3q_s32((int32_t const *) &(oddB[0][b]));
+            sumI = vmlaq_s32(sumI, vaddq_s32(sa.val[0], sb.val[0]), sh.val[0]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(sa.val[1], sb.val[1]), sh.val[1]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(sa.val[2], sb.val[2]), sh.val[2]);
+            sa = vld3q_s32((int32_t const *) &(oddA[1][a]));
+            sb = vld3q_s32((int32_t const *) &(oddB[1][b]));
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(sa.val[0], sb.val[0]), sh.val[0]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(sa.val[1], sb.val[1]), sh.val[1]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(sa.val[2], sb.val[2]), sh.val[2]);
+        }
+
+        int32x2_t sumI1 = vpadd_s32(vget_high_s32(sumI), vget_low_s32(sumI));
+        int32x2_t sumI2 = vpadd_s32(sumI1, sumI1);
+        iAcc = vget_lane_s32(sumI2, 0);
+
+        int32x2_t sumQ1 = vpadd_s32(vget_high_s32(sumQ), vget_low_s32(sumQ));
+        int32x2_t sumQ2 = vpadd_s32(sumQ1, sumQ1);
+        qAcc = vget_lane_s32(sumQ2, 0);
+#endif
+    }
+};
+
+
+template<>
+class IntHalfbandFilterEO2Intrisics<96>
+{
+public:
+    static void work(
+            int ptrA,
+            int ptrB,
+            int32_t evenA[2][96],
+            int32_t evenB[2][96],
+            int32_t oddA[2][96],
+            int32_t oddB[2][96],
+            int32_t& iAcc, int32_t& qAcc)
+    {
+#if defined(USE_AVX2)
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
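+        // hbCoeffs is 32-byte aligned (see the hbfiltertraits.h change), so h can
+        // be dereferenced as __m256i directly; the sample buffers carry no such
+        // guarantee, hence the unaligned loadu on sa/sb.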
+        const __m256i* h = (const __m256i*) HBFIRFilterTraits<96>::hbCoeffs;
+        __m256i sumI = _mm256_setzero_si256();
+        __m256i sumQ = _mm256_setzero_si256();
+        __m256i sa, sb;
+
+        if ((ptrB % 2) == 0)
+        {
+            // I
+            sa = _mm256_loadu_si256((__m256i*) &(evenA[0][a]));
+            sb = _mm256_loadu_si256((__m256i*) &(evenB[0][b]));
+            sumI = _mm256_add_epi32(sumI, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[0]));
+            sa = _mm256_loadu_si256((__m256i*) &(evenA[0][a+8]));
+            sb = _mm256_loadu_si256((__m256i*) &(evenB[0][b+8]));
+            sumI = _mm256_add_epi32(sumI, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[1]));
+            sa = _mm256_loadu_si256((__m256i*) &(evenA[0][a+16]));
+            sb = _mm256_loadu_si256((__m256i*) &(evenB[0][b+16]));
+            sumI = _mm256_add_epi32(sumI, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[2]));
+            // Q
+            sa = _mm256_loadu_si256((__m256i*) &(evenA[1][a]));
+            sb = _mm256_loadu_si256((__m256i*) &(evenB[1][b]));
+            sumQ = _mm256_add_epi32(sumQ, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[0]));
+            sa = _mm256_loadu_si256((__m256i*) &(evenA[1][a+8]));
+            sb = _mm256_loadu_si256((__m256i*) &(evenB[1][b+8]));
+            sumQ = _mm256_add_epi32(sumQ, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[1]));
+            sa = _mm256_loadu_si256((__m256i*) &(evenA[1][a+16]));
+            sb = _mm256_loadu_si256((__m256i*) &(evenB[1][b+16]));
+            sumQ = _mm256_add_epi32(sumQ, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[2]));
+        }
+        else
+        {
+            // I
+            sa = _mm256_loadu_si256((__m256i*) &(oddA[0][a]));
+            sb = _mm256_loadu_si256((__m256i*) &(oddB[0][b]));
+            sumI = _mm256_add_epi32(sumI, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[0]));
+            sa = _mm256_loadu_si256((__m256i*) &(oddA[0][a+8]));
+            sb = _mm256_loadu_si256((__m256i*) &(oddB[0][b+8]));
+            sumI = _mm256_add_epi32(sumI, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[1]));
+            sa = _mm256_loadu_si256((__m256i*) &(oddA[0][a+16]));
+            sb = _mm256_loadu_si256((__m256i*) &(oddB[0][b+16]));
+            sumI = _mm256_add_epi32(sumI, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[2]));
+            // Q
+            sa = _mm256_loadu_si256((__m256i*) &(oddA[1][a]));
+            sb = _mm256_loadu_si256((__m256i*) &(oddB[1][b]));
+            sumQ = _mm256_add_epi32(sumQ, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[0]));
+            sa = _mm256_loadu_si256((__m256i*) &(oddA[1][a+8]));
+            sb = _mm256_loadu_si256((__m256i*) &(oddB[1][b+8]));
+            sumQ = _mm256_add_epi32(sumQ, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[1]));
+            sa = _mm256_loadu_si256((__m256i*) &(oddA[1][a+16]));
+            sb = _mm256_loadu_si256((__m256i*) &(oddB[1][b+16]));
+            sumQ = _mm256_add_epi32(sumQ, _mm256_mullo_epi32(_mm256_add_epi32(sa, sb), h[2]));
+        }
+
+        // horizontal add
+
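+        // The 256-bit sums are reduced 128 bits at a time: the low lane first,
+        // then the high lane via _mm256_extracti128_si256.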
+        __m128i vloI = _mm256_castsi256_si128(sumI);
+        vloI = _mm_add_epi32(vloI, _mm_srli_si128(vloI, 8));
+        vloI = _mm_add_epi32(vloI, _mm_srli_si128(vloI, 4));
+        iAcc = _mm_cvtsi128_si32(vloI);
+        __m128i vhiI = _mm256_extracti128_si256(sumI, 1);
+        vhiI = _mm_add_epi32(vhiI, _mm_srli_si128(vhiI, 8));
+        vhiI = _mm_add_epi32(vhiI, _mm_srli_si128(vhiI, 4));
+        iAcc += _mm_cvtsi128_si32(vhiI);
+
+        __m128i vloQ = _mm256_castsi256_si128(sumQ);
+        vloQ = _mm_add_epi32(vloQ, _mm_srli_si128(vloQ, 8));
+        vloQ = _mm_add_epi32(vloQ, _mm_srli_si128(vloQ, 4));
+        qAcc = _mm_cvtsi128_si32(vloQ);
+        __m128i vhiQ = _mm256_extracti128_si256(sumQ, 1);
+        vhiQ = _mm_add_epi32(vhiQ, _mm_srli_si128(vhiQ, 8));
+        vhiQ = _mm_add_epi32(vhiQ, _mm_srli_si128(vhiQ, 4));
+        qAcc += _mm_cvtsi128_si32(vhiQ);
+
+#elif defined(USE_SSE4_1)
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
+        const __m128i* h = (const __m128i*) HBFIRFilterTraits<96>::hbCoeffs;
+        __m128i sumI = _mm_setzero_si128();
+        __m128i sumQ = _mm_setzero_si128();
+        __m128i sa, sb;
+
+        if ((ptrB % 2) == 0)
+        {
+            // I
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[0]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+4]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[1]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+8]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[2]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+12]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+12]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[3]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+16]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+16]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[4]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[0][a+20]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[0][b+20]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[5]));
+            // Q
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[0]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+4]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[1]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+8]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[2]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+12]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+12]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[3]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+16]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+16]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[4]));
+            sa = _mm_loadu_si128((__m128i*) &(evenA[1][a+20]));
+            sb = _mm_loadu_si128((__m128i*) &(evenB[1][b+20]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[5]));
+        }
+        else
+        {
+            // I
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[0]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+4]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[1]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+8]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[2]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+12]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+12]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[3]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+16]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+16]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[4]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[0][a+20]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[0][b+20]));
+            sumI = _mm_add_epi32(sumI, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[5]));
+            // Q
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[0]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+4]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+4]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[1]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+8]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+8]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[2]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+12]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+12]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[3]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+16]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+16]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[4]));
+            sa = _mm_loadu_si128((__m128i*) &(oddA[1][a+20]));
+            sb = _mm_loadu_si128((__m128i*) &(oddB[1][b+20]));
+            sumQ = _mm_add_epi32(sumQ, _mm_mullo_epi32(_mm_add_epi32(sa, sb), h[5]));
+        }
+
+        // horizontal add of four 32 bit partial sums
+
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 8));
+        sumI = _mm_add_epi32(sumI, _mm_srli_si128(sumI, 4));
+        iAcc = _mm_cvtsi128_si32(sumI);
+
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 8));
+        sumQ = _mm_add_epi32(sumQ, _mm_srli_si128(sumQ, 4));
+        qAcc = _mm_cvtsi128_si32(sumQ);
+
+#elif defined(USE_NEON)
+        int a = ptrA/2; // tip pointer
+        int b = ptrB/2 + 1; // tail pointer
+
+        int32x4_t sumI = vdupq_n_s32(0);
+        int32x4_t sumQ = vdupq_n_s32(0);
+
+        int32x4x4_t s4a, s4b, s4h;
+        int32x4x2_t s2a, s2b, s2h;
+
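+        // The 24 taps are consumed as one de-interleaved group of 16 (vld4q_s32)
+        // plus one group of 8 (vld2q_s32); as in the 48-tap case, identical
+        // de-interleaving of samples and coefficients keeps the dot product intact.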
+        if ((ptrB % 2) == 0)
+        {
+            s4h = vld4q_s32((int32_t const *) &HBFIRFilterTraits<96>::hbCoeffs[0]);
+
+            s4a = vld4q_s32((int32_t const *) &(evenA[0][a]));
+            s4b = vld4q_s32((int32_t const *) &(evenB[0][b]));
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[0], s4b.val[0]), s4h.val[0]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[1], s4b.val[1]), s4h.val[1]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[2], s4b.val[2]), s4h.val[2]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[3], s4b.val[3]), s4h.val[3]);
+
+            s4a = vld4q_s32((int32_t const *) &(evenA[1][a]));
+            s4b = vld4q_s32((int32_t const *) &(evenB[1][b]));
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[0], s4b.val[0]), s4h.val[0]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[1], s4b.val[1]), s4h.val[1]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[2], s4b.val[2]), s4h.val[2]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[3], s4b.val[3]), s4h.val[3]);
+
+            s2h = vld2q_s32((int32_t const *) &HBFIRFilterTraits<96>::hbCoeffs[16]);
+
+            s2a = vld2q_s32((int32_t const *) &(evenA[0][a+16]));
+            s2b = vld2q_s32((int32_t const *) &(evenB[0][b+16]));
+            sumI = vmlaq_s32(sumI, vaddq_s32(s2a.val[0], s2b.val[0]), s2h.val[0]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s2a.val[1], s2b.val[1]), s2h.val[1]);
+
+            s2a = vld2q_s32((int32_t const *) &(evenA[1][a+16]));
+            s2b = vld2q_s32((int32_t const *) &(evenB[1][b+16]));
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s2a.val[0], s2b.val[0]), s2h.val[0]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s2a.val[1], s2b.val[1]), s2h.val[1]);
+        }
+        else
+        {
+            s4h = vld4q_s32((int32_t const *) &HBFIRFilterTraits<96>::hbCoeffs[0]);
+
+            s4a = vld4q_s32((int32_t const *) &(oddA[0][a]));
+            s4b = vld4q_s32((int32_t const *) &(oddB[0][b]));
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[0], s4b.val[0]), s4h.val[0]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[1], s4b.val[1]), s4h.val[1]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[2], s4b.val[2]), s4h.val[2]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s4a.val[3], s4b.val[3]), s4h.val[3]);
+
+            s4a = vld4q_s32((int32_t const *) &(oddA[1][a]));
+            s4b = vld4q_s32((int32_t const *) &(oddB[1][b]));
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[0], s4b.val[0]), s4h.val[0]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[1], s4b.val[1]), s4h.val[1]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[2], s4b.val[2]), s4h.val[2]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s4a.val[3], s4b.val[3]), s4h.val[3]);
+
+            s2h = vld2q_s32((int32_t const *) &HBFIRFilterTraits<96>::hbCoeffs[16]);
+
+            s2a = vld2q_s32((int32_t const *) &(oddA[0][a+16]));
+            s2b = vld2q_s32((int32_t const *) &(oddB[0][b+16]));
+            sumI = vmlaq_s32(sumI, vaddq_s32(s2a.val[0], s2b.val[0]), s2h.val[0]);
+            sumI = vmlaq_s32(sumI, vaddq_s32(s2a.val[1], s2b.val[1]), s2h.val[1]);
+
+            s2a = vld2q_s32((int32_t const *) &(oddA[1][a+16]));
+            s2b = vld2q_s32((int32_t const *) &(oddB[1][b+16]));
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s2a.val[0], s2b.val[0]), s2h.val[0]);
+            sumQ = vmlaq_s32(sumQ, vaddq_s32(s2a.val[1], s2b.val[1]), s2h.val[1]);
+        }
+
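+        // pairwise horizontal add of the four partial sums down to one lane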
&IntHalfbandFilterEO2::workInterpolateCenter; + m_workFunction = &IntHalfbandFilterEO1::workInterpolateCenter; break; case ModeLowerHalf: - m_workFunction = &IntHalfbandFilterEO2::workInterpolateLowerHalf; + m_workFunction = &IntHalfbandFilterEO1::workInterpolateLowerHalf; break; case ModeUpperHalf: - m_workFunction = &IntHalfbandFilterEO2::workInterpolateUpperHalf; + m_workFunction = &IntHalfbandFilterEO1::workInterpolateUpperHalf; break; } } diff --git a/sdrbase/dsp/upchannelizer.h b/sdrbase/dsp/upchannelizer.h index f5c372896..66855b71a 100644 --- a/sdrbase/dsp/upchannelizer.h +++ b/sdrbase/dsp/upchannelizer.h @@ -24,7 +24,7 @@ #include "util/export.h" #include "util/message.h" #ifdef USE_SSE4_1 -#include "dsp/inthalfbandfiltereo2.h" +#include "dsp/inthalfbandfiltereo1.h" #else #include "dsp/inthalfbandfilterdb.h" #endif @@ -74,8 +74,8 @@ protected: }; #ifdef USE_SSE4_1 - typedef bool (IntHalfbandFilterEO2::*WorkFunction)(Sample* sIn, Sample *sOut); - IntHalfbandFilterEO2* m_filter; + typedef bool (IntHalfbandFilterEO1::*WorkFunction)(Sample* sIn, Sample *sOut); + IntHalfbandFilterEO1* m_filter; #else typedef bool (IntHalfbandFilterDB::*WorkFunction)(Sample* sIn, Sample *sOut); IntHalfbandFilterDB* m_filter;