From 0a6dc5db37e79fc255803864c8c238715c1aa274 Mon Sep 17 00:00:00 2001 From: f4exb Date: Wed, 9 Nov 2016 03:27:30 +0100 Subject: [PATCH] IntHalfband filters: tuned optimizations and chose the best for x86_64 --- sdrbase/dsp/decimators.h | 14 +- sdrbase/dsp/inthalfbandfilterdb.h | 57 +++---- sdrbase/dsp/inthalfbandfiltereo1.h | 2 +- sdrbase/dsp/inthalfbandfilterst.h | 71 +++++---- sdrbase/dsp/inthalfbandfiltersti.h | 236 ++++++++++++++++++----------- 5 files changed, 222 insertions(+), 158 deletions(-) diff --git a/sdrbase/dsp/decimators.h b/sdrbase/dsp/decimators.h index 9d6b177dd..4aa41c3e2 100644 --- a/sdrbase/dsp/decimators.h +++ b/sdrbase/dsp/decimators.h @@ -19,7 +19,7 @@ #include "dsp/dsptypes.h" #ifdef USE_SSE4_1 -#include "dsp/inthalfbandfilterst.h" +#include "dsp/inthalfbandfiltereo1.h" #else #include "dsp/inthalfbandfilterdb.h" #endif @@ -125,12 +125,12 @@ public: private: #ifdef USE_SSE4_1 - IntHalfbandFilterST m_decimator2; // 1st stages - IntHalfbandFilterST m_decimator4; // 2nd stages - IntHalfbandFilterST m_decimator8; // 3rd stages - IntHalfbandFilterST m_decimator16; // 4th stages - IntHalfbandFilterST m_decimator32; // 5th stages - IntHalfbandFilterST m_decimator64; // 6th stages + IntHalfbandFilterEO1 m_decimator2; // 1st stages + IntHalfbandFilterEO1 m_decimator4; // 2nd stages + IntHalfbandFilterEO1 m_decimator8; // 3rd stages + IntHalfbandFilterEO1 m_decimator16; // 4th stages + IntHalfbandFilterEO1 m_decimator32; // 5th stages + IntHalfbandFilterEO1 m_decimator64; // 6th stages #else IntHalfbandFilterDB m_decimator2; // 1st stages IntHalfbandFilterDB m_decimator4; // 2nd stages diff --git a/sdrbase/dsp/inthalfbandfilterdb.h b/sdrbase/dsp/inthalfbandfilterdb.h index 49de946ba..5b03e385d 100644 --- a/sdrbase/dsp/inthalfbandfilterdb.h +++ b/sdrbase/dsp/inthalfbandfilterdb.h @@ -41,7 +41,7 @@ public: { case 0: // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -51,7 +51,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; @@ -71,7 +71,7 @@ public: // save result doFIR(SampleOut); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we didn't consume the sample @@ -83,7 +83,7 @@ public: // save result doFIR(SampleOut); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we consumed the sample @@ -100,7 +100,7 @@ public: { case 0: // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -110,7 +110,7 @@ public: // save result doFIR(x, y); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we have a new sample @@ -127,7 +127,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) -sample->imag(), (FixReal) sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -139,7 +139,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; // tell caller we have a new sample @@ -149,7 +149,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) sample->imag(), (FixReal) -sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; // tell caller we don't have a new sample @@ -161,7 +161,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we have a new sample @@ -186,7 +186,7 @@ public: sampleOut->setImag(-s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; @@ -204,7 +204,7 @@ public: sampleOut->setImag(-s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; @@ -222,7 +222,7 @@ public: sampleOut->setImag(s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; @@ -240,7 +240,7 @@ public: sampleOut->setImag(s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; @@ -259,7 +259,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) sample->imag(), (FixReal) -sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -271,7 +271,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; // tell caller we have a new sample @@ -281,7 +281,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) -sample->imag(), (FixReal) sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; // tell caller we don't have a new sample @@ -293,7 +293,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we have a new sample @@ -318,7 +318,7 @@ public: sampleOut->setImag(s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; @@ -336,7 +336,7 @@ public: sampleOut->setImag(-s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; @@ -354,7 +354,7 @@ public: sampleOut->setImag(-s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; @@ -372,7 +372,7 @@ public: sampleOut->setImag(s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; @@ -385,21 +385,21 @@ public: void myDecimate(const Sample* sample1, Sample* sample2) { storeSample((FixReal) sample1->real(), (FixReal) sample1->imag()); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); storeSample((FixReal) sample2->real(), (FixReal) sample2->imag()); doFIR(sample2); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); } void myDecimate(qint32 x1, qint32 y1, qint32 *x2, qint32 *y2) { storeSample(x1, y1); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); storeSample(*x2, *y2); doFIR(x2, y2); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); } protected: @@ -424,6 +424,11 @@ protected: m_samplesDB[m_ptr + m_size][1] = y; } + void advancePointer() + { + m_ptr = m_ptr + 1 < m_size ? m_ptr + 1: 0; + } + void doFIR(Sample* sample) { int a = m_ptr + m_size; // tip pointer diff --git a/sdrbase/dsp/inthalfbandfiltereo1.h b/sdrbase/dsp/inthalfbandfiltereo1.h index 7e2dda65c..c6661128b 100644 --- a/sdrbase/dsp/inthalfbandfiltereo1.h +++ b/sdrbase/dsp/inthalfbandfiltereo1.h @@ -450,7 +450,7 @@ protected: void advancePointer() { - m_ptr = (m_ptr + 1) % (2*m_size); + m_ptr = m_ptr + 1 < 2*m_size ? m_ptr + 1: 0; } void doFIR(Sample* sample) diff --git a/sdrbase/dsp/inthalfbandfilterst.h b/sdrbase/dsp/inthalfbandfilterst.h index ee5875396..4651fba20 100644 --- a/sdrbase/dsp/inthalfbandfilterst.h +++ b/sdrbase/dsp/inthalfbandfilterst.h @@ -42,7 +42,7 @@ public: { case 0: // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -52,7 +52,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; @@ -72,7 +72,7 @@ public: // save result doFIR(SampleOut); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we didn't consume the sample @@ -84,7 +84,7 @@ public: // save result doFIR(SampleOut); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we consumed the sample @@ -101,7 +101,7 @@ public: { case 0: // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -111,7 +111,7 @@ public: // save result doFIR(x, y); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we have a new sample @@ -128,7 +128,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) -sample->imag(), (FixReal) sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -140,7 +140,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; // tell caller we have a new sample @@ -150,7 +150,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) sample->imag(), (FixReal) -sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; // tell caller we don't have a new sample @@ -162,7 +162,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we have a new sample @@ -187,7 +187,7 @@ public: sampleOut->setImag(-s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; @@ -205,7 +205,7 @@ public: sampleOut->setImag(-s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; @@ -223,7 +223,7 @@ public: sampleOut->setImag(s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; @@ -241,7 +241,7 @@ public: sampleOut->setImag(s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; @@ -260,7 +260,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) sample->imag(), (FixReal) -sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; // tell caller we don't have a new sample @@ -272,7 +272,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; // tell caller we have a new sample @@ -282,7 +282,7 @@ public: // insert sample into ring-buffer storeSample((FixReal) -sample->imag(), (FixReal) sample->real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; // tell caller we don't have a new sample @@ -294,7 +294,7 @@ public: // save result doFIR(sample); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; // tell caller we have a new sample @@ -319,7 +319,7 @@ public: sampleOut->setImag(s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 1; @@ -337,7 +337,7 @@ public: sampleOut->setImag(-s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 2; @@ -355,7 +355,7 @@ public: sampleOut->setImag(-s.real()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 3; @@ -373,7 +373,7 @@ public: sampleOut->setImag(s.imag()); // advance write-pointer - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); // next state m_state = 0; @@ -386,21 +386,21 @@ public: void myDecimate(const Sample* sample1, Sample* sample2) { storeSample((FixReal) sample1->real(), (FixReal) sample1->imag()); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); storeSample((FixReal) sample2->real(), (FixReal) sample2->imag()); doFIR(sample2); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); } void myDecimate(qint32 x1, qint32 y1, qint32 *x2, qint32 *y2) { storeSample(x1, y1); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); storeSample(*x2, *y2); doFIR(x2, y2); - m_ptr = (m_ptr + 1) % m_size; + advancePointer(); } protected: @@ -431,6 +431,11 @@ protected: m_samplesDB[m_ptr + m_size][1] = y; } + void advancePointer() + { + m_ptr = m_ptr + 1 < m_size ? m_ptr + 1: 0; + } + void doFIR(Sample* sample) { // calculate on odd values @@ -442,9 +447,10 @@ protected: m_iOddAcc = 0; m_qOddAcc = 0; #ifdef USE_SSE4_1 - memcpy((void *) m_samplesAligned, (const void *) &(m_samplesDB[ m_ptr + 1][0]), HBFilterOrder*2*sizeof(qint32)); - IntHalfbandFilterSTIntrinsics::work( - m_samplesAligned, +// memcpy((void *) m_samplesAligned, (const void *) &(m_samplesDB[ m_ptr + 1][0]), HBFilterOrder*2*sizeof(qint32)); + IntHalfbandFilterSTIntrinsics::workNA( + m_ptr + 1, + m_samplesDB, m_iEvenAcc, m_qEvenAcc, m_iOddAcc, @@ -490,9 +496,10 @@ protected: m_qOddAcc = 0; #ifdef USE_SSE4_1 - memcpy((void *) m_samplesAligned, (const void *) &(m_samplesDB[ m_ptr + 1][0]), HBFilterOrder*2*sizeof(qint32)); - IntHalfbandFilterSTIntrinsics::work( - m_samplesAligned, +// memcpy((void *) m_samplesAligned, (const void *) &(m_samplesDB[ m_ptr + 1][0]), HBFilterOrder*2*sizeof(qint32)); + IntHalfbandFilterSTIntrinsics::workNA( + m_ptr + 1, + m_samplesDB, m_iEvenAcc, m_qEvenAcc, m_iOddAcc, diff --git a/sdrbase/dsp/inthalfbandfiltersti.h b/sdrbase/dsp/inthalfbandfiltersti.h index 08883d58b..8e347e411 100644 --- a/sdrbase/dsp/inthalfbandfiltersti.h +++ b/sdrbase/dsp/inthalfbandfiltersti.h @@ -1,92 +1,144 @@ -/////////////////////////////////////////////////////////////////////////////////// -// Copyright (C) 2016 F4EXB // -// written by Edouard Griffiths // -// // -// Integer half-band FIR based interpolator and decimator // -// This is the even/odd and I/Q stride with double buffering variant // -// This is the SIMD intrinsics code // -// // -// This program is free software; you can redistribute it and/or modify // -// it under the terms of the GNU General Public License as published by // -// the Free Software Foundation as version 3 of the License, or // -// // -// This program is distributed in the hope that it will be useful, // -// but WITHOUT ANY WARRANTY; without even the implied warranty of // -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // -// GNU General Public License V3 for more details. // -// // -// You should have received a copy of the GNU General Public License // -// along with this program. If not, see . // -/////////////////////////////////////////////////////////////////////////////////// - -#ifndef SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ -#define SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ - -#include - -#if defined(USE_SSE4_1) -#include -#endif - -#include "hbfiltertraits.h" - -template -class IntHalfbandFilterSTIntrinsics -{ -public: - static void work( - int32_t samples[HBFilterOrder][2], - int32_t& iEvenAcc, int32_t& qEvenAcc, - int32_t& iOddAcc, int32_t& qOddAcc) - { -#if defined(USE_SSE4_1) - int a = HBFIRFilterTraits::hbOrder - 2; // tip - int b = 0; // tail - const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; - __m128i sum = _mm_setzero_si128(); - __m128i sh, shh, sa, sb; - int32_t sums[4] __attribute__ ((aligned (16))); - - for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) - { - sh = _mm_load_si128(h); - shh = _mm_shuffle_epi32(sh, _MM_SHUFFLE(0,0,0,0)); - sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq - sb = _mm_load_si128((__m128i*) &(samples[b][0])); - sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); - a -= 2; - b += 2; - shh = _mm_shuffle_epi32(sh, _MM_SHUFFLE(1,1,1,1)); - sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq - sb = _mm_load_si128((__m128i*) &(samples[b][0])); - sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); - a -= 2; - b += 2; - shh = _mm_shuffle_epi32(sh, _MM_SHUFFLE(2,2,2,2)); - sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq - sb = _mm_load_si128((__m128i*) &(samples[b][0])); - sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); - a -= 2; - b += 2; - shh = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3,3,3,3)); - sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq - sb = _mm_load_si128((__m128i*) &(samples[b][0])); - sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); - a -= 2; - b += 2; - ++h; - } - - // Extract values from sum vector - _mm_store_si128((__m128i*) sums, sum); - iEvenAcc = sums[0]; - qEvenAcc = sums[1]; - iOddAcc = sums[2]; - qOddAcc = sums[3]; -#endif - } -}; - - - -#endif /* SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ */ +/////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2016 F4EXB // +// written by Edouard Griffiths // +// // +// Integer half-band FIR based interpolator and decimator // +// This is the even/odd and I/Q stride with double buffering variant // +// This is the SIMD intrinsics code // +// // +// This program is free software; you can redistribute it and/or modify // +// it under the terms of the GNU General Public License as published by // +// the Free Software Foundation as version 3 of the License, or // +// // +// This program is distributed in the hope that it will be useful, // +// but WITHOUT ANY WARRANTY; without even the implied warranty of // +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // +// GNU General Public License V3 for more details. // +// // +// You should have received a copy of the GNU General Public License // +// along with this program. If not, see . // +/////////////////////////////////////////////////////////////////////////////////// + +#ifndef SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ +#define SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ + +#include + +#if defined(USE_SSE4_1) +#include +#endif + +#include "hbfiltertraits.h" + +template +class IntHalfbandFilterSTIntrinsics +{ +public: + static void work( + int32_t samples[HBFilterOrder][2], + int32_t& iEvenAcc, int32_t& qEvenAcc, + int32_t& iOddAcc, int32_t& qOddAcc) + { +#if defined(USE_SSE4_1) + int a = HBFIRFilterTraits::hbOrder - 2; // tip + int b = 0; // tail + const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; + __m128i sum = _mm_setzero_si128(); + __m128i shh, sa, sb; + int32_t sums[4] __attribute__ ((aligned (16))); + + for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) + { + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(0,0,0,0)); + sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_load_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(1,1,1,1)); + sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_load_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(2,2,2,2)); + sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_load_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(3,3,3,3)); + sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_load_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + ++h; + } + + // Extract values from sum vector + _mm_store_si128((__m128i*) sums, sum); + iEvenAcc = sums[0]; + qEvenAcc = sums[1]; + iOddAcc = sums[2]; + qOddAcc = sums[3]; +#endif + } + + // not aligned version + static void workNA( + int ptr, + int32_t samples[HBFilterOrder*2][2], + int32_t& iEvenAcc, int32_t& qEvenAcc, + int32_t& iOddAcc, int32_t& qOddAcc) + { +#if defined(USE_SSE4_1) + int a = ptr + HBFIRFilterTraits::hbOrder - 2; // tip + int b = ptr + 0; // tail + const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; + __m128i sum = _mm_setzero_si128(); + __m128i shh, sa, sb; + int32_t sums[4] __attribute__ ((aligned (16))); + + for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) + { + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(0,0,0,0)); + sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(1,1,1,1)); + sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(2,2,2,2)); + sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(3,3,3,3)); + sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq + sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); + sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); + a -= 2; + b += 2; + ++h; + } + + // Extract values from sum vector + _mm_store_si128((__m128i*) sums, sum); + iEvenAcc = sums[0]; + qEvenAcc = sums[1]; + iOddAcc = sums[2]; + qOddAcc = sums[3]; +#endif + } +}; + + + +#endif /* SDRBASE_DSP_INTHALFBANDFILTERSTI_H_ */