From efbce97a2f8f8d75856b9cebf550df0e31892ec5 Mon Sep 17 00:00:00 2001 From: f4exb Date: Thu, 10 Nov 2016 02:56:49 +0100 Subject: [PATCH] Improve int halfband filter even/odd I/Q stride variant --- sdrbase/dsp/inthalfbandfiltersti.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sdrbase/dsp/inthalfbandfiltersti.h b/sdrbase/dsp/inthalfbandfiltersti.h index 8e347e411..8c5edbcc8 100644 --- a/sdrbase/dsp/inthalfbandfiltersti.h +++ b/sdrbase/dsp/inthalfbandfiltersti.h @@ -42,38 +42,37 @@ public: #if defined(USE_SSE4_1) int a = HBFIRFilterTraits::hbOrder - 2; // tip int b = 0; // tail - const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; + const int *h = (const int*) HBFIRFilterTraits::hbCoeffs; __m128i sum = _mm_setzero_si128(); __m128i shh, sa, sb; int32_t sums[4] __attribute__ ((aligned (16))); for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) { - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(0,0,0,0)); + shh = _mm_set_epi32(h[4*i], h[4*i], h[4*i], h[4*i]); sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_load_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(1,1,1,1)); + shh = _mm_set_epi32(h[4*i+1], h[4*i+1], h[4*i+1], h[4*i+1]); sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_load_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(2,2,2,2)); + shh = _mm_set_epi32(h[4*i+2], h[4*i+2], h[4*i+2], h[4*i+2]); sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_load_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(3,3,3,3)); + shh = _mm_set_epi32(h[4*i+3], h[4*i+3], h[4*i+3], h[4*i+3]); sa = _mm_load_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_load_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - ++h; } // Extract values from sum vector @@ -95,38 +94,37 @@ public: #if defined(USE_SSE4_1) int a = ptr + HBFIRFilterTraits::hbOrder - 2; // tip int b = ptr + 0; // tail - const __m128i* h = (const __m128i*) HBFIRFilterTraits::hbCoeffs; + const int *h = (const int*) HBFIRFilterTraits::hbCoeffs; __m128i sum = _mm_setzero_si128(); __m128i shh, sa, sb; int32_t sums[4] __attribute__ ((aligned (16))); for (int i = 0; i < HBFIRFilterTraits::hbOrder / 16; i++) { - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(0,0,0,0)); + shh = _mm_set_epi32(h[4*i], h[4*i], h[4*i], h[4*i]); sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(1,1,1,1)); + shh = _mm_set_epi32(h[4*i+1], h[4*i+1], h[4*i+1], h[4*i+1]); sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(2,2,2,2)); + shh = _mm_set_epi32(h[4*i+2], h[4*i+2], h[4*i+2], h[4*i+2]); sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - shh = _mm_shuffle_epi32(*h, _MM_SHUFFLE(3,3,3,3)); + shh = _mm_set_epi32(h[4*i+3], h[4*i+3], h[4*i+3], h[4*i+3]); sa = _mm_loadu_si128((__m128i*) &(samples[a][0])); // Ei,Eq,Oi,Oq sb = _mm_loadu_si128((__m128i*) &(samples[b][0])); sum = _mm_add_epi32(sum, _mm_mullo_epi32(_mm_add_epi32(sa, sb), shh)); a -= 2; b += 2; - ++h; } // Extract values from sum vector