From 35750f4486708d0e5700ee583fc92b2d6f0b0cf4 Mon Sep 17 00:00:00 2001 From: Pavel Demin Date: Wed, 20 Apr 2022 11:59:57 +0200 Subject: [PATCH] update pffft --- pffft.c | 35 +++++++++++++++++++---------------- pffft.h | 2 +- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pffft.c b/pffft.c index 5280455..21c7bc3 100644 --- a/pffft.c +++ b/pffft.c @@ -96,7 +96,8 @@ /* Altivec support macros */ -#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__)) +#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) +#include <altivec.h> typedef vector float v4sf; # define SIMD_SZ 4 # define VZERO() ((vector float) vec_splat_u8(0)) @@ -108,8 +109,8 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p # define LD_PS1(p) ld_ps1(&p) # define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } # define UNINTERLEAVE2(in1, in2, out1, out2) { \ - vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ - vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ + vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \ + vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \ v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ } # define VTRANSPOSE4(x0,x1,x2,x3) { \ @@ -122,13 +123,13 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p x2 = vec_mergeh(y1, y3); \ x3 = vec_mergel(y1, y3); \ } -# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) -# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0) +# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}) +# 
define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) /* SSE1 support macros -#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)) +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86)) #include <xmmintrin.h> typedef __m128 v4sf; @@ -143,7 +144,7 @@ typedef __m128 v4sf; # define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } # define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) # define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) -# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) /* ARM NEON support macros @@ -170,7 +171,7 @@ typedef float32x4_t v4sf; // marginally faster version //# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } # define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) -# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) #else # if !defined(PFFFT_SIMD_DISABLE) # warning "building with simd disabled !\n"; @@ -188,7 +189,7 @@ typedef float v4sf; # define VMADD(a,b,c) ((a)*(b)+(c)) # define VSUB(a,b) ((a)-(b)) # define LD_PS1(p) (p) -# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) #endif // shortcuts for complex multiplcations @@ -210,7 +211,7 @@ typedef union v4sf_union { #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) /* detect bugs with the vector support macros */ -void validate_pffft_simd() { +void validate_pffft_simd(void) { float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; v4sf_union a0, a1, a2, a3, t, u; 
memcpy(a0.f, f, 4*sizeof(float)); @@ -246,6 +247,8 @@ void validate_pffft_simd() { a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); } +#else +void validate_pffft_simd() {} // allow test_pffft.c to call this function even when simd is not available.. #endif //!PFFFT_SIMD_DISABLE /* SSE and co like 16-bytes aligned pointers */ @@ -319,7 +322,7 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, dr3 = VADD(cr2, ci3); di2 = VADD(ci2, cr3); di3 = VSUB(ci2, cr3); - wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; + wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch[i+l1ido] = dr2; ch[i+l1ido + 1] = di2; @@ -379,14 +382,14 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, cr4 = VSUB(tr1, tr4); ci2 = VADD(ti1, ti4); ci4 = VSUB(ti1, ti4); - wr1=wa1[i], wi1=fsign*wa1[i+1]; + wr1=wa1[i]; wi1=fsign*wa1[i+1]; VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); - wr2=wa2[i], wi2=fsign*wa2[i+1]; + wr2=wa2[i]; wi2=fsign*wa2[i+1]; ch[i + l1ido] = cr2; ch[i + l1ido + 1] = ci2; VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); - wr3=wa3[i], wi3=fsign*wa3[i+1]; + wr3=wa3[i]; wi3=fsign*wa3[i+1]; ch[i + 2*l1ido] = cr3; ch[i + 2*l1ido + 1] = ci3; @@ -448,8 +451,8 @@ static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, dr2 = VSUB(cr2, ci5); di5 = VSUB(ci2, cr5); di2 = VADD(ci2, cr5); - wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; - wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; + wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1]; VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch_ref(i - 1, 2) = dr2; ch_ref(i, 2) = di2; diff --git a/pffft.h b/pffft.h index 5126baf..811f573 100644 --- a/pffft.h 
+++ b/pffft.h @@ -168,7 +168,7 @@ extern "C" { void pffft_aligned_free(void *); /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ - int pffft_simd_size(); + int pffft_simd_size(void); #ifdef __cplusplus }