update pffft

Pavel Demin 2022-04-20 11:59:57 +02:00
parent 3fae21d8b3
commit 35750f4486
2 changed files with 20 additions and 17 deletions

pffft.c (35 changed lines)

@@ -96,7 +96,8 @@
/*
Altivec support macros
*/
-#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__))
#include <altivec.h>
typedef vector float v4sf;
# define SIMD_SZ 4
# define VZERO() ((vector float) vec_splat_u8(0))
@@ -108,8 +109,8 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
-vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
-vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
+vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
}
# define VTRANSPOSE4(x0,x1,x2,x3) { \
@@ -122,13 +123,13 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
x2 = vec_mergeh(y1, y3); \
x3 = vec_mergel(y1, y3); \
}
-# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
-# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
+# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
+# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
/*
SSE1 support macros
*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
#include <xmmintrin.h>
typedef __m128 v4sf;
@@ -143,7 +144,7 @@ typedef __m128 v4sf;
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
+# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
/*
ARM NEON support macros
@@ -170,7 +171,7 @@ typedef float32x4_t v4sf;
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
+# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
@@ -188,7 +189,7 @@ typedef float v4sf;
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
-# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
+# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
#endif
// shortcuts for complex multiplcations
@@ -210,7 +211,7 @@ typedef union v4sf_union {
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
/* detect bugs with the vector support macros */
-void validate_pffft_simd() {
+void validate_pffft_simd(void) {
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
v4sf_union a0, a1, a2, a3, t, u;
memcpy(a0.f, f, 4*sizeof(float));
@@ -246,6 +247,8 @@ void validate_pffft_simd() {
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
+#else
+void validate_pffft_simd() {} // allow test_pffft.c to call this function even when simd is not available..
#endif //!PFFFT_SIMD_DISABLE
/* SSE and co like 16-bytes aligned pointers */
@@ -319,7 +322,7 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
dr3 = VADD(cr2, ci3);
di2 = VADD(ci2, cr3);
di3 = VSUB(ci2, cr3);
-wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
+wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch[i+l1ido] = dr2;
ch[i+l1ido + 1] = di2;
@@ -379,14 +382,14 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
cr4 = VSUB(tr1, tr4);
ci2 = VADD(ti1, ti4);
ci4 = VSUB(ti1, ti4);
-wr1=wa1[i], wi1=fsign*wa1[i+1];
+wr1=wa1[i]; wi1=fsign*wa1[i+1];
VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
-wr2=wa2[i], wi2=fsign*wa2[i+1];
+wr2=wa2[i]; wi2=fsign*wa2[i+1];
ch[i + l1ido] = cr2;
ch[i + l1ido + 1] = ci2;
VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
-wr3=wa3[i], wi3=fsign*wa3[i+1];
+wr3=wa3[i]; wi3=fsign*wa3[i+1];
ch[i + 2*l1ido] = cr3;
ch[i + 2*l1ido + 1] = ci3;
@@ -448,8 +451,8 @@ static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
dr2 = VSUB(cr2, ci5);
di5 = VSUB(ci2, cr5);
di2 = VADD(ci2, cr5);
-wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
-wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
+wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
+wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1];
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch_ref(i - 1, 2) = dr2;
ch_ref(i, 2) = di2;
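Note (not part of the commit): the VALIGNED changes above swap a (long) cast for a (long long) cast before masking the pointer's low bits. On LLP64 targets such as 64-bit Windows, long is only 32 bits while pointers are 64 bits, so the old cast truncates the pointer and typically triggers pointer-truncation warnings; long long (or, more portably, uintptr_t) is wide enough on all common platforms. A minimal sketch, using the hypothetical macro names ALIGNED16 and ALIGNED16_PORTABLE rather than pffft's VALIGNED:

    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as pffft's VALIGNED after this commit: cast the pointer
       through a 64-bit integer before masking the low 4 bits. */
    #define ALIGNED16(ptr)          ((((long long)(ptr)) & 0xF) == 0)

    /* Portable spelling: uintptr_t is the integer type defined to
       round-trip a pointer value. */
    #define ALIGNED16_PORTABLE(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

    int main(void) {
      float buf[8];
      /* On LLP64 (64-bit Windows) sizeof(long) == 4 while pointers are
         8 bytes, so a (long) cast truncates the pointer and draws
         "cast from pointer to integer of different size" warnings. */
      printf("long=%zu long long=%zu void*=%zu\n",
             sizeof(long), sizeof(long long), sizeof(void *));
      printf("buf 16-byte aligned: %d %d\n",
             (int)ALIGNED16(buf), (int)ALIGNED16_PORTABLE(buf));
      return 0;
    }

Since only the low bits are tested, the masked result is the same either way; the wider cast mainly keeps 64-bit Windows builds warning-clean.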

pffft.h (2 changed lines)

@@ -168,7 +168,7 @@ extern "C" {
void pffft_aligned_free(void *);
/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
-int pffft_simd_size();
+int pffft_simd_size(void);
#ifdef __cplusplus
}
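Note (not part of the diff): the (void) additions to validate_pffft_simd and pffft_simd_size matter because C, unlike C++, treats an empty parameter list in a declaration as "arguments unspecified" rather than "no arguments" (C23 finally made () mean (void)). A small sketch with the hypothetical names legacy_decl and strict_decl:

    #include <stdio.h>

    /* Pre-C23 C: an empty parameter list leaves the signature unspecified,
       so a caller could pass arguments and the compiler would not object. */
    int legacy_decl();

    /* With (void) the declaration is a full prototype: no arguments allowed,
       so a call such as strict_decl(1) is rejected at compile time. */
    int strict_decl(void);

    int legacy_decl() { return 1; }
    int strict_decl(void) { return 1; }

    int main(void) {
      printf("%d %d\n", legacy_decl(), strict_decl());
      return 0;
    }

The prototype form also silences warnings such as GCC's -Wstrict-prototypes in strict builds.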