replace fftw with pffft

2025-09-06 15:17:56 -04:00 · 2020-06-27 18:39:46 +00:00 · 2020-06-27 18:39:46 +00:00 · 9bf5d0b233
commit 9bf5d0b233
parent 071d85029e
8 changed files with 2099 additions and 1430 deletions
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 TARGET = ft8d

 OBJECTS = \
-  crc14.o crc.o ft8_downsample.o sync8d.o sync8.o grid2deg.o fftw3mod.o \
+  crc14.o crc.o ft8_downsample.o sync8d.o sync8.o grid2deg.o pffft.o \
  four2a.o deg2grid.o determ.o baseline.o platanh.o bpdecode174_91.o \
  fmtmsg.o packjt.o chkcrc14a.o indexx.o shell.o pctile.o polyfit.o \
  twkfreq1.o osd174_91.o encode174_91.o chkcall.o packjt77.o genft8.o \
--- a/fftw3.f03
+++ b/fftw3.f03
--- a/fftw3.f90
+++ b/fftw3.f90
@ -1,64 +0,0 @@
-  INTEGER FFTW_R2HC
-  PARAMETER (FFTW_R2HC=0)
-  INTEGER FFTW_HC2R
-  PARAMETER (FFTW_HC2R=1)
-  INTEGER FFTW_DHT
-  PARAMETER (FFTW_DHT=2)
-  INTEGER FFTW_REDFT00
-  PARAMETER (FFTW_REDFT00=3)
-  INTEGER FFTW_REDFT01
-  PARAMETER (FFTW_REDFT01=4)
-  INTEGER FFTW_REDFT10
-  PARAMETER (FFTW_REDFT10=5)
-  INTEGER FFTW_REDFT11
-  PARAMETER (FFTW_REDFT11=6)
-  INTEGER FFTW_RODFT00
-  PARAMETER (FFTW_RODFT00=7)
-  INTEGER FFTW_RODFT01
-  PARAMETER (FFTW_RODFT01=8)
-  INTEGER FFTW_RODFT10
-  PARAMETER (FFTW_RODFT10=9)
-  INTEGER FFTW_RODFT11
-  PARAMETER (FFTW_RODFT11=10)
-  INTEGER FFTW_FORWARD
-  PARAMETER (FFTW_FORWARD=-1)
-  INTEGER FFTW_BACKWARD
-  PARAMETER (FFTW_BACKWARD=+1)
-  INTEGER FFTW_MEASURE
-  PARAMETER (FFTW_MEASURE=0)
-  INTEGER FFTW_DESTROY_INPUT
-  PARAMETER (FFTW_DESTROY_INPUT=1)
-  INTEGER FFTW_UNALIGNED
-  PARAMETER (FFTW_UNALIGNED=2)
-  INTEGER FFTW_CONSERVE_MEMORY
-  PARAMETER (FFTW_CONSERVE_MEMORY=4)
-  INTEGER FFTW_EXHAUSTIVE
-  PARAMETER (FFTW_EXHAUSTIVE=8)
-  INTEGER FFTW_PRESERVE_INPUT
-  PARAMETER (FFTW_PRESERVE_INPUT=16)
-  INTEGER FFTW_PATIENT
-  PARAMETER (FFTW_PATIENT=32)
-  INTEGER FFTW_ESTIMATE
-  PARAMETER (FFTW_ESTIMATE=64)
-  INTEGER FFTW_ESTIMATE_PATIENT
-  PARAMETER (FFTW_ESTIMATE_PATIENT=128)
-  INTEGER FFTW_BELIEVE_PCOST
-  PARAMETER (FFTW_BELIEVE_PCOST=256)
-  INTEGER FFTW_DFT_R2HC_ICKY
-  PARAMETER (FFTW_DFT_R2HC_ICKY=512)
-  INTEGER FFTW_NONTHREADED_ICKY
-  PARAMETER (FFTW_NONTHREADED_ICKY=1024)
-  INTEGER FFTW_NO_BUFFERING
-  PARAMETER (FFTW_NO_BUFFERING=2048)
-  INTEGER FFTW_NO_INDIRECT_OP
-  PARAMETER (FFTW_NO_INDIRECT_OP=4096)
-  INTEGER FFTW_ALLOW_LARGE_GENERIC
-  PARAMETER (FFTW_ALLOW_LARGE_GENERIC=8192)
-  INTEGER FFTW_NO_RANK_SPLITS
-  PARAMETER (FFTW_NO_RANK_SPLITS=16384)
-  INTEGER FFTW_NO_VRANK_SPLITS
-  PARAMETER (FFTW_NO_VRANK_SPLITS=32768)
-  INTEGER FFTW_NO_VRECURSE
-  PARAMETER (FFTW_NO_VRECURSE=65536)
-  INTEGER FFTW_NO_SIMD
-  PARAMETER (FFTW_NO_SIMD=131072)
--- a/fftw3mod.f90
+++ b/fftw3mod.f90
@ -1,4 +0,0 @@
-module FFTW3
-  use, intrinsic :: iso_c_binding
-  include 'fftw3.f03'
-end module FFTW3
--- a/four2a.c
+++ b/four2a.c
@ -0,0 +1,40 @@
+#include "pffft.h"
+
+static struct
+{
+    PFFFT_Setup *s;
+    int n;
+} setups[10];
+
+static int size;
+
+void four2a_(float *a, int *nfft, int *ndim, int *sign, int *form)
+{
+    int i;
+    PFFFT_Setup *s;
+    pffft_direction_t direction;
+
+    s = NULL;
+    for(i = 0; i < size; ++i)
+    {
+        if(setups[i].n == *nfft)
+        {
+            s = setups[i].s;
+            break;
+        }
+    }
+
+    if(s == NULL && size < 10)
+    {
+        s = pffft_new_setup(*nfft, PFFFT_COMPLEX);
+        setups[size].s = s;
+        setups[size].n = *nfft;
+        ++size;
+    }
+
+    if(s != NULL)
+    {
+        direction = *sign == 1 ? PFFFT_BACKWARD : PFFFT_FORWARD;
+        pffft_transform_ordered(s, a, a, NULL, direction);
+    }
+}
--- a/four2a.f90
+++ b/four2a.f90
@ -1,115 +0,0 @@
-subroutine four2a(a,nfft,ndim,isign,iform)
-
-! IFORM = 1, 0 or -1, as data is
-! complex, real, or the first half of a complex array.  Transform
-! values are returned in array DATA.  They are complex, real, or
-! the first half of a complex array, as IFORM = 1, -1 or 0.
-
-! The transform of a real array (IFORM = 0) dimensioned N(1) by N(2)
-! by ... will be returned in the same array, now considered to
-! be complex of dimensions N(1)/2+1 by N(2) by ....  Note that if
-! IFORM = 0 or -1, N(1) must be even, and enough room must be
-! reserved.  The missing values may be obtained by complex conjugation.
-
-! The reverse transformation of a half complex array dimensioned
-! N(1)/2+1 by N(2) by ..., is accomplished by setting IFORM
-! to -1.  In the N array, N(1) must be the true N(1), not N(1)/2+1.
-! The transform will be real and returned to the input array.
-
-! This version of four2a makes calls to the FFTW library to do the
-! actual computations.
-
-  use fftw3
-  parameter (NPMAX=2100)                 !Max numberf of stored plans
-  parameter (NSMALL=16384)               !Max size of "small" FFTs
-  complex a(nfft)                        !Array to be transformed
-  complex aa(NSMALL)                     !Local copy of "small" a()
-  integer nn(NPMAX),ns(NPMAX),nf(NPMAX)  !Params of stored plans
-  integer*8 nl(NPMAX),nloc               !More params of plans
-  integer*8 plan(NPMAX)                  !Pointers to stored plans
-  logical found_plan
-  data nplan/0/                          !Number of stored plans
-  common/patience/npatience,nthreads     !Patience and threads for FFTW plans
-  save plan,nplan,nn,ns,nf,nl
-
-  if(nfft.lt.0) go to 999
-
-  nloc=loc(a)
-
-  found_plan = .false.
-  !$omp critical(four2a_setup)
-  do i=1,nplan
-     if(nfft.eq.nn(i) .and. isign.eq.ns(i) .and.                     &
-          iform.eq.nf(i) .and. nloc.eq.nl(i)) then
-        found_plan = .true.
-        exit
-     end if
-  enddo
-
-  if(i.ge.NPMAX) stop 'Too many FFTW plans requested.'
-
-  if (.not. found_plan) then
-     nplan=nplan+1
-     i=nplan
-
-     nn(i)=nfft
-     ns(i)=isign
-     nf(i)=iform
-     nl(i)=nloc
-
-! Planning: FFTW_ESTIMATE, FFTW_ESTIMATE_PATIENT, FFTW_MEASURE,
-!            FFTW_PATIENT,  FFTW_EXHAUSTIVE
-     nflags=FFTW_ESTIMATE
-     if(npatience.eq.1) nflags=FFTW_ESTIMATE_PATIENT
-     if(npatience.eq.2) nflags=FFTW_MEASURE
-     if(npatience.eq.3) nflags=FFTW_PATIENT
-     if(npatience.eq.4) nflags=FFTW_EXHAUSTIVE
-
-     if(nfft.le.NSMALL) then
-        jz=nfft
-        if(iform.eq.0) jz=nfft/2
-        aa(1:jz)=a(1:jz)
-     endif
-
-     !$omp critical(fftw) ! serialize non thread-safe FFTW3 calls
-     if(isign.eq.-1 .and. iform.eq.1) then
-        call sfftw_plan_dft_1d(plan(i),nfft,a,a,FFTW_FORWARD,nflags)
-     else if(isign.eq.1 .and. iform.eq.1) then
-        call sfftw_plan_dft_1d(plan(i),nfft,a,a,FFTW_BACKWARD,nflags)
-     else if(isign.eq.-1 .and. iform.eq.0) then
-        call sfftw_plan_dft_r2c_1d(plan(i),nfft,a,a,nflags)
-     else if(isign.eq.1 .and. iform.eq.-1) then
-        call sfftw_plan_dft_c2r_1d(plan(i),nfft,a,a,nflags)
-     else
-        stop 'Unsupported request in four2a'
-     endif
-     !$omp end critical(fftw)
-
-     if(nfft.le.NSMALL) then
-        jz=nfft
-        if(iform.eq.0) jz=nfft/2
-        a(1:jz)=aa(1:jz)
-     endif
-  end if
-  !$omp end critical(four2a_setup)
-
-  call sfftw_execute(plan(i))
-  return
-
-999 continue
-
-  !$omp critical(four2a)
-  do i=1,nplan
-! The test is only to silence a compiler warning:
-     if(ndim.ne.-999) then
-        !$omp critical(fftw) ! serialize non thread-safe FFTW3 calls
-        call sfftw_destroy_plan(plan(i))
-        !$omp end critical(fftw)
-     end if
-  enddo
-  call fftwf_cleanup()
-  nplan=0
-  !$omp end critical(four2a)
-
-  return
-end subroutine four2a
--- a/pffft.c
+++ b/pffft.c
--- a/pffft.h
+++ b/pffft.h
@ -0,0 +1,177 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.
+
+   Restrictions:
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have an "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h> // for size_t
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads.
+  */
+  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  void pffft_destroy_setup(PFFFT_Setup *);
+  /*
+     Perform a Fourier transform , The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384).
+
+     input and output may alias.
+  */
+  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+
+     input and output may alias.
+  */
+  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+
+     input and output should not alias.
+  */
+  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /*
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /*
+    the float buffers must have the correct alignment (16-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.
+  */
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffft_aligned_free(void *);
+
+  /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
+  int pffft_simd_size();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PFFFT_H