From 94472ac31c664996704e5966246682b19bbba787 Mon Sep 17 00:00:00 2001 From: Joe Taylor Date: Sun, 1 Feb 2015 16:23:36 +0000 Subject: [PATCH] Many changes in program jt9[.exe] aimed at speeding up the decoders. The long FFTs can now use the multi-threaded FFTW routines. Subroutine decode9.f90 was renamed jt9fano.f90. The JT9 decoder's top-level functions were removed from decoder.f90 and put into a separate subroutine decjt90.f90. Subroutine decoder.f90 is now configured for possible use of OpenMP SECTIONS, with the JT9 and JT65 decoders running concurrently on a multi-core machine. Note, however, that this concurrent processing is not yet fully implemented. Probably calls to timer need to be removed; some variables used in calls to jt65a and decjt9 may need to be declared PRIVATE in decoder; some sections probably need to be declared CRITICAL; probably some SAVE statements in downstream routines have made them not thread-safe; etc., etc. I'm a neophyte at using OpenMP. Comments, suggestions, and/or tests by others will be welcome! git-svn-id: svn+ssh://svn.code.sf.net/p/wsjt/wsjt/branches/wsjtx@4919 ab8295b8-cf94-4d9e-aec4-7959e3be5d79 --- CMakeLists.txt | 3 +- lib/Makefile.jtsdk | 10 +-- lib/decoder.f90 | 145 ++++--------------------------- lib/downsam9.f90 | 48 +++++++--- lib/filbig.f90 | 15 ++-- lib/jt9.f90 | 7 +- lib/{decode9.f90 => jt9fano.f90} | 4 +- lib/jt9sim.f90 | 2 +- lib/sync9.f90 | 7 -- 9 files changed, 75 insertions(+), 166 deletions(-) rename lib/{decode9.f90 => jt9fano.f90} (95%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a72679ba1..12b41b1d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,8 +241,9 @@ set (wsjt_FSRCS lib/dcoord.f90 lib/decode65a.f90 lib/decode65b.f90 - lib/decode9.f90 + lib/jt9fano.f90 lib/decoder.f90 + lib/decjt9.f90 lib/deg2grid.f90 lib/demod64a.f90 lib/determ.f90 diff --git a/lib/Makefile.jtsdk b/lib/Makefile.jtsdk index 83ea2762e..bc2546bdf 100644 --- a/lib/Makefile.jtsdk +++ b/lib/Makefile.jtsdk @@ -13,7 +13,7 @@ CC = c:/JTSDK/Qt5/Tools/mingw48_32/bin/gcc FC = c:/JTSDK/Qt5/Tools/mingw48_32/bin/gfortran CXX = c:/JTSDK/Qt5/Tools/mingw48_32/bin/g++ -FFLAGS = -O2 -fbounds-check -Wall -fno-second-underscore -DWIN32 +FFLAGS = -O2 -Wall -fno-second-underscore -DWIN32 CFLAGS = -I. -fbounds-check -mno-stack-arg-probe -DWIN32 # Default rules @@ -37,7 +37,7 @@ OBJS1 = pctile.o graycode.o sort.o ssort.o chkmsg.o \ nchar.o four2a.o grid2deg.o pfxdump.o wisdom.o \ symspec.o analytic.o db.o genjt9.o flat1.o smo.o \ packbits.o unpackbits.o encode232.o interleave9.o \ - entail.o fano232.o gran.o sync9.o decode9.o \ + entail.o fano232.o gran.o sync9.o jt9fano.o \ fil3.o decoder.o grid2n.o n2grid.o timer.o \ softsym.o getlags.o afc9.o fchisq.o twkfreq.o downsam9.o \ peakdt9.o symspec2.o stdmsg.o morse.o azdist.o geodist.o \ @@ -47,14 +47,14 @@ OBJS1 = pctile.o graycode.o sort.o ssort.o chkmsg.o \ extract.o fchisq65.o demod64a.o chkhist.o interleave63.o ccf2.o \ move.o indexx.o graycode65.o twkfreq65.o smo121.o \ wrapkarn.o init_rs.o encode_rs.o decode_rs.o gen65.o fil4.o \ - flat3.o polfit.o determ.o baddata.o prog_args.f90 \ - options.o prog_args.o + flat3.o polfit.o determ.o baddata.o prog_args.o \ + options.o fmtmsg.o decjt9.o libjt9.a: $(OBJS1) ar cr libjt9.a $(OBJS1) ranlib libjt9.a -OBJS2 = jt9.o jt9a.o jt9b.o jt9c.o ipcomm.o sec_midn.o usleep.o +OBJS2 = jt9.o jt9a.o jt9b.o jt9c.o ipcomm.o sec_midn.o usleep.o LIBS2 = -L'C:/JTSDK/Qt5/5.2.1/mingw48_32/lib' -lQt5Core jt9.exe: $(OBJS2) libjt9.a $(CXX) -o jt9.exe -static $(OBJS2) $(LIBS2) libjt9.a \ diff --git a/lib/decoder.f90 b/lib/decoder.f90 index 17b4eef85..9488d138a 100644 --- a/lib/decoder.f90 +++ b/lib/decoder.f90 @@ -1,21 +1,13 @@ subroutine decoder(ss,id2) -! Decoder for JT9. - use prog_args include 'constants.f90' real ss(184,NSMAX) - character*22 msg character*20 datetime - real*4 ccfred(NSMAX) - real*4 red2(NSMAX) - logical ccfok(NSMAX) - logical done(NSMAX) logical done65,baddata integer*2 id2(NTMAX*12000) real*4 dd(NTMAX*12000) - integer*1 i1SoftSymbols(207) common/npar/nutc,ndiskdat,ntrperiod,nfqso,newdat,npts8,nfa,nfsplit,nfb, & ntol,kin,nzhsym,nsave,nagain,ndepth,ntxmode,nmode,datetime common/tracer/limtrace,lu @@ -29,10 +21,11 @@ subroutine decoder(ss,id2) if (nagain .eq. 0) then open(13,file=trim(temp_dir)//'/decoded.txt',status='unknown') else - open(13,file=trim(temp_dir)//'/decoded.txt',status='unknown',position='append') + open(13,file=trim(temp_dir)//'/decoded.txt',status='unknown', & + position='append') end if - - open(22,file=trim(temp_dir)//'/kvasd.dat',access='direct',recl=1024,status='unknown') + open(22,file=trim(temp_dir)//'/kvasd.dat',access='direct',recl=1024, & + status='unknown') npts65=52*12000 if(baddata(id2,npts65)) then @@ -48,138 +41,34 @@ subroutine decoder(ss,id2) if(newdat.ne.0) dd(1:npts65)=id2(1:npts65) nf1=nfa nf2=nfb -! if(nmode.eq.65+9) nf2=nfsplit call jt65a(dd,npts65,newdat,nutc,nf1,nf2,nfqso,ntol65,nagain,ndecoded) done65=.true. endif if(nmode.eq.65) go to 800 - nsynced=0 - ndecoded=0 - nsps=0 +! print*,'A' +!!$OMP PARALLEL PRIVATE(id) +!!$OMP SECTIONS - nsps=6912 !Params for JT9-1 - df3=1500.0/2048.0 - - tstep=0.5*nsps/12000.0 !Half-symbol step (seconds) - done=.false. - - nf0=0 - nf1=nfa - if(nmode.eq.65+9) nf1=nfsplit - ia=max(1,nint((nf1-nf0)/df3)) - ib=min(NSMAX,nint((nfb-nf0)/df3)) - lag1=-(2.5/tstep + 0.9999) - lag2=5.0/tstep + 0.9999 - if(newdat.ne.0) then - call timer('sync9 ',0) - call sync9(ss,nzhsym,lag1,lag2,ia,ib,ccfred,red2,ipk) - call timer('sync9 ',1) - endif - - nsps8=nsps/8 - df8=1500.0/nsps8 - dblim=db(864.0/nsps8) - 26.2 - - do nqd=1,0,-1 - limit=5000 - ccflim=3.0 - red2lim=1.6 - schklim=2.2 - if(ndepth.eq.2) then - limit=10000 - ccflim=2.7 - endif - if(ndepth.ge.3 .or. nqd.eq.1) then - limit=100000 - ccflim=2.5 - endif - ccfok=.false. - - if(nqd.eq.1) then - nfa1=nfqso-ntol - nfb1=nfqso+ntol - ia=max(1,nint((nfa1-nf0)/df3)) - ib=min(NSMAX,nint((nfb1-nf0)/df3)) - ccfok(ia:ib)=(ccfred(ia:ib).gt.(ccflim-2.0)) .and. & - (red2(ia:ib).gt.(red2lim-1.0)) - ia1=ia - ib1=ib - else - nfa1=nf1 - nfb1=nfb - ia=max(1,nint((nfa1-nf0)/df3)) - ib=min(NSMAX,nint((nfb1-nf0)/df3)) - do i=ia,ib - ccfok(i)=ccfred(i).gt.ccflim .and. red2(i).gt.red2lim - enddo - ccfok(ia1:ib1)=.false. - endif - - fgood=0. - do i=ia,ib - if(done(i) .or. (.not.ccfok(i))) cycle - f=(i-1)*df3 - if(nqd.eq.1 .or. & - (ccfred(i).ge.ccflim .and. abs(f-fgood).gt.10.0*df8)) then - - if(nqd.eq.0) nfreqs0=nfreqs0+1 - if(nqd.eq.1) nfreqs1=nfreqs1+1 - - call timer('softsym ',0) - fpk=nf0 + df3*(i-1) - - call softsym(id2,npts8,nsps8,newdat,fpk,syncpk,snrdb,xdt, & - freq,drift,schk,i1SoftSymbols) - call timer('softsym ',1) - - sync=(syncpk+1)/4.0 - if(maxval(i1SoftSymbols).eq.0) cycle - if(nqd.eq.1 .and. ((sync.lt.0.5) .or. (schk.lt.2.0))) cycle - if(nqd.ne.1 .and. ((sync.lt.1.0) .or. (schk.lt.schklim))) cycle - - call timer('decode9 ',0) - call decode9(i1SoftSymbols,limit,nlim,msg) - call timer('decode9 ',1) - - if(sync.lt.0.0 .or. snrdb.lt.dblim-2.0) sync=0.0 - nsync=sync - if(nsync.gt.10) nsync=10 - nsnr=nint(snrdb) - ndrift=nint(drift/df3) - - if(msg.ne.' ') then - if(nqd.eq.0) ndecodes0=ndecodes0+1 - if(nqd.eq.1) ndecodes1=ndecodes1+1 - - write(*,1000) nutc,nsnr,xdt,nint(freq),msg -1000 format(i4.4,i4,f5.1,i5,1x,'@',1x,a22) - write(13,1002) nutc,nsync,nsnr,xdt,freq,ndrift,msg -1002 format(i4.4,i4,i5,f6.1,f8.0,i4,3x,a22,' JT9') - - iaa=max(1,i-1) - ibb=min(NSMAX,i+22) - fgood=f - nsynced=1 - ndecoded=1 - ccfok(iaa:ibb)=.false. - done(iaa:ibb)=.true. - call flush(6) - endif - endif - enddo - call flush(6) - if(nagain.ne.0) exit - enddo +!!$OMP SECTION +! print*,'B' + call decjt9(ss,id2,nutc,nfqso,newdat,npts8,nfa,nfsplit,nfb,ntol,nzhsym, & + nagain,ndepth,nmode) +!!$OMP SECTION if(nmode.ge.65 .and. (.not.done65)) then if(newdat.ne.0) dd(1:npts65)=id2(1:npts65) nf1=nfa nf2=nfb +! print*,'C' call jt65a(dd,npts65,newdat,nutc,nf1,nf2,nfqso,ntol65,nagain,ndecoded) endif +!!$OMP END SECTIONS NOWAIT +!!$OMP END PARALLEL +! print*,'D' + ! JT65 is not yet producing info for nsynced, ndecoded. 800 write(*,1010) nsynced,ndecoded 1010 format('',2i4) diff --git a/lib/downsam9.f90 b/lib/downsam9.f90 index 851d497ca..4ae36fa64 100644 --- a/lib/downsam9.f90 +++ b/lib/downsam9.f90 @@ -3,15 +3,22 @@ subroutine downsam9(id2,npts8,nsps8,newdat,nspsd,fpk,c2,nz2) !Downsample from id2() into C2() so as to yield nspsd samples per symbol, !mixing from fpk down to zero frequency. + use, intrinsic :: iso_c_binding + use FFTW3 + include 'constants.f90' - parameter (NMAX1=1024*1920) +! parameter (NMAX1=1024*1920) + parameter (NMAX1=884736) + type(C_PTR) :: plan !Pointers plan for big FFT integer*2 id2(0:8*npts8-1) real*4 x1(0:NMAX1-1) complex c1(0:NMAX1/2) complex c2(0:4096-1) real s(5000) - equivalence (c1,x1) - save + logical first + common/patience/npatience,nthreads + data first/.true./ + save plan,first nfft1=1024*nsps8 !Forward FFT length df1=12000.0/nfft1 @@ -23,14 +30,33 @@ subroutine downsam9(id2,npts8,nsps8,newdat,nspsd,fpk,c2,nz2) x1(i)=fac*id2(i) enddo x1(npts:nfft1-1)=0. !Zero the rest of x1 - call timer('fft_forw',0) - call four2a(c1,nfft1,1,-1,0) !Forward FFT, r2c - call timer('fft_forw',1) + endif - nadd=1.0/df1 + if(first) then + nflags=FFTW_ESTIMATE + if(npatience.eq.1) nflags=FFTW_ESTIMATE_PATIENT + if(npatience.eq.2) nflags=FFTW_MEASURE + if(npatience.eq.3) nflags=FFTW_PATIENT + if(npatience.eq.4) nflags=FFTW_EXHAUSTIVE +! Plan the FFTs just once + plan=fftwf_plan_dft_r2c_1d(nfft1,x1,c1,nflags) + first=.false. + endif + + if(newdat.eq.1) then + fac=6.963e-6 !Why this weird constant? + do i=0,npts-1 + x1(i)=fac*id2(i) + enddo + x1(npts:nfft1-1)=0. !Zero the rest of x1 + call timer('FFTbig9 ',0) + call fftwf_execute_dft_r2c(plan,x1,c1) + call timer('FFTbig9 ',1) + + nadd=int(1.0/df1) s=0. do i=1,5000 - j=(i-1)/df1 + j=int((i-1)/df1) do n=1,nadd j=j+1 s(i)=s(i)+real(c1(j))**2 + aimag(c1(j))**2 @@ -42,7 +68,7 @@ subroutine downsam9(id2,npts8,nsps8,newdat,nspsd,fpk,c2,nz2) nfft2=nfft1/ndown !Backward FFT length nh2=nfft2/2 nf=nint(fpk) - i0=fpk/df1 + i0=int(fpk/df1) nw=100 ia=max(1,nf-nw) @@ -57,9 +83,9 @@ subroutine downsam9(id2,npts8,nsps8,newdat,nspsd,fpk,c2,nz2) if(i.gt.nh2) j=j-nfft2 c2(i)=fac*c1(j) enddo - call timer('fft_back',0) + call timer('FFTsmal9',0) call four2a(c2,nfft2,1,1,1) !FFT back to time domain - call timer('fft_back',1) + call timer('FFTsmal9',1) nz2=8*npts8/ndown return diff --git a/lib/filbig.f90 b/lib/filbig.f90 index 8cd8f81ea..16aad432d 100644 --- a/lib/filbig.f90 +++ b/lib/filbig.f90 @@ -20,7 +20,6 @@ subroutine filbig(dd,npts,f0,newdat,c4a,n4,sq0) real rfilt(NFFT2) !Filter (real) ! integer*8 plan1,plan2,plan3 type(C_PTR) :: plan1,plan2,plan3 !Pointers to FFTW plans - logical first ! include 'fftw3.f90' equivalence (rfilt,cfilt),(rca,ca) @@ -29,7 +28,7 @@ subroutine filbig(dd,npts,f0,newdat,c4a,n4,sq0) 5.89886379,1.59355187,-2.49138308,0.60910773,-0.04248129/ common/refspec/dfref,ref(NSZ) common/patience/npatience,nthreads - save + save first,plan1,plan2,plan3 if(npts.lt.0) go to 900 !Clean up at end of program @@ -54,7 +53,7 @@ subroutine filbig(dd,npts,f0,newdat,c4a,n4,sq0) cfilt(i)=fac*halfpulse(i) cfilt(nfft2+2-i)=fac*halfpulse(i) enddo - call sfftw_execute(plan3) + call fftwf_execute_dft(plan3,cfilt,cfilt) base=real(cfilt(nfft2/2+1)) do i=1,nfft2 @@ -73,7 +72,7 @@ subroutine filbig(dd,npts,f0,newdat,c4a,n4,sq0) nz=min(npts,nfft1) rca(1:nz)=dd(1:nz) rca(nz+1:)=0. - call sfftw_execute(plan1) + call fftwf_execute_dft_r2c(plan1,rca,ca) call timer('FFTbig ',1) call timer('flatten ',0) @@ -124,14 +123,14 @@ subroutine filbig(dd,npts,f0,newdat,c4a,n4,sq0) ! Do the short reverse transform, to go back to time domain. call timer('FFTsmall',0) - call sfftw_execute(plan2) + call fftwf_execute_dft(plan2,c4a,c4a) call timer('FFTsmall',1) n4=min(npts/8,nfft2) return -900 call sfftw_destroy_plan(plan1) - call sfftw_destroy_plan(plan2) - call sfftw_destroy_plan(plan3) +900 call fftwf_destroy_plan(plan1) + call fftwf_destroy_plan(plan2) + call fftwf_destroy_plan(plan3) return end subroutine filbig diff --git a/lib/jt9.f90 b/lib/jt9.f90 index f1b0bb2a0..0f8d8f475 100644 --- a/lib/jt9.f90 +++ b/lib/jt9.f90 @@ -164,8 +164,9 @@ program jt9 999 continue iret=fftwf_export_wisdom_to_filename(wisfile) - - call four2a(a,-1,1,1,1) !Save wisdom and free memory - call filbig(a,-1,1,0.0,0,0,0,0,0) !used for FFT plans + call four2a(a,-1,1,1,1) !Save wisdom and free memory + call filbig(a,-1,1,0.0,0,0,0,0,0) !used for FFT plans + call fftwf_cleanup_threads() + call fftwf_cleanup() end program jt9 diff --git a/lib/decode9.f90 b/lib/jt9fano.f90 similarity index 95% rename from lib/decode9.f90 rename to lib/jt9fano.f90 index 06809eb76..0c1ca042d 100644 --- a/lib/decode9.f90 +++ b/lib/jt9fano.f90 @@ -1,4 +1,4 @@ -subroutine decode9(i1SoftSymbols,limit,nlim,msg) +subroutine jt9fano(i1SoftSymbols,limit,nlim,msg) ! Decoder for JT9 ! Input: i1SoftSymbols(207) - Single-bit soft symbols @@ -82,4 +82,4 @@ subroutine decode9(i1SoftSymbols,limit,nlim,msg) endif return -end subroutine decode9 +end subroutine jt9fano diff --git a/lib/jt9sim.f90 b/lib/jt9sim.f90 index 004281dff..fc56e5c9c 100644 --- a/lib/jt9sim.f90 +++ b/lib/jt9sim.f90 @@ -160,7 +160,7 @@ program jt9sim if(i4.ge.128) i1SoftSymbols(i)=i4-256 enddo limit=1000 - call decode9(i1SoftSymbols,limit,nlim,msg) + call jt9fano(i1SoftSymbols,limit,nlim,msg) if(msg.ne.msg0) print*,'Decode error: ',msg0,' ',msg endif enddo diff --git a/lib/sync9.f90 b/lib/sync9.f90 index 433e11f76..a757d8c7f 100644 --- a/lib/sync9.f90 +++ b/lib/sync9.f90 @@ -1,7 +1,6 @@ subroutine sync9(ss,nzhsym,lag1,lag2,ia,ib,ccfred,red2,ipkbest) include 'constants.f90' -! parameter (NSMAX=1365) !Max length of saved spectra real ss(184,NSMAX) real ss1(184) real ccfred(NSMAX) @@ -20,7 +19,6 @@ subroutine sync9(ss,nzhsym,lag1,lag2,ia,ib,ccfred,red2,ipkbest) do i=ia,ib !Loop over freq range ss1=ss(1:184,i) call pctile(ss1,nzhsym,40,xmed) -! xmed=sum(ss1(1:nzhsym))/nzhsym ss1=ss1/xmed - 1.0 do j=1,nzhsym @@ -66,8 +64,6 @@ subroutine sync9(ss,nzhsym,lag1,lag2,ia,ib,ccfred,red2,ipkbest) do j=1,nzhsym savg(ia:ib)=savg(ia:ib) + ss(j,ia:ib) enddo -! df=1500.0/2048.0 ! 0.732422 -! df9=12000.0/6912.0 ! 1.736111 savg(ia:ib)=savg(ia:ib)/nzhsym smo(0:20)=1.0/21.0 smo(-5:-1)=-(1.0/21.0)*(21.0/10.0) @@ -92,10 +88,7 @@ subroutine sync9(ss,nzhsym,lag1,lag2,ia,ib,ccfred,red2,ipkbest) red2(i)=savg2(i)-ref if(red2(i).lt.-99.0) red2(i)=-99.0 if(red2(i).gt.99.0) red2(i)=99.0 -! write(30,3001) i,i*df+1000.0,savg2(i),red2(i),ccfred(i) -!3001 format(i8,4f10.3) enddo -! call flush(30) return end subroutine sync9