!* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! MPI - Two-Dimensional Fast Fourier Transform - Fortran Version ! FILE: mpi_2dfft.f ! OTHER FILES: ser_fft.f timing_fgettod.c ! DESCRIPTION: ! A 512x512 complex matrix is initialized with a point source ! and then decomposed and distributed to each processor in the ! partition (by rows for C; by columns for Fortran). Each ! processor then performs a one-dimensional FFT on it's portion ! of the matrix which is stored locally. Each processor transposes ! it's portion of the matrix and performs an "All to all" distribution ! to the other processors in the partition. This now partitions the ! intermediate matrix by columns for C; rows for Fortran. Each ! processor then performs a one-dimensional FFT on it's portion of ! the matrix. Finally, the columns/rows of the matrix are gathered ! back at the destination processor and timing and Mflop results ! are displayed. C programs perform an additional test for correctness. ! ! Note: A straightforward unsophisticated 1D FFT kernel is used. It is ! sufficient to convey the general idea, but be aware that there are ! better 1D FFTs available on many systems. ! ! AUTHOR: 06/09/96 Blaise Barney - adapted from George Gusciora's MPI C ! and William Smith's PVM Fortran versions. ! LAST REVISED: !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * program mpi_2dfft include 'mpif.h' external timing_fgettod real PI parameter (PI = 3.141592) integer IMAGE_SIZE,IMAGE_SLICE,NUM_CELLS,SOURCE_PROCESSOR, & DEST_PROCESSOR,POW parameter (IMAGE_SIZE = 512) ! matrix order n X n parameter (NUM_CELLS = 4) ! also number of processes parameter (IMAGE_SLICE = (IMAGE_SIZE / NUM_CELLS)) parameter (SOURCE_PROCESSOR = 0) parameter (DEST_PROCESSOR = 0) parameter (POW = 9) ! power of 2 of matrix order n ! a and b are whole matrix size. a_slice and b_slice are used for ! processing a group of columns. Note: a_slice(SIZE, SLICE) ! b_slice(SLICE, SIZE). a_chunk and b_chunk are used for holding pieces ! when doing the transpose complex a(IMAGE_SIZE,IMAGE_SIZE) ! input matrix complex a_slice(IMAGE_SIZE,IMAGE_SLICE) complex a_chunks(IMAGE_SLICE,IMAGE_SLICE,NUM_CELLS) complex b(IMAGE_SIZE,IMAGE_SIZE) ! intermediate matrix complex b_slice(IMAGE_SLICE,IMAGE_SIZE) complex b_chunks(IMAGE_SLICE,IMAGE_SLICE,NUM_CELLS) complex w_common(IMAGE_SIZE/2) ! twiddle factors integer n ! FFT size integer logn ! log base 2 of n integer nx ! used to compute logn real rn ! real value of n integer i,j ! index variables integer offset ! offset index integer numtasks ! Number of processors integer taskid ! ID number for each processor integer ierr ! return code integer errors,sign ! used for error checking integer cell ! variable for other task numbers ! timing variables integer times1(2), times2(2) ! times from timing_fgettod integer tmpt ! temporary time calc variable real t(5) ! array for holding the timings real sum ! sum of timings integer flops ! total number of floating point ops real mflops ! Mflops/s ! Initialize MPI environment and get task's ID and number of tasks in ! the partition. call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD, taskid, ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD, numtasks, ierr) print *, 'MPI task ID = ', taskid !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! Note: IMAGE_SIZE and POW constants are related. POW must be the power ! of two represented by IMAGE_SIZE. If you change the predefined constant ! IMAGE_SIZE, then you must correspondingly change the predefined constant ! POW. !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * n = IMAGE_SIZE logn = POW rn = REAL(n) ! If parent process initialize input matrix with a centered point source if (taskid .eq. SOURCE_PROCESSOR) then do i=1, n do j=1, n a(i,j) = (0.0, 0.0) enddo enddo a(n/2+1,n/2+1) = (rn, rn) ! print table headings in anticipation of timing results print *, ' ' print *, '512 x 512 2D FFT ' print *, ' Timings(secs)' print *, ' scatter 1D-FFT-row transpose', & ' 1D-FFT-col gather Total' endif ! precompute the complex constants (twiddle factors) for the 1D FFTs */ do i = 1, n/2-1 j = i - 1 w_common(i) = (cos((2.0*PI*j)/rn), -sin((2.0*PI*j)/rn)) enddo !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! Distribute Input Image by Columns !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * call timing_fgettod(%REF(times1)) call MPI_SCATTER(a, IMAGE_SLICE*IMAGE_SIZE, MPI_COMPLEX, & a_slice, IMAGE_SLICE*IMAGE_SIZE, MPI_COMPLEX, & SOURCE_PROCESSOR, MPI_COMM_WORLD, ierr) call timing_fgettod(%REF(times2)) tmpt = ((times2(1) - times1(1))*1000000) + times2(2) - times1(2) t(1) = REAL(tmpt)/1000000.0 !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! Perform 1-D Row FFTs !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * call timing_fgettod(%REF(times1)) do i = 1, IMAGE_SLICE call fft(a_slice(1,i), w_common, n, logn) enddo call timing_fgettod(%REF(times2)) tmpt = ((times2(1) - times1(1))*1000000) + times2(2) - times1(2) t(2) = REAL(tmpt)/1000000.0 !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! Transpose 2-D matrix !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * call timing_fgettod(%REF(times1)) do cell = 1, NUM_CELLS offset = (cell - 1) * IMAGE_SLICE do i = 1, IMAGE_SLICE do j = 1, IMAGE_SLICE a_chunks(i,j,cell) = a_slice(i + offset,j) enddo enddo enddo call MPI_ALLTOALL(a_chunks, IMAGE_SLICE*IMAGE_SLICE, MPI_COMPLEX, & b_slice, IMAGE_SLICE*IMAGE_SLICE, MPI_COMPLEX, & MPI_COMM_WORLD, ierr) ! Finally make the rows look like columns so they're together in memory. do i = 1, IMAGE_SIZE do j = 1, IMAGE_SLICE a_slice(i,j) = b_slice(j,i) enddo enddo call timing_fgettod(%REF(times2)) tmpt = ((times2(1) - times1(1))*1000000) + times2(2) - times1(2) t(3) = REAL(tmpt)/1000000.0 !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! Then do another set of row FFTs !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * call timing_fgettod(%REF(times1)) do i = 1, IMAGE_SLICE call fft(a_slice(1,i), w_common, n, logn) enddo call timing_fgettod(%REF(times2)) tmpt = ((times2(1) - times1(1))*1000000) + times2(2) - times1(2) t(4) = REAL(tmpt)/1000000.0 !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ! Transpose the matrix back. Gather slices from other tasks, then transpose !* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * call timing_fgettod(%REF(times1)) call MPI_GATHER(a_slice, IMAGE_SLICE*IMAGE_SIZE, MPI_COMPLEX, & a, IMAGE_SLICE*IMAGE_SIZE, MPI_COMPLEX, & DEST_PROCESSOR, MPI_COMM_WORLD, ierr) if (taskid .eq. DEST_PROCESSOR) then do i=1, n do j=1, n b(i,j) = a(j,i) enddo enddo endif call timing_fgettod(%REF(times2)) tmpt = ((times2(1) - times1(1))*1000000) + times2(2) - times1(2) t(5) = REAL(tmpt)/1000000.0 ! Calculate total time - then print all timing events with total sum = 0 do i=1, 5 sum = sum + t(i) enddo write(*,100) taskid,t(1),t(2),t(3),t(4),t(5),sum 100 format('cell ',i1,': ',f10.6,f10.6,f10.6,f10.6,f10.6,f10.6) if (taskid .eq. DEST_PROCESSOR) then ! Calculate mflops flops = (n*n*logn)*10 mflops = real(flops/1000000.0) mflops = mflops/sum print *, 'Total Mflops= ',mflops endif call MPI_FINALIZE(ierr) end