As discussed in my earlier posts FFTW++ is an implicit dealiasing library for turbulence simulations. In this post we will see how to define the wave-vector arrays that correspond to the convolutions that fftw++ computes and how to compute dervatives of a 2D field (density, vorticity or stream function) using ffts.

Setting up the Working Environment

We first need to setup our own wrappers in order to call more advanced fftw++ functions from python. However our aim is not to write a complete wrapper either, just the functionality that we need.

In order to do that we first fork fftw++. Create a directory called libfftwpp in the main directory tree and add the following file there:

CMakeLists.txt

cmake_minimum_required(VERSION 3.10)
project(fftwpp VERSION 0.1 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH} ${CMAKE_SOURCE_DIR}/.. ${CMAKE_SOURCE_DIR}/../mpi ${CMAKE_SOURCE_DIR}/../tests)
add_library (${PROJECT_NAME} SHARED ../fftw++.cc ../mpi/mpifftw++.cc ../convolution.cc ../mpi/mpiconvolution.cc ../mpi/mpitranspose.cc cfftwpp.cc)
target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} fftw3_omp fftw3 m)

which will allow us to compile the fftw as a shared library with functions wrapped into c functions in the files cfftwpp.cc and cfftwpp.h that we need to write. The advantage of this approach is that we can define very specific functions that will do exactly what we want in fftw++ that we can call from python instead of trying to do what we want using python wrapped functions.

The C wrapper

We then write a c wrapper with bunch of functions that we would like to access in the form of a header file:

cfftwpp.h

#ifndef _CFFTWPP_H_
#define _CFFTWPP_H_
#include "Complex.h"
#include <mpi.h>
#include "mpiconvolution.h"

#ifdef  __cplusplus
extern "C" {
#endif
  using namespace utils;
  using namespace fftwpp;
  mpiOptions dfoptions;
  double __complex__ *fftwpp_complexalign(unsigned int n){
    return (double __complex__ *) ComplexAlign(n);
  }
  double *fftwpp_doublealign(unsigned int n){
    return doubleAlign(n);
  }
  void fftwpp_mpi_set_options(int a,int alltoall, unsigned int threads, unsigned int verbose){
    dfoptions=mpiOptions(a,alltoall,threads,verbose);
  }
  rcfft2dMPI* fftwpp_plan_rcfft2dmpi(split &df,split &dg,double *f,double __complex__ *g){
    return new rcfft2dMPI(df,dg,f,(Complex *)g,dfoptions);
  }
  void fftwpp_rcfft2dmpi_forward0(rcfft2dMPI *plan, double *f, double __complex__ *g){
    plan->Forward0(f,(Complex *)g);
  }
  void fftwpp_rcfft2dmpi_backward0(rcfft2dMPI *plan, double __complex__ *g, double *f){
    plan->Backward0((Complex *)g,f);
  }
  MPIgroup * fftwpp_mpi_group(unsigned int nyp,MPI_Comm &comm){
    MPIgroup *grp = new MPIgroup(comm,nyp);
    return grp;
  }
  split *fftwpp_mpi_split(unsigned int nx,unsigned int nyp, MPI_Comm &comm){
    return new split(nx,nyp,comm);
  }
  ImplicitHConvolution2MPI* fftwpp_create_hconv2d_mpi(split &dg,split &du,
						      bool xcomp, bool ycomp, double __complex__ *g){
    unsigned int nx=dg.X,ny=(dg.Y-1)*2;
    unsigned int nyp=ny/2+1;
    unsigned int mx=(nx+1)/2;
    unsigned int my=(ny+1)/2;
    return new ImplicitHConvolution2MPI(mx,my,xcomp,ycomp,dg,du,(Complex *)g,dfoptions);
  }
  void fftwpp_hconv2d_mpi_convolve(ImplicitHConvolution2MPI* hconv, double __complex__ *f, double __complex__ *g) {
    Complex *G[]={(Complex *)f,(Complex *)g};
    hconv->convolve(G,multbinary);
  }

#ifdef  __cplusplus
  }
#endif
#endif

and an acoompanying c file that will basically add this header into the library:

cfftwpp.cc

#include "Complex.h"
#include "cfftwpp.h"
#include <iostream>

extern "C" {
using namespace utils;
using namespace fftwpp;
  /*...*/
}

as you may have noticed these functions are mostly MPI functions. This is mainly because the basic interface is already wrapped, and also these are the functions that we need in order to develop a parallel pseudospectral solver. Here we focus on 2D.

Of course one should see the above file as a starting point and add whatever one wants to use from the fftw++ library in this file. If we create a python file (in the same directory) that can call these functions, we need something like:

test.py

import numpy as np
import os
from mpi4py import MPI
from numpy.ctypeslib import ndpointer
from ctypes import CDLL,CFUNCTYPE,POINTER,c_double,c_int,c_uint,c_void_p,c_bool,Structure,byref
base = os.path.dirname(os.path.abspath(__file__))
clib = CDLL(os.path.join(base, 'libfftwpp.so'))

class split(Structure):
  _fields_=[("X",c_uint),("Y",c_uint)
            ,("x",c_uint),("y",c_uint)
            ,("x0",c_uint),("y0",c_uint)
            ,("n",c_uint)
            ]

class group(Structure):
  _fields_=[("rank",c_int),("size",c_int)
            ]

clib.fftwpp_mpi_set_options.argtypes = [c_int,c_int,c_uint,c_uint]

clib.fftwpp_plan_rcfft2dmpi.argtypes = [c_void_p, c_void_p, ndpointer(dtype = float),ndpointer(dtype = np.complex128)]
clib.fftwpp_plan_rcfft2dmpi.restype = c_void_p

clib.fftwpp_rcfft2dmpi_forward0.argtypes = [c_void_p,ndpointer(dtype = float),ndpointer(dtype = np.complex128)]
clib.fftwpp_rcfft2dmpi_backward0.argtypes = [c_void_p,ndpointer(dtype = np.complex128),ndpointer(dtype = float)]

clib.fftwpp_mpi_group.restype = c_void_p
clib.fftwpp_mpi_group.argtypes = [c_uint, c_void_p]

clib.fftwpp_mpi_split.restype = c_void_p
clib.fftwpp_mpi_split.argtypes = [c_uint, c_uint, c_void_p]

clib.fftwpp_create_hconv2d_mpi.restype = c_void_p
clib.fftwpp_create_hconv2d_mpi.argtypes=[c_void_p,c_void_p,c_bool, c_bool,ndpointer(dtype = np.complex128)]

clib.fftwpp_hconv2d_mpi_convolve.argtypes = [ c_void_p,
                                          ndpointer(dtype = np.complex128),
                                          ndpointer(dtype = np.complex128) ]

clib.fftwpp_doublealign.argtypes=[c_uint]
clib.fftwpp_doublealign.restype=POINTER(c_double)

clib.fftwpp_complexalign.argtypes=[c_uint]
clib.fftwpp_complexalign.restype=POINTER(c_double)

with these definitions, we can actually reproduce the mpi example fft2rconv:

comm=MPI.COMM_WORLD
commp=MPI._addressof(comm)

nx,ny=8,8
nyp=int(ny/2)+1
mx=int((nx+1)/2);
my=int((ny+1)/2);

grpptr=clib.fftwpp_mpi_group(nyp,commp)
grp=group.from_address(grpptr)

dfptr=clib.fftwpp_mpi_split(nx,ny,grpptr)
print("grp.size=",grp.size,"grp.rank=",grp.rank)
#print("comm.size=",comm.size)
dgptr=clib.fftwpp_mpi_split(nx,nyp,grpptr)
duptr=clib.fftwpp_mpi_split(mx,nyp,grpptr)
df=split.from_address(dfptr)
dg=split.from_address(dgptr)

if (grp.rank<grp.size):
    print("rank=",comm.rank,",dg.X=",dg.X,"dg.x=",dg.x,"dg.Y=",dg.Y,"dg.y=",dg.y,"dg.n=",dg.n)
    print("rank=",comm.rank,",df.X=",df.X,"df.x=",df.x,"df.Y=",df.Y,"df.y=",df.y,"df.n=",df.n)
    f0=np.ctypeslib.as_array(clib.fftwpp_doublealign(df.n),shape=(df.x,df.Y))
    f1=np.ctypeslib.as_array(clib.fftwpp_doublealign(df.n),shape=(df.x,df.Y))
    #not perfectly clear if we must use dg.n instead.
    g0=np.ctypeslib.as_array(clib.fftwpp_complexalign(2*dg.X*dg.y),shape=(2*dg.X*dg.y,)).view(dtype=complex).reshape((dg.X,dg.y))
    g1=np.ctypeslib.as_array(clib.fftwpp_complexalign(2*dg.X*dg.y),shape=(2*dg.X*dg.y,)).view(dtype=complex).reshape((dg.X,dg.y))
    plan=clib.fftwpp_plan_rcfft2dmpi(dfptr,dgptr,f0,g0)
    cptr=clib.fftwpp_create_hconv2d_mpi(dgptr,duptr,False,False,g0)
    f0[:]=np.array([[j+i+df.x0 for j in range(df.Y)] for i in range(df.x)],dtype=float)
    f1[:]=np.array([[j+i+df.x0 for j in range(df.Y)] for i in range(df.x)],dtype=float)
    f0c=f0.copy()
    clib.fftwpp_rcfft2dmpi_forward0(plan,f0,g0)
    clib.fftwpp_rcfft2dmpi_forward0(plan,f1,g1)
    clib.fftwpp_hconv2d_mpi_convolve(cptr,g0,g1)
    h=g0
    print("h=",h)
    clib.fftwpp_rcfft2dmpi_backward0(plan,g0,f0)
    print("f0=",f0/df.X/df.Y)

The issue of splitting

Now interestingly enough, fftw++ seems to use a different array splitting scheme than the usual. Below figure shows how a 5 element array is split between 4 nodes for the usual methods (used for example in fftw) vs. the one used in fftw++.

mpi_split

the two different algorithms can be written explictly as:

splitmpi.py

def splitmpi(N,rank,size):
    nperpe = int(N/size)
    nrem = N - size*nperpe
    local_N = nperpe+(rank < nrem)
    loc_base = rank*nperpe+min(rank,nrem)
    return local_N,loc_base


def ceilquotient(a,b):
    return int((a+b-1)/b)

def localdimension(N, rank, size):
    n=ceilquotient(N,size)
    start=n*rank
    extra=N-start
    if(extra < 0):
        extra=0
    if(n > extra):
        n=extra
    return n,start