/*
* Copyright (c) 2010-2013 Michael Pippig
*
* This file is part of PFFT.
*
* PFFT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PFFT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with PFFT. If not, see .
*
*/
#include "pfft.h"
#include "ipfft.h"
#include "util.h"
/* Global infos about procmesh are only enabled in debug mode
* Otherwise we do not use any global variables. */
#if PFFT_DEBUG_GVARS
extern MPI_Comm *gdbg_comm_cart;
extern int gdbg_rnk_pm;
extern MPI_Comm *gdbg_comms_pm;
#endif
#if PFFT_DEBUG_SERTRAFO
#define PFFT_DEBUG_SERTRAFO_PTR00 , &(ths0->dbg[0])
#define PFFT_DEBUG_SERTRAFO_PTR01 , &(ths0->dbg[1])
#define PFFT_DEBUG_SERTRAFO_PTR10 , &(ths1->dbg[0])
#define PFFT_DEBUG_SERTRAFO_PTR11 , &(ths1->dbg[1])
#else
#define PFFT_DEBUG_SERTRAFO_PTR00
#define PFFT_DEBUG_SERTRAFO_PTR01
#define PFFT_DEBUG_SERTRAFO_PTR10
#define PFFT_DEBUG_SERTRAFO_PTR11
#endif
static sertrafo_plan plan_sertrafo_p(
INT nb, int rnk, const INT *n, INT howmany,
R *in0, R *out0, R *in1, R *out1,
int sign, const X(r2r_kind) *kinds,
unsigned trafo_flag, unsigned transp_flag,
unsigned opt_flag, unsigned fftw_flags,
double *time);
static sertrafo_plan plan_sertrafo_pt(
INT nb, int rnk, const INT *n, INT howmany,
R *in0, R *out0, R *in1, R *out1, int sign,
const X(r2r_kind) *kinds, unsigned trafo_flag,
unsigned transp_flag0, unsigned transp_flag1,
unsigned opt_flag, unsigned fftw_flags,
double *time);
static sertrafo_plan plan_remap_only(
INT nb, int rnk, const INT *n, INT howmany, R *in, R *out,
unsigned trafo_flag, unsigned transp_flag, unsigned fftw_flags);
static void plan_trafo(PX(fftw_plan) * plan,
INT nb, int rnk, const INT *n, INT howmany,
R *in, R *out, int sign, const X(r2r_kind) *kinds,
unsigned trafo_flag, unsigned transp_flag, unsigned fftw_flags
#if PFFT_DEBUG_SERTRAFO
, sertrafo_dbg *dbg_ptr
#endif
);
static void plan_remap(PX(fftw_plan) * plan,
INT nb, int rnk, const INT *n, INT howmany, R *in, R *out,
unsigned trafo_flag, unsigned transp_flag,
unsigned fftw_flags, unsigned ioarray_flag
#if PFFT_DEBUG_SERTRAFO
, sertrafo_dbg *dbg_ptr
#endif
);
static void malloc_and_fill_dims_trafo(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag, unsigned transp_flag,
int *dims_rnk_ptr, X(iodim64) **dims_ptr,
int *howmany_rnk_ptr, X(iodim64) **howmany_dims_ptr);
static void malloc_and_fill_dims_remap(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag, unsigned transp_flag, unsigned ioarray_flag,
int *howmany_rnk_ptr, X(iodim64) **howmany_dims_ptr);
static void malloc_and_calculate_strides_remap(
INT nb, int rnk, const INT *pn, INT howmany,
unsigned transp_flag,
INT **sz_ptr, INT **is_ptr, INT **os_ptr);
static void malloc_and_calculate_strides_trafo(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag, unsigned transp_flag,
INT **sz_ptr, INT **is_ptr, INT **os_ptr);
static void exchange_INT(
INT *a, INT *b);
static double measure_time(
X(plan) plan);
static sertrafo_plan sertrafo_mkplan(
void);
static int needs_transpose(
unsigned transp_flag);
#if PFFT_DEBUG_SERTRAFO
static sertrafo_dbg sertrafo_mkdbg(
int is_fft, INT nb, int rnk, const INT *n, INT howmany,
R *in, R *out,
unsigned trafo_flag, unsigned transp_flag, unsigned fftw_flags, unsigned ioarray_flag,
int dims_rnk, const X(iodim64) *dims, int howmany_rnk, const X(iodim64) *howmany_dims);
void sertrafo_rmdbg(
sertrafo_dbg ths);
static void print_vec(
int rnk, INT *vec);
static void print_dbg(
sertrafo_dbg ths);
INT calc_n_total(
sertrafo_dbg ths);
#endif
/* input array:
* nb x n0 x n1 x ... x nr x howmany (default, PFFT_TRANSPOSED_NONE)
* n0 x nb x n1 x ... x nr x howmany (PFFT_TRANSPOSED_IN)
* transformed output array:
* nb x N0 x N1 x ... x Nr x howmany (default, PFFT_TRANSPOSED_NONE)
* N0 x nb x N1 x ... x Nr x howmany (PFFT_TRANSPOSED_OUT)
* !!! first dimension 'nb' will not be transformed !!!
* - trafo can be c2c, r2c, c2r, r2r
* - special attention for r2c, c2r padding
* - r2r trafo needs r2r_kind for every dimension
* (different boundary conditions possible)
* - sign only needed for c2c, otherwise sign==0
* - kind only needed for r2r, otherwise kind==NULL
*
* If trafo_flag 'PFFTI_TRAFO_SKIP' is set, we execute only the
* local transpose on the input array, but ommit the transform.
* We use the transform type (C2C, R2C, C2R, R2R) to determine
* the data type of the input array (and the correct strides).
*
* If trafo_flag 'PFFTI_TRAFO_SKIP' AND transp_flag 'PFFT_TRANSPOSE_NONE'
* are set, we still copy the input for out-of-place plans.
*
* If trafo_flag 'PFFTI_TRAFO_PHANTOM' is set, we return a NULL pointer.
* Sometimes its easier to call the planner with PFFTI_TRAFO_PHANTOM than
* to handle the special case were nothing should be done.
*
* You can use io_flags for detailed influence on the algorithms:
* PFFT_BUFFERED_INPLACE: similar to a out-of-place plan whose outputs go back to 'in'
* PFFT_DESTROY_INPUT: out-of-place plan are allowed to overwrite the inputs (disabled on default)
* PFFT_PRESERVE_INPUT: cancels PFFT_DESTROY_INPUT
* */
INT PX(local_size_sertrafo)(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag
)
{
INT *pn, mem;
pn = PX(malloc_INT)(rnk);
PX(physical_dft_size)(rnk, n, trafo_flag,
pn);
mem = nb * PX(prod_INT)(rnk, pn) * howmany;
free(pn);
return mem;
}
/* FFTW sometimes produces very slow plans for 1dFFT with different is and os.
* Therefore we replace the FFTW 1dFFT by two FFTW plans, one 1d-FFT and one local_transposition.
* We can choose which one of these plans comes first, which one works out-of-place
* and which one has different is and os. This results in at most 8 different
* plans which we compare by time. */
/* DEFAULT SETTINGS:
*
* R2C FFTs can not combine FFT and transposition.
* => compute FFT and transposition in two different steps
*
* We want stride-1-FFTs to be the default whenever possible.
* => T_OUT: perform FFT before local transposition (maybe input has stride 1)
* T_NONE : perform FFT first, no transposition needed (stride 1)
* T_IN: perform local transposition before FFT (maybe the output has stride 1)
* T_IN and T_OUT: perform FFT as second step, no transposition needed (stride 1 not possible)
*
* For symmetry reasons we want the default of a T_IN trafo to be symmetric to a T_OUT trafo.
* => T_OUT: perform first step out-of-place
* T_IN: perform second step out-of-place
* If no transposition is needed, we want the FFT to be out-place.
* => T_NONE: perform first step out-of-place
* T_IN and T_OUT: perform second step out-of-place
*
* However, if PFFT_TUNE is set all the other options are also checked by the planner.
* */
/* At first we decide which plans work in-place or out-of-place. */
sertrafo_plan PX(plan_sertrafo)(
INT nb, int rnk, const INT *n, INT howmany,
R *in, R *out, int sign, const X(r2r_kind) *kinds,
unsigned trafo_flag, unsigned transp_flag, unsigned io_flag,
unsigned opt_flag, unsigned fftw_flags
)
{
double time0, time1;
sertrafo_plan ths0=NULL, ths1=NULL;
sertrafo_plan result;
if(trafo_flag & PFFTI_TRAFO_PHANTOM)
return NULL;
if(trafo_flag & PFFTI_TRAFO_SKIP) {
result = plan_remap_only(
nb, rnk, n, howmany, in, out,
trafo_flag, transp_flag, fftw_flags);
return result;
}
/* try all allowed sequences of in-place and out-of-place plans */
if(in == out) { /* in-place */
ths0 = plan_sertrafo_p(
nb, rnk, n, howmany, in, in, in, in, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time0);
/* assure that plan0 wins, no plan1 possible */
time1 = time0 * 10.0;
} else { /* in != out */
if( io_flag & PFFT_BUFFERED_INPLACE ) { /* out-of-place with outputs back in input array */
ths0 = plan_sertrafo_p(
nb, rnk, n, howmany, in, out, out, in, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time0);
/* assure that plan0 wins, if plan1 is not possible */
time1 = time0 * 10.0;
/* skip extra plan for PFFT_NO_TUNE */
if( opt_flag & PFFT_TUNE ) {
ths1 = plan_sertrafo_p(
nb, rnk, n, howmany, in, in, in, in, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time1);
}
} else { /* out-of-place */
if(transp_flag & PFFT_TRANSPOSED_IN){
/* input does not have stride-1, transpose first */
/* default: perform second step out-of-place */
ths0 = plan_sertrafo_p(
nb, rnk, n, howmany, in, in, in, out, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time0);
/* assure that plan0 wins, if plan1 is not possible */
time1 = time0 * 10.0;
/* skip extra plan for PFFT_NO_TUNE */
if( (~io_flag & PFFT_PRESERVE_INPUT) && (opt_flag & PFFT_TUNE) )
/* compare to first step out-of-place */
ths1 = plan_sertrafo_p(
nb, rnk, n, howmany, in, out, out, out, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time1);
} else { /* input not transposed */
/* default: perform first step out-of-place */
ths0 = plan_sertrafo_p(
nb, rnk, n, howmany, in, out, out, out, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time0);
/* assure that plan0 wins, if plan1 is not possible */
time1 = time0 * 10.0;
/* compare to second step out-of-place */
/* skip extra plan for PFFT_NO_TUNE */
if( (~io_flag & PFFT_PRESERVE_INPUT) && (opt_flag & PFFT_TUNE) )
ths1 = plan_sertrafo_p(
nb, rnk, n, howmany, in, in, in, out, sign, kinds,
trafo_flag, transp_flag, opt_flag, fftw_flags,
&time1);
}
}
}
/* choose best plan, delete the other one */
if(time1 < time0){
PX(sertrafo_rmplan)(ths0);
return ths1;
} else{
PX(sertrafo_rmplan)(ths1);
return ths0;
}
}
/* only perform remap */
static sertrafo_plan plan_remap_only(
INT nb, int rnk, const INT *n, INT howmany, R *in, R *out,
unsigned trafo_flag, unsigned transp_flag, unsigned fftw_flags
)
{
sertrafo_plan ths=NULL;
ths = sertrafo_mkplan();
#if PFFT_DEBUG_SERTRAFO
plan_remap(&ths->plan[0],
nb, rnk, n, howmany, in, out, trafo_flag, transp_flag, fftw_flags, PFFTI_ARRAY_INPUT,
&ths->dbg[0]);
#else
plan_remap(&ths->plan[0],
nb, rnk, n, howmany, in, out, trafo_flag, transp_flag, fftw_flags, PFFTI_ARRAY_INPUT);
#endif
ths->plan[1].plan = NULL;
return ths;
}
/* Decide if FFT and local transposition are performed at once or in two steps.
* default: calculate trafo and transposition in two different steps,
* since R2C trafo can not combine trafo and transposition */
static sertrafo_plan plan_sertrafo_p(
INT nb, int rnk, const INT *n, INT howmany,
R *in0, R *out0, R *in1, R *out1,
int sign, const X(r2r_kind) *kinds,
unsigned trafo_flag, unsigned transp_flag,
unsigned opt_flag, unsigned fftw_flags,
double *time
)
{
unsigned transp_flag0, transp_flag1;
double time0, time1;
sertrafo_plan ths0=NULL, ths1=NULL;
if(transp_flag & PFFT_TRANSPOSED_IN){
/* default: transpose during 1st plan */
transp_flag0 = transp_flag;
transp_flag1 = (transp_flag & PFFT_TRANSPOSED_OUT) ?
PFFT_TRANSPOSED_IN | PFFT_TRANSPOSED_OUT : PFFT_TRANSPOSED_NONE;
} else {
/* default: transpose during 2nd plan */
transp_flag0 = (transp_flag & PFFT_TRANSPOSED_IN) ?
PFFT_TRANSPOSED_IN | PFFT_TRANSPOSED_OUT : PFFT_TRANSPOSED_NONE;
transp_flag1 = transp_flag;
}
ths0 = plan_sertrafo_pt(
nb, rnk, n, howmany, in0, out0, in1, out1, sign, kinds,
trafo_flag, transp_flag0, transp_flag1, opt_flag, fftw_flags,
&time0);
/* assure that plan0 wins, if plan1 is not possible */
time1 = time0 * 10.0;
/* try this plan only for altered strides */
/* skip extra plan for PFFT_NO_TUNE */
if( (opt_flag & PFFT_TUNE) && needs_transpose(transp_flag) ){
if(transp_flag & PFFT_TRANSPOSED_IN){
/* compare to: transpose during 2nd plan */
transp_flag0 = (transp_flag & PFFT_TRANSPOSED_IN) ?
PFFT_TRANSPOSED_IN | PFFT_TRANSPOSED_OUT : PFFT_TRANSPOSED_NONE;
transp_flag1 = transp_flag;
} else {
/* compare to: transpose during 1st plan */
transp_flag0 = transp_flag;
transp_flag1 = (transp_flag & PFFT_TRANSPOSED_OUT) ?
PFFT_TRANSPOSED_IN | PFFT_TRANSPOSED_OUT : PFFT_TRANSPOSED_NONE;
}
ths1 = plan_sertrafo_pt(
nb, rnk, n, howmany, in0, out0, in1, out1, sign, kinds,
trafo_flag, transp_flag0, transp_flag1, opt_flag, fftw_flags,
&time1);
}
/* choose best plan, delete the other one */
if(time1 < time0){
*time = time1;
PX(sertrafo_rmplan)(ths0);
return ths1;
} else {
*time = time0;
PX(sertrafo_rmplan)(ths1);
return ths0;
}
}
static int needs_transpose(
unsigned transp_flag
)
{
/* no transposition flag set */
if(transp_flag == PFFT_TRANSPOSED_NONE)
return 0;
/* no transposition needed, if both flags are set */
if(transp_flag & PFFT_TRANSPOSED_IN)
if(transp_flag & PFFT_TRANSPOSED_OUT)
return 0;
/* need to transpose, if only one flag is set */
return 1;
}
/* Decide if FFT or remap is executed first:
* default: compute FFT first and remap second. */
static sertrafo_plan plan_sertrafo_pt(
INT nb, int rnk, const INT *n, INT howmany,
R *in0, R *out0, R *in1, R *out1, int sign,
const X(r2r_kind) *kinds, unsigned trafo_flag,
unsigned transp_flag0, unsigned transp_flag1,
unsigned opt_flag, unsigned fftw_flags,
double *time
)
{
double time0=0.0, time1=0.0;
sertrafo_plan ths0=NULL, ths1=NULL;
ths0 = sertrafo_mkplan();
if(transp_flag0 & PFFT_TRANSPOSED_IN){
/* default: perform remap first, fft second */
plan_remap(&ths0->plan[0],
nb, rnk, n, howmany, in0, out0, trafo_flag, transp_flag0, fftw_flags, PFFTI_ARRAY_INPUT
PFFT_DEBUG_SERTRAFO_PTR00);
plan_trafo(&ths0->plan[1],
nb, rnk, n, howmany, in1, out1, sign, kinds, trafo_flag, transp_flag1, fftw_flags
PFFT_DEBUG_SERTRAFO_PTR01);
} else {
/* default: perform FFT first, remap second */
plan_trafo(&ths0->plan[0],
nb, rnk, n, howmany, in0, out0, sign, kinds, trafo_flag, transp_flag0, fftw_flags
PFFT_DEBUG_SERTRAFO_PTR00);
plan_remap(&ths0->plan[1],
nb, rnk, n, howmany, in1, out1, trafo_flag, transp_flag1, fftw_flags, PFFTI_ARRAY_OUTPUT
PFFT_DEBUG_SERTRAFO_PTR01);
}
/* only measure times if we need them for comparison to plan1 */
if(opt_flag & PFFT_TUNE)
time0 = measure_time(ths0->plan[0].plan) + measure_time(ths0->plan[1].plan);
/* this plan differs for altered strides or out-of-place */
ths1 = sertrafo_mkplan();
if( opt_flag & PFFT_TUNE ){ /* skip extra plan for PFFT_NO_TUNE */
if( (in0 != out1)
|| needs_transpose(transp_flag0)
|| needs_transpose(transp_flag1)
)
{
if(transp_flag0 & PFFT_TRANSPOSED_IN){
/* default: perform remap first, fft second */
plan_trafo(&ths1->plan[0],
nb, rnk, n, howmany, in0, out0, sign, kinds, trafo_flag, transp_flag0, fftw_flags
PFFT_DEBUG_SERTRAFO_PTR10);
plan_remap(&ths1->plan[1],
nb, rnk, n, howmany, in1, out1, trafo_flag, transp_flag1, fftw_flags, PFFTI_ARRAY_OUTPUT
PFFT_DEBUG_SERTRAFO_PTR11);
} else {
/* compare to: perform remap first, FFT second */
plan_remap(&ths1->plan[0],
nb, rnk, n, howmany, in0, out0, trafo_flag, transp_flag0, fftw_flags, PFFTI_ARRAY_INPUT
PFFT_DEBUG_SERTRAFO_PTR10);
plan_trafo(&ths1->plan[1],
nb, rnk, n, howmany, in1, out1, sign, kinds, trafo_flag, transp_flag1, fftw_flags
PFFT_DEBUG_SERTRAFO_PTR11);
}
time1 = measure_time(ths1->plan[0].plan) + measure_time(ths1->plan[1].plan);
}
}
if( (opt_flag & PFFT_TUNE) && (time1 < time0) ){
*time = time1;
PX(sertrafo_rmplan)(ths0);
return ths1;
} else {
*time = time0;
PX(sertrafo_rmplan)(ths1);
return ths0;
}
}
static void plan_trafo(
PX(fftw_plan) * plan,
INT nb, int rnk, const INT *n, INT howmany,
R *in, R *out, int sign, const X(r2r_kind) *kinds,
unsigned trafo_flag, unsigned transp_flag, unsigned fftw_flags
#if PFFT_DEBUG_SERTRAFO
, sertrafo_dbg *dbg_ptr
#endif
)
{
int dims_rnk, howmany_rnk;
X(iodim64) *dims, *howmany_dims;
/* R2C can not combine trafo and transposition */
if( (trafo_flag & PFFTI_TRAFO_RDFT) && needs_transpose(transp_flag) ) {
plan->plan = NULL;
return;
}
/* R2R can not combine trafo and transposition */
if( (trafo_flag & PFFTI_TRAFO_R2R) && needs_transpose(transp_flag) ) {
plan->plan = NULL;
return;
}
malloc_and_fill_dims_trafo(
nb, rnk, n, howmany, trafo_flag, transp_flag,
&dims_rnk, &dims, &howmany_rnk, &howmany_dims);
/* choose appropriate fftw planner for trafo */
if(trafo_flag & PFFTI_TRAFO_R2C){
plan->plan = X(plan_guru64_dft_r2c)(
dims_rnk, dims, howmany_rnk, howmany_dims,
in, (C*) out, fftw_flags);
plan->planned_in = in;
plan->planned_out = out;
plan->execute = (PX(fftw_execute)) X(execute_dft_r2c);
} else if(trafo_flag & PFFTI_TRAFO_C2R){
plan->plan = X(plan_guru64_dft_c2r)(
dims_rnk, dims, howmany_rnk, howmany_dims,
(C*) in, out, fftw_flags);
plan->planned_in = in;
plan->planned_out = out;
plan->execute = (PX(fftw_execute)) X(execute_dft_c2r);
} else if(trafo_flag & PFFTI_TRAFO_R2R){
plan->plan = X(plan_guru64_r2r)(
dims_rnk, dims, howmany_rnk, howmany_dims,
in, out, kinds, fftw_flags);
plan->planned_in = in;
plan->planned_out = out;
plan->execute = (PX(fftw_execute)) X(execute_r2r);
} else {
plan->plan = X(plan_guru64_dft)(
dims_rnk, dims, howmany_rnk, howmany_dims,
(C*) in, (C*) out, sign, fftw_flags);
plan->planned_in = in;
plan->planned_out = out;
plan->execute = (PX(fftw_execute)) X(execute_dft);
}
#if PFFT_DEBUG_SERTRAFO
*dbg_ptr = sertrafo_mkdbg(1, nb, rnk, n, howmany, in, out,
trafo_flag, transp_flag, fftw_flags, PFFTI_ARRAY_UNDEFINED,
dims_rnk, dims, howmany_rnk, howmany_dims);
#endif
free(dims); free(howmany_dims);
}
static void malloc_and_fill_dims_trafo(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag, unsigned transp_flag,
int *dims_rnk_ptr, X(iodim64) **dims_ptr,
int *howmany_rnk_ptr, X(iodim64) **howmany_dims_ptr
)
{
int dims_rnk, howmany_rnk;
INT *sz, *is, *os;
X(iodim64) *dims, *howmany_dims;
malloc_and_calculate_strides_trafo(
nb, rnk, n, howmany, trafo_flag, transp_flag,
&sz, &is, &os);
dims_rnk = rnk;
dims = (X(iodim64)*) malloc(sizeof(X(iodim64)) * (size_t) dims_rnk);
for(int t=0; tplan = X(plan_guru64_dft)(
dims_rnk, dims, howmany_rnk, howmany_dims,
(C*) in, (C*) out, sign, fftw_flags);
plan->planned_in = in;
plan->planned_out = out;
plan->execute = (PX(fftw_execute)) X(execute_dft);
} else {
plan->plan = X(plan_guru64_r2r)(
dims_rnk, dims, howmany_rnk, howmany_dims,
in, out, kinds, fftw_flags);
plan->planned_in = in;
plan->planned_out = out;
plan->execute = (PX(fftw_execute)) X(execute_r2r);
}
#if PFFT_DEBUG_SERTRAFO
*dbg_ptr = sertrafo_mkdbg(0, nb, rnk, n, howmany, in, out,
trafo_flag, transp_flag, fftw_flags, ioarray_flag,
dims_rnk, dims, howmany_rnk, howmany_dims);
#endif
free(howmany_dims);
}
static void malloc_and_fill_dims_remap(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag, unsigned transp_flag, unsigned ioarray_flag,
int *howmany_rnk_ptr, X(iodim64) **howmany_dims_ptr
)
{
int howmany_rnk;
INT *sz, *is, *os, *pn;
X(iodim64) *howmany_dims;
pn = PX(malloc_INT)(rnk);
PX(physical_dft_size)(rnk, n, trafo_flag, pn);
if( trafo_flag & PFFTI_TRAFO_R2C ){
if( ioarray_flag & PFFTI_ARRAY_OUTPUT )
howmany *= 2;
else
pn[rnk-1] *= 2;
}
if( trafo_flag & PFFTI_TRAFO_C2R ){
if( ioarray_flag & PFFTI_ARRAY_INPUT )
howmany *= 2;
else
pn[rnk-1] *= 2;
}
malloc_and_calculate_strides_remap(
nb, rnk, pn, howmany, transp_flag,
&sz, &is, &os);
howmany_rnk = rnk+2;
howmany_dims = (X(iodim64)*) malloc(sizeof(X(iodim64)) * (size_t) howmany_rnk);
for(int t=0; t=0; t--){
is[t] = is[t+1] * pni[t+1];
os[t] = os[t+1] * pno[t+1];
}
/* permute first two dims of strides */
if(transp_flag & PFFT_TRANSPOSED_IN )
exchange_INT(&is[0], &is[1]);
if(transp_flag & PFFT_TRANSPOSED_OUT )
exchange_INT(&os[0], &os[1]);
free(pni); free(pno);
*sz_ptr = sz;
*is_ptr = is;
*os_ptr = os;
}
static void malloc_and_calculate_strides_trafo(
INT nb, int rnk, const INT *n, INT howmany,
unsigned trafo_flag, unsigned transp_flag,
INT **sz_ptr, INT **is_ptr, INT **os_ptr
)
{
INT *sz, *is, *os, *pni, *pno;
/* allocate return arrays */
sz = PX(malloc_INT)(rnk+2);
is = PX(malloc_INT)(rnk+2);
os = PX(malloc_INT)(rnk+2);
/* we need the physical array sizes to compute strides */
pni = PX(malloc_INT)(rnk+2);
pno = PX(malloc_INT)(rnk+2);
/* set FFTW array size */
sz[0] = nb;
for(int t=0; t=0; t--){
is[t] = is[t+1] * pni[t+1];
os[t] = os[t+1] * pno[t+1];
}
/* permute first two dims of strides */
if(transp_flag & PFFT_TRANSPOSED_IN )
exchange_INT(&is[0], &is[1]);
if(transp_flag & PFFT_TRANSPOSED_OUT )
exchange_INT(&os[0], &os[1]);
free(pni); free(pno);
*sz_ptr = sz;
*is_ptr = is;
*os_ptr = os;
}
static void exchange_INT(
INT *a, INT *b
)
{
INT buf = *a;
*a = *b;
*b = buf;
}
static double measure_time(
X(plan) plan
)
{
double time=0;
if(plan != NULL){
/* 1st execution is often very slow, discard it */
X(execute)(plan);
/* measure 2nd execution */
time = -MPI_Wtime();
X(execute)(plan);
time += MPI_Wtime();
}
return time;
}
static sertrafo_plan sertrafo_mkplan(
void
)
{
sertrafo_plan ths = (sertrafo_plan) malloc(sizeof(sertrafo_plan_s));
/* initialize to NULL for easy checks */
for(int t=0; t<2; t++)
ths->plan[t].plan = NULL;
/* initialize debug info */
#if PFFT_DEBUG_SERTRAFO
for(int t=0; t<2; t++)
ths->dbg[t] = NULL;
#endif
return ths;
}
void PX(sertrafo_rmplan)(
sertrafo_plan ths
)
{
/* plan was already destroyed or never initialized */
if(ths==NULL)
return;
/* take care of unsuccessful FFTW planing */
for(int t=0; t<2; t++)
if(ths->plan[t].plan != NULL)
X(destroy_plan)(ths->plan[t].plan);
#if PFFT_DEBUG_SERTRAFO
for(int t=0; t<2; t++)
if(ths->dbg[t] != NULL)
sertrafo_rmdbg(ths->dbg[t]);
#endif
/* free memory */
free(ths);
/* ths=NULL; would be senseless, since we can not change the pointer itself */
}
#if PFFT_DEBUG_SERTRAFO
static sertrafo_dbg sertrafo_mkdbg(
int is_fft, INT nb, int rnk, const INT *n, INT howmany,
R *in, R *out,
unsigned trafo_flag, unsigned transp_flag, unsigned fftw_flags, unsigned ioarray_flag,
int dims_rnk, const X(iodim64) *dims, int howmany_rnk, const X(iodim64) *howmany_dims
)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
sertrafo_dbg ths = (sertrafo_dbg) malloc(sizeof(sertrafo_dbg_s));
ths->is_fft = is_fft;
ths->nb = nb;
ths->rnk = rnk;
if(rnk > 0){
ths->n = (INT*) malloc(sizeof(INT)*rnk);
for(int t=0; tn[t] = n[t];
} else
ths->n = NULL;
ths->howmany = howmany;
ths->in = in;
ths->out = out;
ths->trafo_flag = trafo_flag;
ths->transp_flag = transp_flag;
ths->fftw_flags = fftw_flags;
ths->ioarray_flag = ioarray_flag;
ths->dims_rnk = dims_rnk;
if(dims_rnk > 0){
ths->dims = (X(iodim64)*) malloc(sizeof(X(iodim64)) * (size_t) dims_rnk);
for(int t=0; tdims[t].n = dims[t].n;
ths->dims[t].is = dims[t].is;
ths->dims[t].os = dims[t].os;
}
} else
ths->dims = NULL;
ths->howmany_rnk = howmany_rnk;
if(howmany_rnk > 0){
ths->howmany_dims = (X(iodim64)*) malloc(sizeof(X(iodim64)) * (size_t) howmany_rnk);
for(int t=0; thowmany_dims[t].n = howmany_dims[t].n;
ths->howmany_dims[t].is = howmany_dims[t].is;
ths->howmany_dims[t].os = howmany_dims[t].os;
}
} else
ths->howmany_dims = NULL;
return ths;
}
void sertrafo_rmdbg(
sertrafo_dbg ths
)
{
/* plan was already destroyed or never initialized */
if(ths==NULL)
return;
if(ths->n != NULL)
free(ths->n);
if(ths->dims != NULL)
free(ths->dims);
if(ths->howmany_dims != NULL)
free(ths->howmany_dims);
/* free memory */
free(ths);
/* ths=NULL; would be senseless, since we can not change the pointer itself */
}
static void print_vec(
int rnk, INT *vec
)
{
fprintf(stderr, "[");
for(int t=0; thowmany, ths->in, ths->out);
fprintf(stderr, "PFFT_DBG_SERTRAFO: transp_flag = ");
if(ths->transp_flag & PFFT_TRANSPOSED_IN){
if(ths->transp_flag & PFFT_TRANSPOSED_OUT)
fprintf(stderr, "PFFT_TRANSPOSED_IN | PFFT_TRANSPOSED_OUT");
else
fprintf(stderr, "PFFT_TRANSPOSED_IN");
} else {
if(ths->transp_flag & PFFT_TRANSPOSED_OUT)
fprintf(stderr, "PFFT_TRANSPOSED_OUT");
else
fprintf(stderr, "PFFT_TRANSPOSED_NONE");
}
fprintf(stderr, "\n");
fprintf(stderr, "PFFT_DBG_SERTRAFO: trafo_flag = ");
if(ths->trafo_flag & PFFTI_TRAFO_R2C){
fprintf(stderr, "PFFTI_TRAFO_R2C");
} else if(ths->trafo_flag & PFFTI_TRAFO_C2R){
fprintf(stderr, "PFFTI_TRAFO_C2R");
} else if(ths->trafo_flag & PFFTI_TRAFO_R2R){
fprintf(stderr, "PFFTI_TRAFO_R2R");
} else if(ths->trafo_flag & PFFTI_TRAFO_C2C){
fprintf(stderr, "PFFTI_TRAFO_C2C");
} else {
fprintf(stderr, "!!! UNKNOWN TRAFO FLAG !!!");
}
fprintf(stderr, "\n");
fprintf(stderr, "PFFT_DBG_SERTRAFO: ioarray_flag = ");
if(ths->ioarray_flag & PFFTI_ARRAY_INPUT){
fprintf(stderr, "PFFTI_ARRAY_INPUT");
} else if(ths->ioarray_flag & PFFTI_ARRAY_OUTPUT){
fprintf(stderr, "PFFTI_ARRAY_OUTPUT");
} else if(ths->ioarray_flag & PFFTI_ARRAY_UNDEFINED){
fprintf(stderr, "PFFTI_ARRAY_UNDEFINED");
} else {
fprintf(stderr, "!!! UNKNOWN ARRAY FLAG !!!");
}
fprintf(stderr, "\n");
fprintf(stderr, "PFFT_DBG_SERTRAFO: dims_rnk = %d, ", ths->dims_rnk);
for(int t=0; tdims_rnk; t++)
fprintf(stderr, "dims[%d] = [n=%td, is=%td, os=%td], ", t, ths->dims[t].n, ths->dims[t].is, ths->dims[t].os);
fprintf(stderr, "\n");
fprintf(stderr, "PFFT_DBG_SERTRAFO: howmany_rnk = %d, ", ths->howmany_rnk);
for(int t=0; thowmany_rnk; t++)
fprintf(stderr, "howmany_dims[%d] = [n=%td, is=%td, os=%td], ", t, ths->howmany_dims[t].n, ths->howmany_dims[t].is, ths->howmany_dims[t].os);
fprintf(stderr, "\n");
}
INT calc_n_total(
sertrafo_dbg ths
)
{
INT tuple, n_total;
if(ths==NULL)
return 0;
tuple = (ths->trafo_flag & PFFTI_TRAFO_C2C) ? 2 : 1;
n_total = ths->howmany * ths->nb * tuple;
for(int t=0; trnk; t++)
n_total *= ths->n[t];
return n_total;
}
#endif
void PX(execute_fftw_plan)(
PX(fftw_plan) *fftwplan,
R *planned_in,
R *planned_out,
R *executed_in,
R *executed_out)
{
R *in = (fftwplan->planned_in == planned_in) ? executed_in : executed_out;
R *out = (fftwplan->planned_out == planned_out) ? executed_out : executed_in;
(fftwplan->execute)(fftwplan->plan, in, out);
}
void PX(execute_sertrafo)(
sertrafo_plan ths, R *planned_in, R *planned_out, R *executed_in, R *executed_out
)
{
if(ths==NULL)
return;
#if PFFT_DEBUG_SERTRAFO
static int counter=0;
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
R lsum, gsum;
INT n_total;
if(!counter)
if(!myrank)
fprintf(stderr, "PFFT_DBG_SERTRAFO: !!! Attention: checksums for out0 and in1 may differ between different runs, since the order of FFT and remap may change on all processes. Use FFTW_ESTIMATE to be sure that FFT is performed first on all processes. !!!\n");
if(!myrank) fprintf(stderr, "\n");
if(!myrank){
if(ths->plan[0].plan != NULL){
fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, plan0\n", counter);
print_dbg(ths->dbg[0]);
} else
fprintf(stderr, "PFFT_DBG_SERTRAFO: nothing to do for FFTW plan0\n");
}
n_total = calc_n_total(ths->dbg[0]);
/* Checksum inputs */
lsum=0.0;
if(ths->plan[0].plan != NULL)
for(INT k=0; kdbg[0]->in[k]);
MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(ths->plan[0].plan != NULL)
if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(in0) = %e\n", counter, gsum);
// #if PFFT_DEBUG_GVARS
// if(counter==1){
// int np;
// int np0, np1, rnk0, rnk1;
// MPI_Comm_size(MPI_COMM_WORLD, &np);
// MPI_Comm_size(gdbg_comms_pm[0], &np0);
// MPI_Comm_size(gdbg_comms_pm[1], &np1);
// MPI_Comm_rank(gdbg_comms_pm[0], &rnk0);
// MPI_Comm_rank(gdbg_comms_pm[1], &rnk1);
// INT local_N[3], local_N_start[3];
//
// local_N[0] = 512/np0; local_N_start[0] = 512/np0 * rnk0;
// local_N[1] = 512/np1; local_N_start[1] = 512/np1 * rnk1;
// local_N[2] = 512; local_N_start[2] = 0;
//
// if(!myrank) fprintf(stderr, "!!! check all coefficients !!!\n");
// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n",
// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]);
//
// int lerr=0;
// INT m=0;
// for(INT k2=local_N_start[2]; k2dbg[0]->in[m];
// if( (data - ind) > 1e-13){
// if(!lerr)
// if(!myrank)
// fprintf(stderr, "data[%td] = %e, ind = %e, k2=%td, k0=%td, k1=%td\n", data, m, ind, k2, k0, k1);
// lerr = 1;
// }
// }
// if(!myrank) fprintf(stderr, "coords = [%d, %d], lerr = %d\n", rnk0, rnk1, lerr);
// }
// #endif
/* Serial trafo */
if(ths->plan[0].plan != NULL) {
PX(execute_fftw_plan)(&ths->plan[0], planned_in, planned_out, executed_in, executed_out);
}
/* Checksum outputs */
lsum=0.0;
if(ths->plan[0].plan != NULL)
for(INT k=0; kdbg[0]->out[k]);
MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(ths->plan[0].plan != NULL)
if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(out0) = %e - Value may change\n", counter, gsum);
// #if PFFT_DEBUG_GVARS
// if(counter==1){
// int np;
// int np0, np1, rnk0, rnk1;
// MPI_Comm_size(MPI_COMM_WORLD, &np);
// MPI_Comm_size(gdbg_comms_pm[0], &np0);
// MPI_Comm_size(gdbg_comms_pm[1], &np1);
// MPI_Comm_rank(gdbg_comms_pm[0], &rnk0);
// MPI_Comm_rank(gdbg_comms_pm[1], &rnk1);
// INT local_N[3], local_N_start[3];
//
// local_N[0] = 512/np0; local_N_start[0] = 512/np0 * rnk0;
// local_N[1] = 512/np1; local_N_start[1] = 512/np1 * rnk1;
// local_N[2] = 512; local_N_start[2] = 0;
//
// if(!myrank) fprintf(stderr, "!!! check all coefficients !!!\n");
// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n",
// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]);
//
// int lerr=0;
// INT m=0;
// for(INT k0=local_N_start[0]; k0dbg[0]->out[m];
// if( (data - ind) > 1e-13){
// if(!lerr)
// if(!myrank)
// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2);
// lerr = 1;
// }
// }
// fprintf(stderr, "coords = [%d, %d], lerr = %d\n", rnk0, rnk1, lerr);
//
// MPI_Barrier(MPI_COMM_WORLD);
// }
// #endif
if(!myrank) fprintf(stderr, "\n");
if(!myrank){
if(ths->plan[1].plan != NULL){
fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, plan1\n", counter);
print_dbg(ths->dbg[1]);
} else
fprintf(stderr, "PFFT_DBG_SERTRAFO: nothing to do for FFTW plan1\n");
}
n_total = calc_n_total(ths->dbg[1]);
/* Checksum inputs */
lsum=0.0;
if(ths->plan[1].plan != NULL)
for(INT k=0; kdbg[1]->in[k]);
MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(ths->plan[1].plan != NULL)
if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(in1) = %e - Value may change.\n", counter, gsum);
/* Serial trafo */
if(ths->plan[1].plan != NULL) {
PX(execute_fftw_plan)(&ths->plan[1], planned_in, planned_out, executed_in, executed_out);
}
/* Checksum outputs */
lsum=0.0;
if(ths->plan[1].plan != NULL)
for(INT k=0; kdbg[1]->out[k]);
MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(ths->plan[1].plan != NULL)
if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(out1) = %e\n", counter, gsum);
counter++;
#else
/* execute all initialized serfft plans */
for(int t=0; t<2; t++) {
if(ths->plan[t].plan != NULL)
PX(execute_fftw_plan)(&ths->plan[t], planned_in, planned_out, executed_in, executed_out);
}
#endif
}