/*
* Copyright (c) 2010-2013 Michael Pippig
*
* This file is part of PFFT.
*
* PFFT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PFFT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with PFFT. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "pfft.h"
#include "ipfft.h"
#include "util.h"
/* use MPI_Datatype to send ghostcells */
/* This option is more convenient for programming and avoids extra buffers for send/recv.
* However, the buffered send/recv is slightly faster. */
#define PFFT_GC_USE_MPI_DATATYPE 0
/* time parts of the ghostcell send */
#define PFFT_ENABLE_GC_TIMER 0
/* definition of timing macros */
#if PFFT_ENABLE_GC_TIMER
#define PFFT_GC_INIT_TIMING(comm) \
int tm_rank; \
MPI_Comm_rank(comm, &tm_rank); \
double tm_timer, tm_global_timer_max, tm_global_timer_min;
#define PFFT_GC_START_TIMING(comm) \
MPI_Barrier(comm); \
tm_timer = -MPI_Wtime();
#define PFFT_GC_FINISH_TIMING(comm, str) \
tm_timer += MPI_Wtime(); \
MPI_Reduce(&tm_timer, &tm_global_timer_max, 1, MPI_DOUBLE, MPI_MAX, 0, comm); \
MPI_Reduce(&tm_timer, &tm_global_timer_min, 1, MPI_DOUBLE, MPI_MIN, 0, comm); \
if(!tm_rank) printf("PFFT_GC_TIMING: %s takes %.2e s (minimum %.2e s, load imbalance %.2f %%)\n", str, tm_global_timer_max, tm_global_timer_min, 100.0*(tm_global_timer_max-tm_global_timer_min)/tm_global_timer_min);
#else
#define PFFT_GC_INIT_TIMING(comm)
#define PFFT_GC_START_TIMING(comm)
#define PFFT_GC_FINISH_TIMING(comm, str)
#endif
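/* Example: tm_global_timer_max = 1.2e-3 s and tm_global_timer_min = 1.0e-3 s
 * yield a reported load imbalance of 100*(1.2e-3 - 1.0e-3)/1.0e-3 = 20 %. */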
static void exchange_gcells_along_one_dim(
PX(gcplan) ths, int dim);
static void reduce_gcells_along_one_dim(
PX(gcplan) ths, int dim);
static void exchange_gcells_above_along_one_dim(
PX(gcplan) ths, int dim);
static void exchange_gcells_below_along_one_dim(
PX(gcplan) ths, int dim);
static void reduce_gcells_above_along_one_dim(
PX(gcplan) ths, int dim);
static void reduce_gcells_below_along_one_dim(
PX(gcplan) ths, int dim);
static void sendrecv_gcells_along_one_dim(
PX(gcplan) ths, int dim, int dir,
INT numSend, INT sendOffset, INT numRecv, INT recvOffset);
static void addsendrecv_gcells_along_one_dim(
PX(gcplan) ths, int dim, int dir,
INT numSend, INT sendOffset, INT numRecv, INT recvOffset);
#if PFFT_GC_USE_MPI_DATATYPE
static void isend_slices(
R *data, int rnk_n, const INT *localArraySize, int dim, int dir, INT tupleSize,
INT numSend, INT sendOffset, MPI_Comm commCart1d,
MPI_Request *request);
static void irecv_slices(
R *data, int rnk_n, const INT *localArraySize, int dim, int dir, INT tupleSize,
INT numRecv, INT recvOffset, MPI_Comm commCart1d, MPI_Request *request);
static void addrecv_slices(
R *data, int rnk_n, const INT *localArraySize, int dim, int dir, INT tupleSize,
INT numRecv, INT recvOffset, MPI_Comm commCart1d);
static void create_mpi_datatype_slices(
int rnk_n, const INT *n, INT tuple,
int dim, INT num_slices, INT offset,
MPI_Datatype *newtype);
#endif
/* FIXME: Generalize to arbitrary dimensions
* Idea: canonicalize to three dims: n0 * slices * n1 */
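/* (In that view, n0 collapses all dimensions before 'dim', the slices run along
 * 'dim' itself, and n1 collapses all dimensions after 'dim' together with the
 * tuple size, so the copy reduces to three nested loops.) */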
static void add_buffer_to_slices(
R *data, R *buffer,
const INT *localArraySize3d, INT tupleSize,
int dim, INT numSlices, INT offset);
#if !PFFT_GC_USE_MPI_DATATYPE
static void copy_slices_to_buffer(
R *data, R *buffer,
const INT *localArraySize3d, INT tupleSize,
int dim, INT numSlices, INT offset);
static void copy_buffer_to_slices(
R *data, R *buffer,
const INT *localArraySize3d, INT tupleSize,
int dim, INT numSlices, INT offset);
static INT calculate_buffer_size(
int rnk_n, const INT *n, INT tuple, INT num_slices, int dim);
static R* allocate_buffer(
int rnk_n, const INT *n, INT tuple,
int dim, INT num_slices,
INT *buffer_size);
static void isend_packed_slices(
int dir, INT buf_size, R *buffer, MPI_Comm commCart1d,
MPI_Request *request);
static void irecv_packed_slices(
int dir, INT buf_size, MPI_Comm commCart1d,
MPI_Request *request, R *buffer);
static void pack_slices(
INT buffer_size, int rnk_n, const INT *n, INT tuple,
int dim, INT num_slices, INT offset, R *data,
R *buffer);
static void unpack_slices(
int rnk_n, const INT *n, INT tuple,
int dim, INT num_slices, INT offset,
INT buffer_size, R *buffer,
unsigned add_buffer,
R *data);
#endif
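/* Fill the ghost layers dimension by dimension. Once dimension 'dim' is done,
 * its ghost layers contain valid data and are forwarded together with the
 * local block when the following dimensions are exchanged (see the t<=dim
 * index ranges in the debug checks below). */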
void PX(exchange_gc_sendrecv)(
PX(gcplan) ths
)
{
for(int dim=0; dim<ths->rnk_n; dim++)
exchange_gcells_along_one_dim(ths, dim);
}
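/* Adjoint of PX(exchange_gc_sendrecv): the contributions accumulated in the
 * ghost layers are sent back and added onto the cells of the owning blocks
 * (see addsendrecv_gcells_along_one_dim and the add_buffer flag). */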
void PX(reduce_gc_sendrecv)(
PX(gcplan) ths
)
{
for(int dim=0; dim<ths->rnk_n; dim++)
reduce_gcells_along_one_dim(ths, dim);
}
static void exchange_gcells_along_one_dim(
PX(gcplan) ths, int dim
)
{
exchange_gcells_below_along_one_dim(ths, dim);
exchange_gcells_above_along_one_dim(ths, dim);
}
static void reduce_gcells_along_one_dim(
PX(gcplan) ths, int dim
)
{
reduce_gcells_below_along_one_dim(ths, dim);
reduce_gcells_above_along_one_dim(ths, dim);
}
static void exchange_gcells_above_along_one_dim(
PX(gcplan) ths, int dim
)
{
INT numSendLeftOver, numRecvLeftOver, sendOffset, recvOffset;
INT numCurrentSend, numCurrentRecv, numSendAvail, numRecvAvail;
INT localArrayStart, localArrayEnd;
INT globalArraySize = ths->n[dim], blockSize = ths->blk[dim];
localArrayStart = localArrayEnd = ths->gc_below[dim];
localArrayEnd += ths->loc_n[dim];
#if PFFT_DEBUG_GHOSTCELLS
int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
R rsum, grsum;
INT start[3], end[3], ngc[3], k;
INT l0, l1, l2, l, m0, m1, m2, m, n0, n1, n2, n;
for(int t=0; t<3; t++)
ngc[t] = ths->gc_below[t] + ths->loc_n[t] + ths->gc_above[t];
#endif
#if PFFT_DEBUG_GHOSTCELLS
for(int t=0; t<3; t++){
start[t] = ths->gc_below[t];
end[t] = ths->gc_below[t] + ths->loc_n[t];
}
rsum = 0.0;
for(INT k0=start[0]; k0<end[0]; k0++){
for(INT k1=start[1]; k1<end[1]; k1++){
for(INT k2=start[2]; k2<end[2]; k2++){
for(INT t=0; t<ths->tuple; t++){
k = t + ths->tuple*(k2 + ngc[2]*(k1 + ngc[1]*k0));
rsum += fabs(ths->data[k]);
}
}
}
}
MPI_Reduce(&rsum, &grsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, Sum of initial data: %e\n", dim, grsum);
#endif
#if PFFT_DEBUG_GHOSTCELLS
for(int t=0; t<3; t++){
start[t] = (t<=dim) ? 0 : ths->gc_below[t];
end[t] = ths->gc_below[t] + ths->loc_n[t];
end[t] += (t<dim) ? ths->gc_above[t] : 0;
}
rsum = 0.0;
for(INT k0=start[0]; k0<end[0]; k0++){
for(INT k1=start[1]; k1<end[1]; k1++){
for(INT k2=start[2]; k2<end[2]; k2++){
for(INT t=0; t<ths->tuple; t++){
k = t + ths->tuple*(k2 + ngc[2]*(k1 + ngc[1]*k0));
rsum += fabs(ths->data[k]);
}
}
}
}
MPI_Reduce(&rsum, &grsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, Sum of data before gcsend: %e\n", dim, grsum);
#endif
#if PFFT_DEBUG_GHOSTCELLS
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, Before gcsend: loc_n = [%td, %td, %td], start = [%td, %td, %td], end = [%td, %td, %td], ngc = [%td, %td, %td], tuple = %td\n",
dim, ths->loc_n[0], ths->loc_n[1], ths->loc_n[2], start[0], start[1], start[2], end[0], end[1], end[2], ngc[0], ngc[1], ngc[2], ths->tuple);
#endif
#if PFFT_DEBUG_GHOSTCELLS
l0=ths->gc_below[0]; l1=ths->gc_below[1]; l2=ths->gc_below[2];
m0=l0+ths->gc_above[0]-1, m1=l1; m2=l2;
n0=0, n1=l1; n2=l2;
l=l2 + ngc[2]*( l1+ngc[1]*l0 );
m=m2 + ngc[2]*( m1+ngc[1]*m0 );
n=n2 + ngc[2]*( n1+ngc[1]*n0 );
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, Before gcsend: data[%td, %td, %td] = %e + I* %e, data[%td, %td, %td] = %e + I* %e, data[%td, %td, %td] = %e + I* %e\n",
dim, n0, n1, n2, ths->data[2*n], ths->data[2*n+1], l0, l1, l2, ths->data[2*l], ths->data[2*l+1], m0, m1, m2, ths->data[2*m], ths->data[2*m+1]);
#endif
sendOffset = localArrayStart;
recvOffset = localArrayEnd;
numSendLeftOver = numRecvLeftOver = ths->gc_above[dim];
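/* The ghost region may span several neighboring blocks. Pass 'shift' sends at
 * most the block size available at that distance and receives at most the
 * block size at distance shift+1, until all gc_above[dim] slices are
 * transferred, e.g., gc_above[dim] = 5 over uniform blocks of size 2 takes
 * three passes that transfer 2, 2 and 1 slices. */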
for(int shift=0; (numSendLeftOver > 0) || (numRecvLeftOver > 0); shift++){
numSendAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift, ths->comms_pm[dim]);
numRecvAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift+1, ths->comms_pm[dim]);
numCurrentSend = MIN(numSendLeftOver, numSendAvail);
numCurrentRecv = MIN(numRecvLeftOver, numRecvAvail);
sendrecv_gcells_along_one_dim(
ths, dim, -1, numCurrentSend, sendOffset, numCurrentRecv, recvOffset);
sendOffset += numCurrentSend;
recvOffset += numCurrentRecv;
numSendLeftOver -= numCurrentSend;
numRecvLeftOver -= numCurrentRecv;
}
#if PFFT_DEBUG_GHOSTCELLS
for(int t=0; t<3; t++){
start[t] = (t<=dim) ? 0 : ths->gc_below[t];
end[t] = ths->gc_below[t] + ths->loc_n[t];
end[t] += (t<=dim) ? ths->gc_above[t] : 0;
}
rsum = 0.0;
for(INT k0=start[0]; k0<end[0]; k0++){
for(INT k1=start[1]; k1<end[1]; k1++){
for(INT k2=start[2]; k2<end[2]; k2++){
for(INT t=0; t<ths->tuple; t++){
k = t + ths->tuple*(k2 + ngc[2]*(k1 + ngc[1]*k0));
rsum += fabs(ths->data[k]);
}
}
}
}
MPI_Reduce(&rsum, &grsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, Sum of data after gcsend: %e\n", dim, grsum);
#endif
#if PFFT_DEBUG_GHOSTCELLS
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, After gcsend: loc_n = [%td, %td, %td], start = [%td, %td, %td], end = [%td, %td, %td], tuple = %td\n",
dim, ths->loc_n[0], ths->loc_n[1], ths->loc_n[2], start[0], start[1], start[2], end[0], end[1], end[2], ths->tuple);
#endif
#if PFFT_DEBUG_GHOSTCELLS
l0=ths->gc_above[0]-1; l1=ths->gc_below[1]; l2=ths->gc_below[2];
m0=ths->gc_below[0] + ths->loc_n[0], m1=l1; m2=l2;
n0=0, n1=l1; n2=l2;
l=l2 + ngc[2]*( l1+ngc[1]*l0 );
m=m2 + ngc[2]*( m1+ngc[1]*m0 );
n=n2 + ngc[2]*( n1+ngc[1]*n0 );
if(!myrank) fprintf(stderr, "PFFT GC-Send Above: dim = %d, After gcsend: data[%td, %td, %td] = %e + I* %e, data[%td, %td, %td] = %e + I* %e, data[%td, %td, %td] = %e + I* %e\n",
dim, n0, n1, n2, ths->data[2*n], ths->data[2*n+1], l0, l1, l2, ths->data[2*l], ths->data[2*l+1], m0, m1, m2, ths->data[2*m], ths->data[2*m+1]);
#endif
}
static void exchange_gcells_below_along_one_dim(
PX(gcplan) ths, int dim
)
{
INT numSendLeftOver, numRecvLeftOver, sendOffset, recvOffset;
INT numCurrentSend, numCurrentRecv, numSendAvail, numRecvAvail;
INT localArrayStart, localArrayEnd;
INT globalArraySize = ths->n[dim], blockSize = ths->blk[dim];
localArrayStart = localArrayEnd = ths->gc_below[dim];
localArrayEnd += ths->loc_n[dim];
#if PFFT_DEBUG_GHOSTCELLS
int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
R rsum, grsum;
INT start[3], end[3], ngc[3], k;
for(int t=0; t<3; t++)
ngc[t] = ths->gc_below[t] + ths->loc_n[t] + ths->gc_above[t];
#endif
#if PFFT_DEBUG_GHOSTCELLS
for(int t=0; t<3; t++){
start[t] = ths->gc_below[t];
end[t] = ths->gc_below[t] + ths->loc_n[t];
}
rsum = 0.0;
for(INT k0=start[0]; k0<end[0]; k0++){
for(INT k1=start[1]; k1<end[1]; k1++){
for(INT k2=start[2]; k2<end[2]; k2++){
for(INT t=0; t<ths->tuple; t++){
k = t + ths->tuple*(k2 + ngc[2]*(k1 + ngc[1]*k0));
rsum += fabs(ths->data[k]);
}
}
}
}
MPI_Reduce(&rsum, &grsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(!myrank) fprintf(stderr, "PFFT GC-Send Below: dim = %d, Sum of initial data: %e\n", dim, grsum);
#endif
#if PFFT_DEBUG_GHOSTCELLS
for(int t=0; t<3; t++){
start[t] = (t<dim) ? 0 : ths->gc_below[t];
end[t] = ths->gc_below[t] + ths->loc_n[t];
end[t] += (t<dim) ? ths->gc_above[t] : 0;
}
rsum = 0.0;
for(INT k0=start[0]; k0<end[0]; k0++){
for(INT k1=start[1]; k1<end[1]; k1++){
for(INT k2=start[2]; k2<end[2]; k2++){
for(INT t=0; t<ths->tuple; t++){
k = t + ths->tuple*(k2 + ngc[2]*(k1 + ngc[1]*k0));
rsum += fabs(ths->data[k]);
}
}
}
}
MPI_Reduce(&rsum, &grsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(!myrank) fprintf(stderr, "PFFT GC-Send Below: dim = %d, Sum of data before gcsend: %e\n", dim, grsum);
#endif
#if PFFT_DEBUG_GHOSTCELLS
if(!myrank) fprintf(stderr, "PFFT GC-Send Below: dim = %d, Before gcsend: loc_n = [%td, %td, %td], start = [%td, %td, %td], end = [%td, %td, %td], ngc = [%td, %td, %td], tuple = %td\n",
dim, ths->loc_n[0], ths->loc_n[1], ths->loc_n[2], start[0], start[1], start[2], end[0], end[1], end[2], ngc[0], ngc[1], ngc[2], ths->tuple);
#endif
sendOffset = localArrayEnd;
recvOffset = localArrayStart;
numSendLeftOver = numRecvLeftOver = ths->gc_below[dim];
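/* Mirror image of the loop in exchange_gcells_above_along_one_dim: walk the
 * processes below via negative shifts and work backwards from the end of the
 * local block until all gc_below[dim] slices are transferred. */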
for(int shift=0; (numSendLeftOver > 0) || (numRecvLeftOver > 0); shift-- ){
numSendAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift, ths->comms_pm[dim]);
numRecvAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift-1, ths->comms_pm[dim]);
numCurrentSend = MIN(numSendLeftOver, numSendAvail);
numCurrentRecv = MIN(numRecvLeftOver, numRecvAvail);
sendOffset -= numCurrentSend;
recvOffset -= numCurrentRecv;
sendrecv_gcells_along_one_dim(
ths, dim, +1, numCurrentSend, sendOffset, numCurrentRecv, recvOffset);
numSendLeftOver -= numCurrentSend;
numRecvLeftOver -= numCurrentRecv;
}
#if PFFT_DEBUG_GHOSTCELLS
for(int t=0; t<3; t++){
start[t] = (t<=dim) ? 0 : ths->gc_below[t];
end[t] = ths->gc_below[t] + ths->loc_n[t];
end[t] += (t<dim) ? ths->gc_above[t] : 0;
}
rsum = 0.0;
for(INT k0=start[0]; k0<end[0]; k0++){
for(INT k1=start[1]; k1<end[1]; k1++){
for(INT k2=start[2]; k2<end[2]; k2++){
for(INT t=0; t<ths->tuple; t++){
k = t + ths->tuple*(k2 + ngc[2]*(k1 + ngc[1]*k0));
rsum += fabs(ths->data[k]);
}
}
}
}
MPI_Reduce(&rsum, &grsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
if(!myrank) fprintf(stderr, "PFFT GC-Send Below: dim = %d, Sum of data after gcsend: %e\n", dim, grsum);
#endif
#if PFFT_DEBUG_GHOSTCELLS
if(!myrank) fprintf(stderr, "PFFT GC-Send Below: dim = %d, After gcsend: loc_n = [%td, %td, %td], start = [%td, %td, %td], end = [%td, %td, %td], tuple = %td\n",
dim, ths->loc_n[0], ths->loc_n[1], ths->loc_n[2], start[0], start[1], start[2], end[0], end[1], end[2], ths->tuple);
#endif
}
static void reduce_gcells_above_along_one_dim(
PX(gcplan) ths, int dim
)
{
INT numSendLeftOver, numRecvLeftOver, sendOffset, recvOffset;
INT numCurrentSend, numCurrentRecv, numSendAvail, numRecvAvail;
INT localArrayStart, localArrayEnd, numSendPossible, numRecvPossible;
INT globalArraySize = ths->n[dim], blockSize = ths->blk[dim];
int numShifts;
localArrayStart = localArrayEnd = ths->gc_below[dim];
localArrayEnd += ths->loc_n[dim];
/* corresponds to loop in exchange_gcells_above_along_one_dim
* with switched sender and receiver */
numShifts = 0;
numSendPossible = numRecvPossible = 0;
numSendLeftOver = numRecvLeftOver = ths->gc_above[dim];
for(int shift=0; (numSendLeftOver > 0) || (numRecvLeftOver > 0); shift++){
numSendAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift+1, ths->comms_pm[dim]);
numRecvAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift, ths->comms_pm[dim]);
numSendPossible += numSendAvail;
numRecvPossible += numRecvAvail;
numCurrentSend = MIN(numSendLeftOver, numSendAvail);
numCurrentRecv = MIN(numRecvLeftOver, numRecvAvail);
numSendLeftOver -= numCurrentSend;
numRecvLeftOver -= numCurrentRecv;
numShifts++;
}
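/* Replay the passes in reverse order: after removing the capacity of the
 * current pass, whatever the remaining passes cannot carry has to be sent
 * and received now. This mirrors the exchange schedule exactly, so every
 * ghost slice is added back onto the block it originated from. */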
sendOffset = localArrayEnd + ths->gc_above[dim];
recvOffset = localArrayStart + ths->gc_above[dim];
numSendLeftOver = numRecvLeftOver = ths->gc_above[dim];
for(int shift = numShifts-1; shift >= 0; shift-- ){
numSendPossible -= PX(local_block_size_shifted)(
globalArraySize, blockSize, shift+1, ths->comms_pm[dim]);
numRecvPossible -= PX(local_block_size_shifted)(
globalArraySize, blockSize, shift, ths->comms_pm[dim]);
numCurrentSend = numSendLeftOver - MIN(numSendLeftOver, numSendPossible);
numCurrentRecv = numRecvLeftOver - MIN(numRecvLeftOver, numRecvPossible);
sendOffset -= numCurrentSend;
recvOffset -= numCurrentRecv;
addsendrecv_gcells_along_one_dim(
ths, dim, +1, numCurrentSend, sendOffset, numCurrentRecv, recvOffset);
numSendLeftOver -= numCurrentSend;
numRecvLeftOver -= numCurrentRecv;
}
}
static void reduce_gcells_below_along_one_dim(
PX(gcplan) ths, int dim
)
{
INT numSendLeftOver, numRecvLeftOver, sendOffset, recvOffset;
INT numCurrentSend, numCurrentRecv, numSendAvail, numRecvAvail;
INT localArrayEnd, numSendPossible, numRecvPossible;
INT globalArraySize = ths->n[dim], blockSize = ths->blk[dim];
int numShifts;
localArrayEnd = ths->gc_below[dim];
localArrayEnd += ths->loc_n[dim];
/* corresponds to loop in exchange_gcells_above_along_one_dim
* with switched sender and receiver */
numShifts = 0;
numSendPossible = numRecvPossible = 0;
numSendLeftOver = numRecvLeftOver = ths->gc_below[dim];
for(int shift=0; (numSendLeftOver > 0) || (numRecvLeftOver > 0); shift--){
numSendAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift-1, ths->comms_pm[dim]);
numRecvAvail = PX(local_block_size_shifted)(
globalArraySize, blockSize, shift, ths->comms_pm[dim]);
numSendPossible += numSendAvail;
numRecvPossible += numRecvAvail;
numCurrentSend = MIN(numSendLeftOver, numSendAvail);
numCurrentRecv = MIN(numRecvLeftOver, numRecvAvail);
numSendLeftOver -= numCurrentSend;
numRecvLeftOver -= numCurrentRecv;
numShifts--;
}
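/* Same reversal as in reduce_gcells_above_along_one_dim, but towards the
 * processes below, i.e., with negative shifts. */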
sendOffset = 0;
recvOffset = localArrayEnd - ths->gc_below[dim];
numSendLeftOver = numRecvLeftOver = ths->gc_below[dim];
for(int shift = numShifts+1; shift <= 0; shift++ ){
numSendPossible -= PX(local_block_size_shifted)(
globalArraySize, blockSize, shift-1, ths->comms_pm[dim]);
numRecvPossible -= PX(local_block_size_shifted)(
globalArraySize, blockSize, shift, ths->comms_pm[dim]);
numCurrentSend = numSendLeftOver - MIN(numSendLeftOver, numSendPossible);
numCurrentRecv = numRecvLeftOver - MIN(numRecvLeftOver, numRecvPossible);
addsendrecv_gcells_along_one_dim(
ths, dim, -1, numCurrentSend, sendOffset, numCurrentRecv, recvOffset);
sendOffset += numCurrentSend;
recvOffset += numCurrentRecv;
numSendLeftOver -= numCurrentSend;
numRecvLeftOver -= numCurrentRecv;
}
}
#if PFFT_GC_USE_MPI_DATATYPE
static void sendrecv_gcells_along_one_dim(
PX(gcplan) ths, int dim, int dir,
INT numSend, INT sendOffset, INT numRecv, INT recvOffset
)
{
MPI_Request mpi_req[2];
PFFT_GC_INIT_TIMING(MPI_COMM_WORLD);
PFFT_GC_START_TIMING(MPI_COMM_WORLD);
isend_slices(ths->data, ths->rnk_n, ths->ngc, dim, dir,
ths->tuple, numSend, sendOffset, ths->comms_pm[dim], &mpi_req[0]);
irecv_slices(ths->data, ths->rnk_n, ths->ngc, dim, dir,
ths->tuple, numRecv, recvOffset, ths->comms_pm[dim], &mpi_req[1]);
MPI_Waitall(2, mpi_req, MPI_STATUSES_IGNORE);
PFFT_GC_FINISH_TIMING(MPI_COMM_WORLD, "send/recv");
}
#else
static void sendrecv_gcells_along_one_dim(
PX(gcplan) ths, int dim, int dir,
INT numSend, INT sendOffset, INT numRecv, INT recvOffset
)
{
INT sbuf_size=0, rbuf_size=0;
R *sbuf=NULL, *rbuf=NULL;
MPI_Request mpi_req[2];
unsigned add_buffer=0;
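/* Buffered variant: pack the outgoing slices into a contiguous send buffer,
 * exchange the packed buffers, then unpack into the ghost layers.
 * add_buffer = 0 makes unpack_slices overwrite the target cells. */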
PFFT_GC_INIT_TIMING(MPI_COMM_WORLD);
PFFT_GC_START_TIMING(MPI_COMM_WORLD);
sbuf = allocate_buffer(ths->rnk_n, ths->ngc, ths->tuple, dim, numSend,
&sbuf_size);
rbuf = allocate_buffer(ths->rnk_n, ths->ngc, ths->tuple, dim, numRecv,
&rbuf_size);
PFFT_GC_FINISH_TIMING(MPI_COMM_WORLD, "allocate buffers");
PFFT_GC_START_TIMING(MPI_COMM_WORLD);
pack_slices(sbuf_size, ths->rnk_n, ths->ngc, ths->tuple, dim, numSend, sendOffset, ths->data,
sbuf);
PFFT_GC_FINISH_TIMING(MPI_COMM_WORLD, "pack_slices");
#if PFFT_ENABLE_GC_TIMER
if(!tm_rank) printf("sbuf_size = %td, rbuf_size = %td\n", sbuf_size, rbuf_size);
#endif
PFFT_GC_START_TIMING(MPI_COMM_WORLD);
isend_packed_slices(dir, sbuf_size, sbuf, ths->comms_pm[dim],
&mpi_req[0]);
irecv_packed_slices(dir, rbuf_size, ths->comms_pm[dim],
&mpi_req[1], rbuf);
MPI_Waitall(2, mpi_req, MPI_STATUSES_IGNORE);
PFFT_GC_FINISH_TIMING(MPI_COMM_WORLD, "send/recv");
PFFT_GC_START_TIMING(MPI_COMM_WORLD);
unpack_slices(ths->rnk_n, ths->ngc, ths->tuple, dim, numRecv, recvOffset, rbuf_size, rbuf, add_buffer,
ths->data);
PFFT_GC_FINISH_TIMING(MPI_COMM_WORLD, "unpack_slices");
if(sbuf != NULL) free(sbuf);
if(rbuf != NULL) free(rbuf);
}
#endif
#if PFFT_GC_USE_MPI_DATATYPE
static void addsendrecv_gcells_along_one_dim(
PX(gcplan) ths, int dim, int dir,
INT numSend, INT sendOffset, INT numRecv, INT recvOffset
)
{
MPI_Request request;
PFFT_GC_INIT_TIMING(MPI_COMM_WORLD);
PFFT_GC_START_TIMING();
isend_slices(ths->data, ths->rnk_n, ths->ngc, dim, dir,
ths->tuple, numSend, sendOffset, ths->comms_pm[dim], &request);
addrecv_slices(ths->data, ths->rnk_n, ths->ngc, dim, dir,
ths->tuple, numRecv, recvOffset, ths->comms_pm[dim]);
MPI_Wait(&request, MPI_STATUS_IGNORE);
PFFT_GC_FINISH_TIMING(MPI_COMM_WORLD, "send/addrecv");
}
#else
static void addsendrecv_gcells_along_one_dim(
PX(gcplan) ths, int dim, int dir,
INT numSend, INT sendOffset, INT numRecv, INT recvOffset
)
{
INT sbuf_size=0, rbuf_size=0;
R *sbuf=NULL, *rbuf=NULL;
MPI_Request mpi_req[2];
unsigned add_buffer=1;
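/* add_buffer = 1 makes unpack_slices add the received ghost contributions
 * onto the local data instead of overwriting it. */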
sbuf = allocate_buffer(ths->rnk_n, ths->ngc, ths->tuple, dim, numSend,
&sbuf_size);
rbuf = allocate_buffer(ths->rnk_n, ths->ngc, ths->tuple, dim, numRecv,
&rbuf_size);
pack_slices(sbuf_size, ths->rnk_n, ths->ngc, ths->tuple, dim, numSend, sendOffset, ths->data,
sbuf);
isend_packed_slices(dir, sbuf_size, sbuf, ths->comms_pm[dim],
&mpi_req[0]);
irecv_packed_slices(dir, rbuf_size, ths->comms_pm[dim],
&mpi_req[1], rbuf);
MPI_Waitall(2, mpi_req, MPI_STATUSES_IGNORE);
unpack_slices(ths->rnk_n, ths->ngc, ths->tuple, dim, numRecv, recvOffset, rbuf_size, rbuf, add_buffer,
ths->data);
if(sbuf != NULL) free(sbuf);
if(rbuf != NULL) free(rbuf);
}
#endif
#if PFFT_GC_USE_MPI_DATATYPE
static void isend_slices(
R *data, int rnk_n, const INT *localArraySize, int dim, int dir, INT tupleSize,
INT numSend, INT sendOffset, MPI_Comm commCart1d,
MPI_Request *request
)
{
int from, to, disp;
INT numSendTotal=tupleSize;
MPI_Datatype sendtype;
/* total number of reals contained in the sent slices */
for(int t=0; t<rnk_n; t++)
numSendTotal *= (t==dim) ? numSend : localArraySize[t];
/* ... */
/* calculate multi-dimensional index from the linear loop index */
for(t=rnk_n-1; t>=0; t--){
subind_vec[t] = tmp % subsizes[t];
tmp /= subsizes[t];
}
/* calculate vectorized index in strided buffer */
ind_lin = subind_vec[0] + starts[0];
for(t=1; t<rnk_n; t++)
/* ... */
/* calculate multi-dimensional index from the linear loop index */
for(t=rnk_n-1; t>=0; t--){
subind_vec[t] = tmp % subsizes[t];
tmp /= subsizes[t];
}
/* calculate vectorized index in strided buffer */
ind_lin = subind_vec[0] + starts[0];
for(t=1; t<rnk_n; t++)