/*
* Copyright (c) 2010-2013 Michael Pippig
*
* This file is part of PFFT.
*
* PFFT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PFFT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with PFFT. If not, see .
*
*/
#include "pfft.h"
#include "ipfft.h"
#include "util.h"
static PX(gcplan) gc_mkplan(
int rnk_n);
static void decompose(
const INT *pn, const INT *block,
int rnk_pm, const MPI_Comm *comms_pm,
INT *local_n, INT *local_start);
static int gc_RMA_applicable(
int rnk_n, int rnk_pm, const INT *gc_below, const INT *gc_above);
static void vcopy_INT_transposed(
int rnk_n, const INT* n,
int rnk_pm, unsigned transp_flag,
INT *cp);
static void create_neighbor_group(
MPI_Comm comm_cart, MPI_Group *group_nbr);
/* FIXME: Generalize these functions to arbitray dimensions */
static void pad_array_with_zeros_outside_3d(
const INT *n, const INT *zeros_below, const INT *zeros_above,
INT tuple_size, R *real_data);
static void truncate_array_outside_3d(
const INT *n, const INT *trunc_below, const INT *trunc_above,
INT tuple_size, R *real_data);
INT PX(local_size_gc_internal)(
int rnk_n, const INT *local_n, const INT *local_start,
INT tuple, const INT *gc_below_user, const INT *gc_above_user,
INT *local_ngc, INT *local_gc_start
)
{
INT mem = tuple;
INT *gc_below, *gc_above;
gc_below = PX(malloc_INT)(rnk_n);
gc_above = PX(malloc_INT)(rnk_n);
PX(evaluate_user_gcells)(
rnk_n, gc_below_user, gc_above_user,
gc_below, gc_above);
PX(vadd_INT)(rnk_n, gc_below, local_n, local_ngc);
PX(vadd_INT)(rnk_n, gc_above, local_ngc, local_ngc);
PX(vsub_INT)(rnk_n, local_start, gc_below, local_gc_start);
for(int t = 0; t < rnk_n; t++)
mem *= gc_below[t] + local_n[t] + gc_above[t];
free(gc_below); free(gc_above);
return mem;
}
/* pfft_flag in {0, PFFT_TRANSPOSED_IN, PFFT_TRANSPOSED_OUT} */
PX(gcplan) PX(plan_rgc_internal)(
int rnk_n, const INT *n, INT tuple_size, const INT *block_user,
const INT *gc_below_user, const INT *gc_above_user,
R *data, int rnk_pm, MPI_Comm *comms_pm, MPI_Comm comm_cart,
unsigned gc_flags
)
{
int dims = 1, periods = 1, reorder = 0, rnk_self;
INT ngc_total, *dummy;
INT *gc_below, *gc_above;
PX(gcplan) ths = NULL;
MPI_Comm_rank(comm_cart, &rnk_self);
gc_below = PX(malloc_INT)(rnk_n);
gc_above = PX(malloc_INT)(rnk_n);
PX(evaluate_user_gcells)(
rnk_n, gc_below_user, gc_above_user,
gc_below, gc_above);
/* return NULL for zero gcells */
int nothing_to_do = 1;
for(int t=0; tdata = data;
ths->tuple = tuple_size;
/* save arrays in transposed order */
vcopy_INT_transposed(rnk_n, n, rnk_pm, gc_flags, ths->n);
vcopy_INT_transposed(rnk_n, gc_below, rnk_pm, gc_flags, ths->gc_below);
vcopy_INT_transposed(rnk_n, gc_above, rnk_pm, gc_flags, ths->gc_above);
/* calculate 'blk' from transposed array */
int *np_pm = PX(malloc_int)(rnk_pm);
for(int t=0; tn, block_user, np_pm,
ths->blk);
for(int t=rnk_pm; tblk[t] = ths->n[t];
free(np_pm);
/* calculate 'loc_n' from transposed array */
dummy = PX(malloc_INT)(rnk_pm);
decompose(ths->n, ths->blk, rnk_pm, comms_pm,
ths->loc_n, dummy);
for(int t=rnk_pm; tloc_n[t] = ths->n[t];
free(dummy);
/* calculate 'ngc' from transposed arrays */
for(int t=0; tngc[t] = ths->gc_below[t] + ths->loc_n[t] + ths->gc_above[t];
/* procmesh remains the same for transposed layout */
for(int t=0; trnk_prec[t], &ths->rnk_succ[t]);
MPI_Comm_dup(comms_pm[t], &ths->comms_pm[t]);
MPI_Comm_size(comms_pm[t], &ths->np[t]);
}
for(int t=rnk_pm; trnk_prec[t] = ths->rnk_succ[t] = rnk_self;
MPI_Cart_create(MPI_COMM_SELF, 1, &dims, &periods, reorder, &ths->comms_pm[t]);
ths->np[t] = 1;
}
/* create memory window for RMA algorithm */
ngc_total = tuple_size;
for(int t=0; tngc[t];
MPI_Win_create(data, (MPI_Aint) ((size_t) ngc_total * sizeof(R)), sizeof(R),
MPI_INFO_NULL, comm_cart, &ths->win);
MPI_Comm_dup(comm_cart, &ths->comm_cart);
/* group all neighbor prcesses for RMA algorithm */
create_neighbor_group(comm_cart, &ths->grp);
if(gc_flags & PFFT_GC_RMA)
if( !gc_RMA_applicable(rnk_n, rnk_pm, ths->gc_below, ths->gc_above) )
PX(fprintf)(comm_cart, stderr,
"Error: RMA Gcell algorithm doesn't support this data decompostion.\n");
/* default: call RMA ghost cell send if possible */
ths->alg_flag = PFFT_GC_RMA;
if( !gc_RMA_applicable(rnk_n, rnk_pm, ths->gc_below, ths->gc_above) )
ths->alg_flag = PFFT_GC_SENDRECV;
if( gc_flags & PFFT_GC_SENDRECV )
ths->alg_flag = PFFT_GC_SENDRECV;
/* calculate neighboring local size of distributed dimensions */
for(int t=0; tngc_prec[t] = ths->gc_below[t] + ths->gc_above[t];
ths->ngc_prec[t] += PX(local_block_size_shifted)(
ths->n[t], ths->blk[t], -1, ths->comms_pm[t]);
ths->ngc_succ[t] = ths->gc_below[t] + ths->gc_above[t];
ths->ngc_succ[t] += PX(local_block_size_shifted)(
ths->n[t], ths->blk[t], +1, ths->comms_pm[t]);
}
free(gc_below); free(gc_above);
return ths;
}
static int gc_RMA_applicable(
int rnk_n, int rnk_pm, const INT *gc_below, const INT *gc_above
)
{
/* The RMA ghostcell implementation only supports data distributions
* where the first two dimensions of a three-dimensional array
* are distributed on a two-dimensional process mesh. */
if(rnk_n != 3)
return 0;
if(rnk_pm != 2)
return 0;
if(gc_below[2] != 0)
return 0;
if(gc_above[2] != 0)
return 0;
return 1;
}
static void vcopy_INT_transposed(
int rnk_n, const INT* n,
int rnk_pm, unsigned transp_flag,
INT *cp
)
{
for(int t=0; trnk_n = rnk_n;
/* allocate arrays */
ths->n = PX(malloc_INT)(rnk_n);
ths->loc_n = PX(malloc_INT)(rnk_n);
ths->gc_below = PX(malloc_INT)(rnk_n);
ths->gc_above = PX(malloc_INT)(rnk_n);
ths->ngc = PX(malloc_INT)(rnk_n);
ths->ngc_prec = PX(malloc_INT)(rnk_n);
ths->ngc_succ = PX(malloc_INT)(rnk_n);
ths->np = PX(malloc_int)(rnk_n);
ths->rnk_prec = PX(malloc_int)(rnk_n);
ths->rnk_succ = PX(malloc_int)(rnk_n);
/* allocate mem for rnk_n comms, instead of only rnk_pm */
ths->blk = PX(malloc_INT)(rnk_n);
ths->comms_pm = (MPI_Comm*) malloc(sizeof(MPI_Comm) * (size_t) rnk_n);
ths->timer_exg = PX(gc_mktimer)();
ths->timer_red = PX(gc_mktimer)();
return ths;
}
void PX(rmplan_gc)(
PX(gcplan) ths
)
{
/* plan was already destroyed or never initialized */
if(ths == NULL)
return;
free(ths->n);
free(ths->loc_n);
free(ths->gc_below);
free(ths->gc_above);
free(ths->ngc);
free(ths->ngc_prec);
free(ths->ngc_succ);
free(ths->np);
free(ths->rnk_prec);
free(ths->rnk_succ);
free(ths->blk);
for(int t=0; t < ths->rnk_n; t++)
MPI_Comm_free(&ths->comms_pm[t]);
free(ths->comms_pm);
/* take care of MPI variables for RMA algorithm */
MPI_Group_free(&ths->grp);
MPI_Win_free(&ths->win);
MPI_Comm_free(&ths->comm_cart);
PX(destroy_gctimer)(ths->timer_exg);
PX(destroy_gctimer)(ths->timer_red);
/* free memory */
free(ths);
/* ths=NULL; would be senseless, since we can not change the pointer itself */
}
static void create_neighbor_group(
MPI_Comm comm_cart, MPI_Group *group_nbr
)
{
int rnk_pm, *dims, *periods, *coords;
int mpi_self, mpi_left, mpi_right;
int *ranks_nbr, n_nbr = 0, include_self = 0;
MPI_Group group_cart;
MPI_Comm_rank(comm_cart, &mpi_self);
MPI_Cartdim_get(comm_cart, &rnk_pm);
MPI_Comm_group(comm_cart, &group_cart);
dims = PX(malloc_int)(rnk_pm);
periods = PX(malloc_int)(rnk_pm);
coords = PX(malloc_int)(rnk_pm);
MPI_Cart_get(comm_cart, rnk_pm, dims, periods, coords);
ranks_nbr = PX(malloc_int)(2*rnk_pm);
/* avoid multiple inlcudes of same ranks */
for(int t=0; tcomm_cart, ths->timer_exg->pad_zeros);
pad_array_with_zeros_outside_3d(
ths->loc_n, ths->gc_below, ths->gc_above, ths->tuple,
ths->data);
PFFT_FINISH_TIMING(ths->timer_exg->pad_zeros);
PFFT_START_TIMING(ths->comm_cart, ths->timer_exg->exchange);
if(ths->alg_flag & PFFT_GC_RMA)
PX(exchange_gc_RMA)(ths);
else
PX(exchange_gc_sendrecv)(ths);
PFFT_FINISH_TIMING(ths->timer_exg->exchange);
}
void PX(reduce_gc)(
PX(gcplan) ths
)
{
if(ths == NULL)
return;
PFFT_START_TIMING(ths->comm_cart, ths->timer_red->exchange);
if(ths->alg_flag & PFFT_GC_RMA)
PX(reduce_gc_RMA)(ths);
else
PX(reduce_gc_sendrecv)(ths);
PFFT_FINISH_TIMING(ths->timer_red->exchange);
PFFT_START_TIMING(ths->comm_cart, ths->timer_red->pad_zeros);
truncate_array_outside_3d(
ths->ngc, ths->gc_below, ths->gc_above, ths->tuple,
ths->data);
PFFT_FINISH_TIMING(ths->timer_red->pad_zeros);
}
static void pad_array_with_zeros_outside_3d(
const INT *n, const INT *zeros_below, const INT *zeros_above,
INT tuple_size, R *real_data
)
{
INT m, mo;
INT n_start[3], n_end[3], no[3];
INT k0, k1, k2, h;
int k0_is_outside, k1_is_outside, k2_is_outside, index_is_outside;
PX(vcopy_INT)(3, zeros_below, n_start);
PX(vadd_INT)(3, n_start, n, n_end);
PX(vadd_INT)(3, n_end, zeros_above, no);
if( PX(equal_INT)(3, n, no) )
return;
m = tuple_size * PX(prod_INT)(3, n) - 1;
mo = tuple_size * PX(prod_INT)(3, no) - 1;
for(k0 = no[0] - 1 ; k0 >= 0; k0--){
k0_is_outside = (k0 < n_start[0]) || (n_end[0] <= k0);
for(k1 = no[1] - 1 ; k1 >= 0; k1--){
k1_is_outside = (k1 < n_start[1]) || (n_end[1] <= k1);
for(k2 = no[2] - 1 ; k2 >= 0; k2--){
k2_is_outside = (k2 < n_start[2]) || (n_end[2] <= k2);
index_is_outside = k0_is_outside || k1_is_outside || k2_is_outside;
for(h = 0; h < tuple_size; h++)
real_data[mo--] = (index_is_outside) ? 0 : real_data[m--];
}
}
}
}
static void truncate_array_outside_3d(
const INT *n, const INT *trunc_below, const INT *trunc_above,
INT tuple_size, R *real_data
)
{
INT n_total, mo, k0, k1, k2, h;
INT no_start[3], no_end[3], no[3];
PX(vcopy_INT)(3, trunc_below, no_start);
PX(vsub_INT)(3, n, trunc_above, no_end);
PX(vsub_INT)(3, no_end, trunc_below, no);
if( PX(equal_INT)(3, n, no) )
return;
mo = 0;
for(k0 = no_start[0]; k0 < no_end[0]; k0++){
for(k1 = no_start[1]; k1 < no_end[1]; k1++){
for(k2 = no_start[2]; k2 < no_end[2]; k2++){
for(h = 0; h < tuple_size; h++)
real_data[mo++] = real_data[h+tuple_size*(k2+n[2]*(k1+n[1]*k0))];
}
}
}
n_total = tuple_size * PX(prod_INT)(3, n);
for(k0 = mo; k0 < n_total; k0++)
real_data[k0] = 0;
}