/*
 * Copyright (c) 2010-2013 Michael Pippig
 *
 * This file is part of PFFT.
 *
 * PFFT is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PFFT is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with PFFT. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include "pfft.h"
#include "ipfft.h"
#include "util.h"

/* Warning: This ghostcell implementation only supports data distributions
 * where the first two dimensions are distributed among a two-dimensional
 * processor grid. */

/* get corners from the following neighbors
 *        |
 *        v
 * -->oxxo
 *    x  x
 *    x  x
 *    oxxo<--
 *    ^
 *    |
 */

static void exchange_gcells(
    PX(gcplan) ths, unsigned gcflags);
static void reduce_gcells(
    PX(gcplan) ths, unsigned gcflags);
static void exchange_gcells_along_n0(
    PX(gcplan) ths, unsigned gcflags, int direction,
    INT gc_xstart, INT gc_target_xstart, INT gc_xget);
static void exchange_gcells_along_n1(
    PX(gcplan) ths, unsigned gcflags, int direction,
    INT gc_ystart, INT gc_target_ystart, INT gc_yget);
static void execute_gcell_exchange(
    INT gc_xstart, INT gc_ystart, INT gc_target_xstart, INT gc_target_ystart,
    INT gc_xget, INT gc_yget,
    const INT *ngc, const INT *target_ngc, INT tuple_size,
    int target_rank, MPI_Win win, unsigned gcflags,
    R *data);
static int sync_communication_finished(
    const INT *gc_below_remain, const INT *gc_above_remain,
    MPI_Comm comm);


void PX(exchange_gc_RMA)(
    PX(gcplan) ths
    )
{
  exchange_gcells(ths, PFFTI_GC_BORDERS);
  exchange_gcells(ths, PFFTI_GC_CORNERS);
}


void PX(reduce_gc_RMA)(
    PX(gcplan) ths
    )
{
  reduce_gcells(ths, PFFTI_GC_BORDERS);
  reduce_gcells(ths, PFFTI_GC_CORNERS);
}


static void exchange_gcells(
    PX(gcplan) ths, unsigned gcflags
    )
{
  const int d=2;
  INT gc_below_remain[d], gc_above_remain[d];
  INT gc_below_start[d], gc_above_start[d];
  INT gc_below_avail[d], gc_above_avail[d];
  INT gc_prev_start[d], gc_next_start[d];
  INT gc_below_get[d], gc_above_get[d];

  /* initialize the number of ghost cells that still have to be fetched and
   * the read/write offsets on this process and on its direct neighbors */
  for(int t=0; t<2; t++){
    gc_below_remain[t] = ths->gc_below[t];
    gc_above_remain[t] = ths->gc_above[t];
    gc_below_start[t] = ths->gc_below[t];
    gc_above_start[t] = ths->gc_below[t] + ths->loc_n[t];
    gc_prev_start[t] = ths->gc_below[t] + PX(local_block_size_shifted)(
        ths->n[t], ths->blk[t], -1, ths->comms_pm[t]);
    gc_next_start[t] = ths->gc_below[t];
  }

  /* walk outward over the neighbor blocks until all requested ghost cells
   * have been fetched; shift s addresses the s-th neighbor in each direction */
  for(int shift = 1; 1; shift++){
    for(int t=0; t<2; t++){
      gc_below_avail[t] = PX(local_block_size_shifted)(
          ths->n[t], ths->blk[t], -shift, ths->comms_pm[t]);
      gc_above_avail[t] = PX(local_block_size_shifted)(
          ths->n[t], ths->blk[t], +shift, ths->comms_pm[t]);
      gc_below_get[t] = MIN(gc_below_remain[t], gc_below_avail[t]);
      gc_above_get[t] = MIN(gc_above_remain[t], gc_above_avail[t]);
      gc_below_start[t] -= gc_below_get[t];
      gc_prev_start[t]  -= gc_below_get[t];
    }

    /* exchange is implemented with MPI_Get */
    MPI_Win_fence(MPI_MODE_NOPRECEDE | MPI_MODE_NOPUT, ths->win);
//     MPI_Win_post(ths->grp, 0, ths->win);
//     MPI_Win_start(ths->grp, 0, ths->win);

    exchange_gcells_along_n0(ths, gcflags | PFFTI_GC_TRAFO, -1,
        gc_below_start[0], gc_prev_start[0], gc_below_get[0]);
    exchange_gcells_along_n0(ths, gcflags | PFFTI_GC_TRAFO, +1,
        gc_above_start[0], gc_next_start[0], gc_above_get[0]);
    exchange_gcells_along_n1(ths, gcflags | PFFTI_GC_TRAFO, -1,
        gc_below_start[1], gc_prev_start[1], gc_below_get[1]);
    exchange_gcells_along_n1(ths, gcflags | PFFTI_GC_TRAFO, +1,
        gc_above_start[1], gc_next_start[1], gc_above_get[1]);

    for(int t=0; t<2; t++){
      gc_above_start[t] += gc_above_get[t];
      gc_next_start[t] += gc_above_get[t];
      gc_below_remain[t] -= gc_below_avail[t];
      gc_above_remain[t] -= gc_above_avail[t];
    }

    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED, ths->win);
//     MPI_Win_complete(ths->win);
//     MPI_Win_wait(ths->win);

//     if( (gc_below_remain[0] <= 0) && (gc_above_remain[0] <= 0) )
//       if( (gc_below_remain[1] <= 0) && (gc_above_remain[1] <= 0) )
//         break;

    /* synchronized end of communication */
    if( sync_communication_finished(gc_below_remain, gc_above_remain, ths->comm_cart) )
      break;
  }
}
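/* Worked example of the shift loop above (illustrative numbers, not taken
 * from any particular run): suppose along dimension t the neighbor blocks
 * hold blk[t] = 4 elements each and this process needs gc_below[t] = 6
 * ghost cells. At shift 1 the direct neighbor offers gc_below_avail = 4,
 * so gc_below_get = MIN(6, 4) = 4 and gc_below_remain drops to 2. At
 * shift 2 the next-but-one neighbor offers 4 more, of which
 * MIN(2, 4) = 2 are fetched; gc_below_remain turns negative and the loop
 * ends as soon as all processes agree via sync_communication_finished(). */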
static void reduce_gcells(
    PX(gcplan) ths, unsigned gcflags
    )
{
  const int d=2;
  int shifts;
  INT gc_below_remain[d], gc_above_remain[d];
  INT gc_below_start[d], gc_above_start[d];
  INT gc_below_avail[d], gc_above_avail[d];
  INT gc_prev_start[d], gc_next_start[d];
  INT gc_below_put[d], gc_above_put[d];

  for(int t=0; t<2; t++){
    gc_below_remain[t] = ths->gc_below[t];
    gc_above_remain[t] = ths->gc_above[t];
  }

  /* count the number of communication shifts that the corresponding
   * exchange needed; the adjoint puts below retrace them in reverse order */
  for(shifts = 0; 1; shifts++){
//     if( (gc_below_remain[0] <= 0) && (gc_above_remain[0] <= 0) )
//       if( (gc_below_remain[1] <= 0) && (gc_above_remain[1] <= 0) )
//         break;

    /* synchronized end of communication */
    if( sync_communication_finished(gc_below_remain, gc_above_remain, ths->comm_cart) )
      break;

    for(int t=0; t<2; t++){
      gc_below_avail[t] = PX(local_block_size_shifted)(
          ths->n[t], ths->blk[t], -shifts, ths->comms_pm[t]);
      gc_above_avail[t] = PX(local_block_size_shifted)(
          ths->n[t], ths->blk[t], +shifts, ths->comms_pm[t]);
      gc_below_remain[t] -= gc_below_avail[t];
      gc_above_remain[t] -= gc_above_avail[t];
    }
  }

  for(int t=0; t<2; t++){
    gc_below_start[t] = 0;
    gc_above_start[t] = ths->ngc[t];
    gc_prev_start[t] = PX(local_block_size_shifted)(
        ths->n[t], ths->blk[t], -1, ths->comms_pm[t]);
    gc_next_start[t] = ths->gc_below[t] + ths->gc_above[t];
  }

  for(int shift = shifts; shift > 0; shift--){
    for(int t=0; t<2; t++){
      gc_below_avail[t] = PX(local_block_size_shifted)(
          ths->n[t], ths->blk[t], -shift, ths->comms_pm[t]);
      gc_above_avail[t] = PX(local_block_size_shifted)(
          ths->n[t], ths->blk[t], +shift, ths->comms_pm[t]);
      gc_below_remain[t] += gc_below_avail[t];
      gc_above_remain[t] += gc_above_avail[t];
      gc_below_put[t] = MAX(0, MIN(gc_below_remain[t], gc_below_avail[t]));
      gc_above_put[t] = MAX(0, MIN(gc_above_remain[t], gc_above_avail[t]));
      gc_above_start[t] -= gc_above_put[t];
      gc_next_start[t]  -= gc_above_put[t];
    }

    /* reduce is implemented with MPI_Accumulate */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, ths->win);
//     MPI_Win_post(ths->grp, 0, ths->win);
//     MPI_Win_start(ths->grp, 0, ths->win);

    exchange_gcells_along_n0(ths, gcflags | PFFTI_GC_ADJOINT, -1,
        gc_below_start[0], gc_prev_start[0], gc_below_put[0]);
    exchange_gcells_along_n0(ths, gcflags | PFFTI_GC_ADJOINT, +1,
        gc_above_start[0], gc_next_start[0], gc_above_put[0]);
    exchange_gcells_along_n1(ths, gcflags | PFFTI_GC_ADJOINT, -1,
        gc_below_start[1], gc_prev_start[1], gc_below_put[1]);
    exchange_gcells_along_n1(ths, gcflags | PFFTI_GC_ADJOINT, +1,
        gc_above_start[1], gc_next_start[1], gc_above_put[1]);

    for(int t=0; t<2; t++){
      gc_below_start[t] += gc_below_put[t];
      gc_prev_start[t]  += gc_below_put[t];
    }

    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED, ths->win);
//     MPI_Win_complete(ths->win);
//     MPI_Win_wait(ths->win);
  }
}

/* The end of communication shifts must be synchronized on all processes.
 * Otherwise deadlocks occur because of unmatched window synchronization calls. */
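/* Minimal sketch of the deadlock mentioned above (hypothetical two-process
 * trace, not library code): if rank 0 still needs one more shift but rank 1
 * decided locally that it is finished,
 *
 *   rank 0: MPI_Win_fence(...); MPI_Get(...); MPI_Win_fence(...);  <- blocks
 *   rank 1: (already left the loop, never calls MPI_Win_fence again)
 *
 * MPI_Win_fence is collective over the window's process group, so rank 0
 * waits forever. Hence the MPI_Allreduce with MPI_MIN below: the loops only
 * end after every process reports that its own transfers are complete. */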
static int sync_communication_finished(
    const INT *gc_below_remain, const INT *gc_above_remain,
    MPI_Comm comm
    )
{
  int finished = 0, all_finished;

  if( (gc_below_remain[0] <= 0) && (gc_above_remain[0] <= 0) )
    if( (gc_below_remain[1] <= 0) && (gc_above_remain[1] <= 0) )
      finished = 1;

  /* the communication loop may only end once every process is finished */
  MPI_Allreduce(&finished, &all_finished, 1, MPI_INT, MPI_MIN, comm);

  return all_finished;
}


static void exchange_gcells_along_n0(
    PX(gcplan) ths, unsigned gcflags, int direction,
    INT gc_xstart, INT gc_target_xstart, INT gc_xget
    )
{
  int target_rank, dim=0;
  INT *target_ngc, gc_ystart, gc_target_ystart, gc_yget;

  if(gc_xget <= 0)
    return;

  target_ngc = PX(malloc_INT)(ths->rnk_n);
  for(int t=0; t < ths->rnk_n; t++)
    target_ngc[t] = ths->loc_n[t];

  target_rank = (direction == -1) ? ths->rnk_prec[dim] : ths->rnk_succ[dim];
  target_ngc[dim] = (direction == -1) ? ths->ngc_prec[dim] : ths->ngc_succ[dim];

  if(gcflags & PFFTI_GC_CORNERS){
    gc_ystart = gc_target_ystart = (direction == -1) ?
      ths->gc_below[1] + ths->loc_n[1] : 0;
    gc_yget = (direction == -1) ? ths->gc_above[1] : ths->gc_below[1];
  } else {
    gc_ystart = gc_target_ystart = ths->gc_below[1];
    gc_yget = ths->loc_n[1];
  }

  execute_gcell_exchange(
      gc_xstart, gc_ystart, gc_target_xstart, gc_target_ystart,
      gc_xget, gc_yget,
      ths->ngc, target_ngc, ths->tuple, target_rank, ths->win, gcflags,
      ths->data);

  free(target_ngc);
}


static void exchange_gcells_along_n1(
    PX(gcplan) ths, unsigned gcflags, int direction,
    INT gc_ystart, INT gc_target_ystart, INT gc_yget
    )
{
  int target_rank, dim=1;
  INT *target_ngc, gc_xstart, gc_target_xstart, gc_xget;

  if(gc_yget <= 0)
    return;

  target_ngc = PX(malloc_INT)(ths->rnk_n);
  for(int t=0; t < ths->rnk_n; t++)
    target_ngc[t] = ths->loc_n[t];

  target_rank = (direction == -1) ? ths->rnk_prec[dim] : ths->rnk_succ[dim];
  target_ngc[dim] = (direction == -1) ? ths->ngc_prec[dim] : ths->ngc_succ[dim];

  if(gcflags & PFFTI_GC_CORNERS){
    gc_xstart = gc_target_xstart = (direction == -1) ?
      0 : ths->gc_below[0] + ths->loc_n[0];
    gc_xget = (direction == -1) ? ths->gc_below[0] : ths->gc_above[0];
  } else {
    gc_xstart = gc_target_xstart = ths->gc_below[0];
    gc_xget = ths->loc_n[0];
  }

  execute_gcell_exchange(
      gc_xstart, gc_ystart, gc_target_xstart, gc_target_ystart,
      gc_xget, gc_yget,
      ths->ngc, target_ngc, ths->tuple, target_rank, ths->win, gcflags,
      ths->data);

  free(target_ngc);
}


static void execute_gcell_exchange(
    INT gc_xstart, INT gc_ystart, INT gc_target_xstart, INT gc_target_ystart,
    INT gc_xget, INT gc_yget,
    const INT *ngc, const INT *target_ngc, INT tuple_size,
    int target_rank, MPI_Win win, unsigned gcflags,
    R *data
    )
{
  int num_recv, num_send; /* FIXME: 64-bit portability issue, but MPI forces us to use int */
  INT offset_recv, offset_send;

  /* for every line along n0 transfer one contiguous patch of gc_yget rows,
   * each holding ngc[2] * tuple_size reals */
  for(INT k0 = 0; k0 < gc_xget; k0++){
    offset_recv = MACRO_PLAIN_INDEX_3D(
        k0 + gc_xstart, gc_ystart, 0, ngc) * tuple_size;
    offset_send = MACRO_PLAIN_INDEX_3D(
        k0 + gc_target_xstart, gc_target_ystart, 0, target_ngc) * tuple_size;
    num_recv = num_send = (int) (gc_yget * ngc[2] * tuple_size);

    if(gcflags & PFFTI_GC_ADJOINT)
      MPI_Accumulate(data + offset_recv, num_recv, PFFT_MPI_REAL_TYPE,
          target_rank, offset_send, num_send, PFFT_MPI_REAL_TYPE, MPI_SUM, win);
    else
      MPI_Get(data + offset_recv, num_recv, PFFT_MPI_REAL_TYPE,
          target_rank, offset_send, num_send, PFFT_MPI_REAL_TYPE, win);
  }
}
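/* Usage sketch (illustrative only; the ghost cell plan is assumed to be
 * created elsewhere through the PFFT gcplan planner):
 *
 *   PX(gcplan) ths = ...;       // plan with gc_below/gc_above layer widths
 *
 *   PX(exchange_gc_RMA)(ths);   // fill own ghost layers with copies of
 *                               // neighbor data via MPI_Get
 *   // ... update local block, possibly writing into the ghost layers ...
 *   PX(reduce_gc_RMA)(ths);     // adjoint step: add the ghost layers back
 *                               // onto the owning processes via
 *                               // MPI_Accumulate with MPI_SUM
 */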