/*
 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <mpi.h>

#include <transpose_mpi.h>

#include "sched.h"
#include "TOMS_transpose.h"

/**************************************************************************/

static int transpose_mpi_get_block_size(int n, int n_pes)
{
    return (n + n_pes - 1) / n_pes;
}

void transpose_mpi_get_local_size(int n, int my_pe, int n_pes,
                                  int *local_n, int *local_start)
{
    int block_size;

    block_size = transpose_mpi_get_block_size(n, n_pes);
    n_pes = (n + block_size - 1) / block_size;

    if (my_pe >= n_pes) {
        *local_n = 0;
        *local_start = 0;
    }
    else {
        *local_start = my_pe * block_size;
        if (my_pe == n_pes - 1)
            *local_n = n - *local_start;
        else
            *local_n = block_size;
    }
}

#define MAX2(a,b) ((a) > (b) ? (a) : (b))

int transpose_mpi_get_local_storage_size(int nx, int ny,
                                         int my_pe, int n_pes)
{
    int local_nx, local_ny, local_x_start, local_y_start;

    transpose_mpi_get_local_size(nx, my_pe, n_pes, &local_nx, &local_x_start);
    transpose_mpi_get_local_size(ny, my_pe, n_pes, &local_ny, &local_y_start);

    return MAX2(1, MAX2(local_nx * ny, nx * local_ny));
}
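/* The data are distributed over processes in contiguous blocks of rows:
   block_size = ceil(n / n_pes), and process my_pe owns rows
   [my_pe * block_size, my_pe * block_size + local_n).  For example,
   n = 10 and n_pes = 4 gives block_size = 3, so processes 0-2 own three
   rows each, process 3 owns the remaining row, and any process beyond
   that would own none.  The local storage size above is
   MAX2(local_nx * ny, nx * local_ny) elements so that the same array can
   hold both the pre-transpose (local_nx x ny) and post-transpose
   (local_ny x nx) layouts; the MAX2 with 1 guarantees a nonzero
   allocation even on processes that own no rows (see the comment at the
   top of transpose_mpi, below). */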
static int gcd(int a, int b)
{
    int r;
    do {
        r = a % b;
        a = b;
        b = r;
    } while (r != 0);

    return a;
}

/**************************************************************************/

transpose_mpi_plan transpose_mpi_create_plan(int nx, int ny,
                                             MPI_Comm transpose_comm)
{
    transpose_mpi_plan p;
    int my_pe, n_pes, pe;
    int x_block_size, local_nx, local_x_start;
    int y_block_size, local_ny, local_y_start;
    transpose_mpi_exchange *exchange = 0;
    int step, send_block_size = 0, recv_block_size = 0, num_steps = 0;
    int **sched, sched_npes, sched_sortpe, sched_sort_ascending = 0;
    int *perm_block_dest = NULL;
    int num_perm_blocks = 0, perm_block_size = 0, perm_block;
    char *move = NULL;
    int move_size = 0;
    int *send_block_sizes = 0, *send_block_offsets = 0;
    int *recv_block_sizes = 0, *recv_block_offsets = 0;
    MPI_Comm comm;

    /* create a new "clone" communicator so that transpose communications
       do not interfere with caller communications. */
    MPI_Comm_dup(transpose_comm, &comm);

    MPI_Comm_rank(comm, &my_pe);
    MPI_Comm_size(comm, &n_pes);

    /* work space for in-place transpose routine: */
    move_size = (nx + ny) / 2;
    move = (char *) fftw_malloc(sizeof(char) * move_size);

    x_block_size = transpose_mpi_get_block_size(nx, n_pes);
    transpose_mpi_get_local_size(nx, my_pe, n_pes, &local_nx, &local_x_start);
    y_block_size = transpose_mpi_get_block_size(ny, n_pes);
    transpose_mpi_get_local_size(ny, my_pe, n_pes, &local_ny, &local_y_start);

    /* allocate pre-computed post-transpose permutation: */
    perm_block_size = gcd(nx, x_block_size);
    num_perm_blocks = (nx / perm_block_size) * local_ny;
    perm_block_dest = (int *) fftw_malloc(sizeof(int) * num_perm_blocks);
    for (perm_block = 0; perm_block < num_perm_blocks; ++perm_block)
        perm_block_dest[perm_block] = num_perm_blocks;

    /* allocate block sizes/offsets arrays for out-of-place transpose: */
    send_block_sizes = (int *) fftw_malloc(n_pes * sizeof(int));
    send_block_offsets = (int *) fftw_malloc(n_pes * sizeof(int));
    recv_block_sizes = (int *) fftw_malloc(n_pes * sizeof(int));
    recv_block_offsets = (int *) fftw_malloc(n_pes * sizeof(int));
    for (step = 0; step < n_pes; ++step)
        send_block_sizes[step] = send_block_offsets[step] =
            recv_block_sizes[step] = recv_block_offsets[step] = 0;

    if (local_nx > 0 || local_ny > 0) {

        sched_npes = n_pes;
        sched_sortpe = -1;
        for (pe = 0; pe < n_pes; ++pe) {
            int pe_nx, pe_x_start, pe_ny, pe_y_start;

            transpose_mpi_get_local_size(nx, pe, n_pes,
                                         &pe_nx, &pe_x_start);
            transpose_mpi_get_local_size(ny, pe, n_pes,
                                         &pe_ny, &pe_y_start);

            if (pe_nx == 0 && pe_ny == 0) {
                sched_npes = pe;
                break;
            }
            else if (pe_nx * y_block_size != pe_ny * x_block_size
                     && pe_nx != 0 && pe_ny != 0) {
                if (sched_sortpe != -1)
                    fftw_mpi_die("BUG: More than one PE needs sorting!\n");
                sched_sortpe = pe;
                sched_sort_ascending =
                    pe_nx * y_block_size > pe_ny * x_block_size;
            }
        }

        sched = make_comm_schedule(sched_npes);
        if (!sched) {
            MPI_Comm_free(&comm);
            return 0;
        }

        if (sched_sortpe != -1) {
            sort_comm_schedule(sched, sched_npes, sched_sortpe);
            if (!sched_sort_ascending)
                invert_comm_schedule(sched, sched_npes);
        }

        send_block_size = local_nx * y_block_size;
        recv_block_size = local_ny * x_block_size;

        num_steps = sched_npes;
        exchange = (transpose_mpi_exchange *)
            fftw_malloc(num_steps * sizeof(transpose_mpi_exchange));
        if (!exchange) {
            free_comm_schedule(sched, sched_npes);
            MPI_Comm_free(&comm);
            return 0;
        }

        for (step = 0; step < sched_npes; ++step) {
            int dest_pe;
            int dest_nx, dest_x_start;
            int dest_ny, dest_y_start;
            int num_perm_blocks_received, num_perm_rows_received;

            exchange[step].dest_pe = dest_pe =
                exchange[step].block_num = sched[my_pe][step];

            if (exchange[step].block_num == -1)
                fftw_mpi_die("BUG: schedule ended too early.\n");

            transpose_mpi_get_local_size(nx, dest_pe, n_pes,
                                         &dest_nx, &dest_x_start);
            transpose_mpi_get_local_size(ny, dest_pe, n_pes,
                                         &dest_ny, &dest_y_start);

            exchange[step].send_size = local_nx * dest_ny;
            exchange[step].recv_size = dest_nx * local_ny;

            send_block_sizes[dest_pe] = exchange[step].send_size;
            send_block_offsets[dest_pe] = dest_pe * send_block_size;
            recv_block_sizes[dest_pe] = exchange[step].recv_size;
            recv_block_offsets[dest_pe] = dest_pe * recv_block_size;

            /* Precompute the post-transpose permutation (ugh): */
            if (exchange[step].recv_size > 0) {
                num_perm_blocks_received =
                    exchange[step].recv_size / perm_block_size;
                num_perm_rows_received =
                    num_perm_blocks_received / local_ny;

                for (perm_block = 0; perm_block < num_perm_blocks_received;
                     ++perm_block) {
                    int old_block, new_block;

                    old_block = perm_block +
                        exchange[step].block_num *
                        (recv_block_size / perm_block_size);
                    new_block = perm_block % num_perm_rows_received
                        + dest_x_start / perm_block_size
                        + (perm_block / num_perm_rows_received)
                        * (nx / perm_block_size);

                    if (old_block >= num_perm_blocks ||
                        new_block >= num_perm_blocks)
                        fftw_mpi_die("bad block index in permutation!");

                    perm_block_dest[old_block] = new_block;
                }
            }
        }

        free_comm_schedule(sched, sched_npes);

    }  /* if (local_nx > 0 || local_ny > 0) */

    p = (transpose_mpi_plan) fftw_malloc(sizeof(transpose_mpi_plan_struct));
    if (!p) {
        fftw_free(exchange);
        MPI_Comm_free(&comm);
        return 0;
    }

    p->comm = comm;
    p->nx = nx;
    p->ny = ny;
    p->local_nx = local_nx;
    p->local_ny = local_ny;
    p->my_pe = my_pe;
    p->n_pes = n_pes;
    p->exchange = exchange;
    p->send_block_size = send_block_size;
    p->recv_block_size = recv_block_size;
    p->num_steps = num_steps;
    p->perm_block_dest = perm_block_dest;
    p->num_perm_blocks = num_perm_blocks;
    p->perm_block_size = perm_block_size;
    p->move = move;
    p->move_size = move_size;
    p->send_block_sizes = send_block_sizes;
    p->send_block_offsets = send_block_offsets;
    p->recv_block_sizes = recv_block_sizes;
    p->recv_block_offsets = recv_block_offsets;

    p->all_blocks_equal = send_block_size * n_pes * n_pes == nx * ny &&
                          recv_block_size * n_pes * n_pes == nx * ny;
    if (p->all_blocks_equal)
        for (step = 0; step < n_pes; ++step)
            if (send_block_sizes[step] != send_block_size ||
                recv_block_sizes[step] != recv_block_size) {
                p->all_blocks_equal = 0;
                break;
            }

    if (nx % n_pes == 0 && ny % n_pes == 0 && !p->all_blocks_equal)
        fftw_mpi_die("n_pes divided dimensions but blocks are unequal!");

    /* Set the type constant for passing to the MPI routines; here, we
       assume that TRANSPOSE_EL_TYPE is one of the floating-point types. */
    if (sizeof(TRANSPOSE_EL_TYPE) == sizeof(double))
        p->el_type = MPI_DOUBLE;
    else if (sizeof(TRANSPOSE_EL_TYPE) == sizeof(float))
        p->el_type = MPI_FLOAT;
    else
        fftw_mpi_die("Unknown TRANSPOSE_EL_TYPE!\n");

    return p;
}
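/* A sketch of how these routines are typically strung together (not
   taken from the surrounding code; error checking is omitted, and
   el_size is the number of TRANSPOSE_EL_TYPE values per logical
   element):

       transpose_mpi_plan p = transpose_mpi_create_plan(nx, ny, comm);
       TRANSPOSE_EL_TYPE *data = (TRANSPOSE_EL_TYPE *)
            fftw_malloc(sizeof(TRANSPOSE_EL_TYPE) * el_size *
                        transpose_mpi_get_local_storage_size(nx, ny,
                                                             my_pe, n_pes));

       ... fill data with this process's local_nx rows of ny elements ...

       transpose_mpi(p, el_size, data, NULL);      (work == NULL: in-place)

       ... data now holds this process's local_ny rows of the transposed
           ny x nx array ...

       transpose_mpi_destroy_plan(p);
*/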
/**************************************************************************/

void transpose_mpi_destroy_plan(transpose_mpi_plan p)
{
    if (p) {
        if (p->exchange)
            fftw_free(p->exchange);
        if (p->perm_block_dest)
            fftw_free(p->perm_block_dest);
        if (p->move)
            fftw_free(p->move);
        if (p->send_block_sizes)
            fftw_free(p->send_block_sizes);
        if (p->send_block_offsets)
            fftw_free(p->send_block_offsets);
        if (p->recv_block_sizes)
            fftw_free(p->recv_block_sizes);
        if (p->recv_block_offsets)
            fftw_free(p->recv_block_offsets);
        MPI_Comm_free(&p->comm);
        fftw_free(p);
    }
}

/**************************************************************************/

static void exchange_elements(TRANSPOSE_EL_TYPE *buf1,
                              TRANSPOSE_EL_TYPE *buf2, int n)
{
    int i;
    TRANSPOSE_EL_TYPE t0, t1, t2, t3;

    for (i = 0; i < (n & 3); ++i) {
        t0 = buf1[i];
        buf1[i] = buf2[i];
        buf2[i] = t0;
    }
    for (; i < n; i += 4) {
        t0 = buf1[i];
        t1 = buf1[i+1];
        t2 = buf1[i+2];
        t3 = buf1[i+3];
        buf1[i] = buf2[i];
        buf1[i+1] = buf2[i+1];
        buf1[i+2] = buf2[i+2];
        buf1[i+3] = buf2[i+3];
        buf2[i] = t0;
        buf2[i+1] = t1;
        buf2[i+2] = t2;
        buf2[i+3] = t3;
    }
}
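/* do_permutation (below) applies the block permutation recorded in
   perm_block_dest in place by following the permutation's cycles.  To
   mark blocks that have already been moved without using extra storage,
   a visited entry d is overwritten with -1 - d (always negative), and a
   final pass flips every entry back to its original value.  For example,
   for the 3-cycle 0 -> 2 -> 1 -> 0 the code swaps the blocks at
   positions 0 and 2, then at positions 0 and 1, leaving every block at
   its destination. */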
static void do_permutation(TRANSPOSE_EL_TYPE *data,
                           int *perm_block_dest,
                           int num_perm_blocks,
                           int perm_block_size)
{
    int start_block;

    /* Perform the permutation in the perm_block_dest array, following
       the cycles around and *changing* the perm_block_dest array to
       reflect the permutations that have already been performed.  At
       the end of this routine, we change the perm_block_dest array
       back to its original state. (ugh) */

    for (start_block = 0; start_block < num_perm_blocks; ++start_block) {
        int cur_block = start_block;
        int new_block = perm_block_dest[start_block];

        while (new_block > 0 && new_block < num_perm_blocks
               && new_block != start_block) {
            exchange_elements(data + perm_block_size * start_block,
                              data + perm_block_size * new_block,
                              perm_block_size);
            perm_block_dest[cur_block] = -1 - new_block;
            cur_block = new_block;
            new_block = perm_block_dest[cur_block];
        }

        if (new_block == start_block)
            perm_block_dest[cur_block] = -1 - new_block;
    }

    /* reset the permutation array (ugh): */
    for (start_block = 0; start_block < num_perm_blocks; ++start_block)
        perm_block_dest[start_block] = -1 - perm_block_dest[start_block];
}

TRANSPOSE_EL_TYPE *transpose_allocate_send_buf(transpose_mpi_plan p,
                                               int el_size)
{
    TRANSPOSE_EL_TYPE *send_buf = 0;

    /* allocate the send buffer: */
    if (p->send_block_size > 0) {
        send_buf = (TRANSPOSE_EL_TYPE *)
            fftw_malloc(p->send_block_size * el_size
                        * sizeof(TRANSPOSE_EL_TYPE));
        if (!send_buf)
            fftw_mpi_die("Out of memory!\n");
    }

    return send_buf;
}

void transpose_in_place_local(transpose_mpi_plan p,
                              int el_size, TRANSPOSE_EL_TYPE *local_data,
                              transpose_in_place_which which)
{
    switch (which) {
    case BEFORE_TRANSPOSE:
        if (el_size == 1)
            TOMS_transpose_2d(local_data, p->local_nx, p->ny,
                              p->move, p->move_size);
        else
            TOMS_transpose_2d_arbitrary(local_data, p->local_nx, p->ny,
                                        el_size, p->move, p->move_size);
        break;
    case AFTER_TRANSPOSE:
        do_permutation(local_data, p->perm_block_dest,
                       p->num_perm_blocks, p->perm_block_size * el_size);
        break;
    }
}

/**************************************************************************/

static void local_transpose_copy(TRANSPOSE_EL_TYPE *src,
                                 TRANSPOSE_EL_TYPE *dest,
                                 int el_size, int nx, int ny)
{
    int x, y;

    if (el_size == 1)
        for (x = 0; x < nx; ++x)
            for (y = 0; y < ny; ++y)
                dest[y * nx + x] = src[x * ny + y];
    else if (el_size == 2)
        for (x = 0; x < nx; ++x)
            for (y = 0; y < ny; ++y) {
                dest[y * (2 * nx) + 2*x] = src[x * (2 * ny) + 2*y];
                dest[y * (2 * nx) + 2*x + 1] = src[x * (2 * ny) + 2*y + 1];
            }
    else
        for (x = 0; x < nx; ++x)
            for (y = 0; y < ny; ++y)
                memcpy(&dest[y * (el_size * nx) + (el_size * x)],
                       &src[x * (el_size * ny) + (el_size * y)],
                       el_size * sizeof(TRANSPOSE_EL_TYPE));
}

/* Out-of-place version of transpose_mpi (or rather, in place using
   a scratch array): */

static void transpose_mpi_out_of_place(transpose_mpi_plan p, int el_size,
                                       TRANSPOSE_EL_TYPE *local_data,
                                       TRANSPOSE_EL_TYPE *work)
{
    local_transpose_copy(local_data, work, el_size, p->local_nx, p->ny);

    if (p->all_blocks_equal)
        MPI_Alltoall(work, p->send_block_size * el_size, p->el_type,
                     local_data, p->recv_block_size * el_size, p->el_type,
                     p->comm);
    else {
        int i, n_pes = p->n_pes;

        for (i = 0; i < n_pes; ++i) {
            p->send_block_sizes[i] *= el_size;
            p->recv_block_sizes[i] *= el_size;
            p->send_block_offsets[i] *= el_size;
            p->recv_block_offsets[i] *= el_size;
        }
        MPI_Alltoallv(work, p->send_block_sizes, p->send_block_offsets,
                      p->el_type,
                      local_data, p->recv_block_sizes, p->recv_block_offsets,
                      p->el_type,
                      p->comm);
        for (i = 0; i < n_pes; ++i) {
            p->send_block_sizes[i] /= el_size;
            p->recv_block_sizes[i] /= el_size;
            p->send_block_offsets[i] /= el_size;
            p->recv_block_offsets[i] /= el_size;
        }
    }

    do_permutation(local_data, p->perm_block_dest, p->num_perm_blocks,
                   p->perm_block_size * el_size);
}
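/* The out-of-place transpose above thus proceeds in three stages:
   (1) locally transpose the local_nx x ny rows into the scratch array,
   which makes the data destined for each process contiguous, (2) swap
   blocks between processes with a single MPI_Alltoall (when every block
   has the same size) or MPI_Alltoallv (general case), and (3) apply the
   precomputed block permutation so that each of the local_ny output
   rows ends up with its nx elements in order.  The counts and
   displacements stored in the plan are in logical elements, independent
   of el_size; they are scaled up by el_size for the MPI call (whose
   datatype is a single TRANSPOSE_EL_TYPE) and then scaled back,
   presumably so the same plan can be reused with a different el_size. */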
/**************************************************************************/

void transpose_mpi(transpose_mpi_plan p, int el_size,
                   TRANSPOSE_EL_TYPE *local_data,
                   TRANSPOSE_EL_TYPE *work)
{
    /* if local_data and work are both NULL, we have no way of knowing
       whether we should use the in-place or the out-of-place transpose
       routine; if we guess wrong, MPI_Alltoall will block.  We prevent
       this by making sure that transpose_mpi_get_local_storage_size
       returns at least 1. */
    if (!local_data && !work)
        fftw_mpi_die("local_data and work are both NULL!");

    if (work)
        transpose_mpi_out_of_place(p, el_size, local_data, work);
    else if (p->local_nx > 0 || p->local_ny > 0) {
        int step;
        TRANSPOSE_EL_TYPE *send_buf = transpose_allocate_send_buf(p, el_size);

        transpose_in_place_local(p, el_size, local_data, BEFORE_TRANSPOSE);

        for (step = 0; step < p->num_steps; ++step) {
            transpose_finish_exchange_step(p, step - 1);
            transpose_start_exchange_step(p, el_size, local_data, send_buf,
                                          step, TRANSPOSE_SYNC);
        }
        transpose_finish_exchange_step(p, step - 1);

        transpose_in_place_local(p, el_size, local_data, AFTER_TRANSPOSE);

        if (send_buf)
            fftw_free(send_buf);
    }  /* if (local_nx > 0 || local_ny > 0) */
}

/**************************************************************************/

/* non-blocking routines for overlapping communication and computation: */

#define USE_SYNCHRONOUS_ISEND 1
#if USE_SYNCHRONOUS_ISEND
#define ISEND MPI_Issend
#else
#define ISEND MPI_Isend
#endif

void transpose_get_send_block(transpose_mpi_plan p, int step,
                              int *block_y_start, int *block_ny)
{
    if (p->local_nx > 0) {
        *block_y_start =
            p->send_block_size / p->local_nx * p->exchange[step].block_num;
        *block_ny = p->exchange[step].send_size / p->local_nx;
    }
    else {
        *block_y_start = 0;
        *block_ny = 0;
    }
}

void transpose_start_exchange_step(transpose_mpi_plan p,
                                   int el_size,
                                   TRANSPOSE_EL_TYPE *local_data,
                                   TRANSPOSE_EL_TYPE *send_buf,
                                   int step,
                                   transpose_sync_type sync_type)
{
    if (p->local_nx > 0 || p->local_ny > 0) {
        transpose_mpi_exchange *exchange = p->exchange;
        int block = exchange[step].block_num;
        int send_block_size = p->send_block_size;
        int recv_block_size = p->recv_block_size;

        if (exchange[step].dest_pe != p->my_pe) {

            /* first, copy to send buffer: */
            if (exchange[step].send_size > 0)
                memcpy(send_buf,
                       local_data + el_size * send_block_size * block,
                       el_size * exchange[step].send_size
                       * sizeof(TRANSPOSE_EL_TYPE));

#define DO_ISEND \
            if (exchange[step].send_size > 0) { \
                 ISEND(send_buf, \
                       exchange[step].send_size * el_size, \
                       p->el_type, \
                       exchange[step].dest_pe, 0, \
                       p->comm, \
                       &p->request[0]); \
            }

            p->request[0] = MPI_REQUEST_NULL;
            p->request[1] = MPI_REQUEST_NULL;

            if (sync_type == TRANSPOSE_ASYNC) {
                /* Note that we impose an ordering on the sends and
                   receives (lower pe sends first) so that we won't have
                   deadlock if Isend & Irecv are blocking in some MPI
                   implementation: */

                if (p->my_pe < exchange[step].dest_pe)
                    DO_ISEND;

                if (exchange[step].recv_size > 0) {
                    MPI_Irecv(local_data + el_size * recv_block_size * block,
                              exchange[step].recv_size * el_size,
                              p->el_type,
                              exchange[step].dest_pe, MPI_ANY_TAG,
                              p->comm, &p->request[1]);
                }

                if (p->my_pe > exchange[step].dest_pe)
                    DO_ISEND;
            }
            else /* (sync_type == TRANSPOSE_SYNC) */ {
                MPI_Status status;

                MPI_Sendrecv(send_buf,
                             exchange[step].send_size * el_size,
                             p->el_type,
                             exchange[step].dest_pe, 0,
                             local_data + el_size * recv_block_size * block,
                             exchange[step].recv_size * el_size,
                             p->el_type,
                             exchange[step].dest_pe, MPI_ANY_TAG,
                             p->comm, &status);
            }
        }
        else if (exchange[step].recv_size > 0 &&
                 recv_block_size != send_block_size)
            memmove(local_data + el_size * recv_block_size * block,
                    local_data + el_size * send_block_size * block,
                    exchange[step].recv_size * el_size
                    * sizeof(TRANSPOSE_EL_TYPE));
    }
}
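/* A sketch of how the non-blocking routines above can overlap
   communication with computation (this pattern is inferred from
   transpose_mpi, which drives the same routines with TRANSPOSE_SYNC;
   here TRANSPOSE_ASYNC is used instead):

       transpose_in_place_local(p, el_size, data, BEFORE_TRANSPOSE);
       for (step = 0; step < p->num_steps; ++step) {
            transpose_start_exchange_step(p, el_size, data, send_buf,
                                          step, TRANSPOSE_ASYNC);
            ... compute on data not involved in this step; the y rows
                being exchanged can be obtained from
                transpose_get_send_block(p, step, &y_start, &ny) ...
            transpose_finish_exchange_step(p, step);
       }
       transpose_in_place_local(p, el_size, data, AFTER_TRANSPOSE);

   Note that send_buf is reused by every step, so each step must be
   finished before the next one is started. */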
void transpose_finish_exchange_step(transpose_mpi_plan p, int step)
{
    if ((p->local_nx > 0 || p->local_ny > 0) && step >= 0
        && p->exchange[step].dest_pe != p->my_pe) {
        MPI_Status status[2];

        MPI_Waitall(2, p->request, status);
    }
}