From 034ea5b33bcb51b4e33725f45dd2f7371c9e9834 Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr> Date: Tue, 21 Oct 2014 13:56:12 +0200 Subject: [PATCH] Dynamic multi-gpu communication size on remeshing for to left comm. --- .../cl_src/kernels/comm_remeshing_noVec.cl | 27 ++-- HySoP/hysop/gpu/gpu_operator.py | 3 - .../hysop/gpu/multi_gpu_particle_advection.py | 122 +++++++++++------- HySoP/hysop/operator/advection.py | 2 - 4 files changed, 91 insertions(+), 63 deletions(-) diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl index b9213ac65..5c1dad886 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl @@ -31,6 +31,7 @@ __kernel void buff_remesh_l(__global const float* ppos, __global const float* pscal, __global float* buffer_l, + int used_width, __constant struct AdvectionMeshInfo* mesh ) { @@ -50,7 +51,7 @@ float y; /* Normalized distance to nearest left grid point */ float* loc_ptr; // Initialize buffers - for(i=0; i<BUFF_WIDTH; i++) + for(i=0; i<used_width; i++) l_buff_loc[i] = 0.0; /* Synchronize work-group */ @@ -71,28 +72,28 @@ float y; /* Normalized distance to nearest left grid point */ index = ind - REMESH_SHIFT; w = REMESH(alpha)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(beta)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(gamma)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(delta)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); @@ -100,14 +101,14 @@ float y; /* Normalized distance to nearest left grid point */ #if REMESH_SHIFT > 1 index = index + 1; w = REMESH(eta)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(zeta)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); @@ -116,14 +117,14 @@ float y; /* Normalized distance to nearest left grid point */ #if REMESH_SHIFT > 2 index = index + 1; w = REMESH(theta)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(iota)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); @@ -132,14 +133,14 @@ float y; /* Normalized distance to nearest left grid point */ #if REMESH_SHIFT > 3 index = index + 1; w = REMESH(kappa)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(mu)(y); - if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + if(index<START_INDEX){ loc_ptr = l_buff_loc+index-(START_INDEX-1-used_width+1); w = w * s; (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); @@ -150,8 +151,8 @@ float y; /* Normalized distance to nearest left grid point */ barrier(CLK_LOCAL_MEM_FENCE); // Store buffers - for(i=0; i<BUFF_WIDTH; i++) - buffer_l[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = l_buff_loc[i]; + for(i=0; i<used_width; i++) + buffer_l[i + gidY*used_width + gidZ*used_width*NB_II] = l_buff_loc[i]; } __kernel void buff_remesh_r(__global const float* ppos, diff --git a/HySoP/hysop/gpu/gpu_operator.py b/HySoP/hysop/gpu/gpu_operator.py index a81e49ab2..cb419ee90 100644 --- a/HySoP/hysop/gpu/gpu_operator.py +++ b/HySoP/hysop/gpu/gpu_operator.py @@ -76,6 +76,3 @@ class GPUOperator(object): assert len(values) <= len(suffix), str(values) + str(suffix) for v, s in zip(values, suffix): self._size_constants += " -D " + prefix + s + "=" + str(v) - - def setup_gpu(self): - pass diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py index ca67ce656..b5b6e2215 100644 --- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py +++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py @@ -159,7 +159,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._v_l_buff_loc_flat = self._v_l_buff_loc.ravel(order='F') self._v_r_buff_loc_flat = self._v_r_buff_loc.ravel(order='F') - self._v_buff_size = self._v_buff_width * \ self.v_resol_dir[1] * self.v_resol_dir[2] self._v_pitches_host = (int(_v_l_buff[:, 0, 0].nbytes), @@ -187,7 +186,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._s_froml_buff_max = npw.zeros((self._s_buff_width, self.resol_dir[1], self.resol_dir[2])) - self._s_fromr_buff = npw.zeros_like(self._s_froml_buff_max) + self._s_fromr_buff_max = npw.zeros_like(self._s_froml_buff_max) self._cl_s_r_buff = self.cl_env.global_allocation(_s_l_buff) self._cl_s_l_buff = self.cl_env.global_allocation(_s_r_buff) cl.enqueue_copy(self.cl_env.queue, @@ -224,51 +223,49 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): order=ORDER) evt.wait() self._s_froml_buff_flat = self._s_froml_buff_max.ravel(order='F') - self._s_fromr_buff_flat = self._s_fromr_buff.ravel(order='F') - self._origin_locr = (int((self.resol_dir[0] - self._s_buff_width) - * PARMES_REAL(0).nbytes), 0, 0) + self._s_fromr_buff_flat = self._s_fromr_buff_max.ravel(order='F') - self._s_block_size = 1024 * 1024 # 1MByte + #self._s_block_size = 1024 * 1024 # 1MByte self._v_block_size = 1024 * 1024 # 1MByte - while self._s_l_buff.nbytes % self._s_block_size != 0: - self._s_block_size /= 2 + #while self._s_l_buff.nbytes % self._s_block_size != 0: + # self._s_block_size /= 2 while self._v_l_buff.nbytes % self._v_block_size != 0: self._v_block_size /= 2 w = "WARNING: block size for pipelined GPU-to-GPU transfer is small, " - if self._s_block_size < 256 * 1024: - self._s_block_size = self._s_l_buff.nbytes / 4 - print w + "use blocks of {0} MB (4 blocks scalar)".format( - self._s_block_size / (1024. * 1024.)) + #if self._s_block_size < 256 * 1024: + # self._s_block_size = self._s_l_buff.nbytes / 4 + # print w + "use blocks of {0} MB (4 blocks scalar)".format( + # self._s_block_size / (1024. * 1024.)) if self._v_block_size < 256 * 1024: self._v_block_size = self._v_l_buff.nbytes / 4 print w + "use blocks of {0} MB (4 blocks velocity)".format( self._v_block_size / (1024. * 1024.)) - self._s_n_blocks = self._s_l_buff.nbytes / self._s_block_size + #self._s_n_blocks = self._s_l_buff.nbytes / self._s_block_size self._v_n_blocks = self._v_l_buff.nbytes / self._v_block_size - self._s_elem_block = np.prod(self._s_l_buff.shape) / self._s_n_blocks + #self._s_elem_block = np.prod(self._s_l_buff.shape) / self._s_n_blocks self._v_elem_block = np.prod(self._v_l_buff.shape) / self._v_n_blocks - print "MULTI-GPU Communications of size {0}MB, by {1} blocs of {2}MB ({3} width)".format( - self._s_l_buff.nbytes / (1024. * 1024.), - self._s_n_blocks, - self._s_block_size / (1024. * 1024.), - str((self._s_buff_width, self.resol_dir[1], self.resol_dir[2]))) - self._evt_get_l = [None, ] * self._s_n_blocks + #print "MULTI-GPU Communications of size {0}MB, by {1} blocs of {2}MB ({3} width)".format( + # self._s_l_buff.nbytes / (1024. * 1024.), + # self._s_n_blocks, + # self._s_block_size / (1024. * 1024.), + # str((self._s_buff_width, self.resol_dir[1], self.resol_dir[2]))) + #self._evt_get_l = [None, ] * self._s_n_blocks #self._evt_get_r = [None, ] * self._s_n_blocks - self._l_send = [None, ] * self._s_n_blocks + #self._l_send = [None, ] * self._s_n_blocks #self._r_send = [None, ] * self._s_n_blocks #self._l_recv = [None, ] * self._s_n_blocks - self._r_recv = [None, ] * self._s_n_blocks + #self._r_recv = [None, ] * self._s_n_blocks self._l_recv_v = [None, ] * self._v_n_blocks self._r_recv_v = [None, ] * self._v_n_blocks self._send_to_l_v = [None, ] * self._v_n_blocks self._send_to_r_v = [None, ] * self._v_n_blocks self._evt_l_v = [None, ] * self._v_n_blocks self._evt_r_v = [None, ] * self._v_n_blocks - self._s_buff_block_slice = [None, ] * self._s_n_blocks + #self._s_buff_block_slice = [None, ] * self._s_n_blocks self._v_buff_block_slice = [None, ] * self._v_n_blocks - for b in xrange(self._s_n_blocks): - self._s_buff_block_slice[b] = slice( - b * self._s_elem_block, (b + 1) * self._s_elem_block) + #for b in xrange(self._s_n_blocks): + # self._s_buff_block_slice[b] = slice( + # b * self._s_elem_block, (b + 1) * self._s_elem_block) for b in xrange(self._v_n_blocks): self._v_buff_block_slice[b] = slice( b * self._v_elem_block, (b + 1) * self._v_elem_block) @@ -325,12 +322,13 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._build_exec_list() def setup_gpu(self): + pass # self._s_locl_buff = \ # self.fields_on_grid[0].host_data_pinned[0].reshape( # self.resol_dir, order=ORDER)[:self._s_buff_width, :, :] - self._s_locr_buff = \ - self.fields_on_grid[0].host_data_pinned[0].reshape( - self.resol_dir, order=ORDER)[-self._s_buff_width:, :, :] + # self._s_locr_buff = \ + # self.fields_on_grid[0].host_data_pinned[0].reshape( + # self.resol_dir, order=ORDER)[-self._s_buff_width:, :, :] def _collect_kernels_cl_src_2k(self): pass @@ -458,12 +456,28 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._s_locl_buff = \ self.fields_on_grid[0].host_data_pinned[0].reshape( self.resol_dir, order=ORDER)[:self._s_buff_width_from_l, :, :] + s = self._s_buff_width_from_r * \ + self.resol_dir[1] * self.resol_dir[2] + self._s_fromr_buff = self._s_fromr_buff_flat[:s].reshape( + (self._s_buff_width_from_r, + self.resol_dir[1], + self.resol_dir[2]), order=ORDER) + self._s_locr_buff = \ + self.fields_on_grid[0].host_data_pinned[0].reshape( + self.resol_dir, order=ORDER)[-self._s_buff_width_from_r:, :, :] self._s_buffer_region_on_l = ( int(SIZEOF_PARMES_REAL * self._s_buff_width_from_l), int(self.resol_dir[1]), int(self.resol_dir[2])) self._origin_locl = (0, 0, 0) + self._s_buffer_region_on_r = ( + int(SIZEOF_PARMES_REAL * self._s_buff_width_from_r), + int(self.resol_dir[1]), + int(self.resol_dir[2])) + self._origin_locr = ( + int((self.resol_dir[0] - self._s_buff_width_from_r) + * PARMES_REAL(0).nbytes), 0, 0) # Recompute blocks number and block size self._s_block_size_to_r, self._s_n_blocks_to_r, \ @@ -471,11 +485,22 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._compute_block_number_and_size( SIZEOF_PARMES_REAL * self._s_buff_width_loc_p * self.resol_dir[1] * self.resol_dir[2]) + self._s_block_size_to_l, self._s_n_blocks_to_l, \ + self._s_elem_block_to_l, self._s_buff_block_slice_to_l = \ + self._compute_block_number_and_size( + SIZEOF_PARMES_REAL * self._s_buff_width_loc_m * + self.resol_dir[1] * self.resol_dir[2]) + self._s_block_size_from_r, self._s_n_blocks_from_r, \ + self._s_elem_block_from_r, self._s_buff_block_slice_from_r = \ + self._compute_block_number_and_size(self._s_fromr_buff.nbytes) self._s_block_size_from_l, self._s_n_blocks_from_l, \ self._s_elem_block_from_l, self._s_buff_block_slice_from_l = \ self._compute_block_number_and_size(self._s_froml_buff.nbytes) + self._r_recv = [None, ] * self._s_n_blocks_from_r self._l_recv = [None, ] * self._s_n_blocks_from_l + self._evt_get_l = [None, ] * self._s_n_blocks_to_l self._evt_get_r = [None, ] * self._s_n_blocks_to_r + self._l_send = [None, ] * self._s_n_blocks_to_l self._r_send = [None, ] * self._s_n_blocks_to_r def _compute_block_number_and_size(self, buff_size): @@ -650,6 +675,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self.part_position[0], self.fields_on_part[self.fields_on_grid[0]][0], self._cl_s_l_buff, + PARMES_INTEGER(self._s_buff_width_loc_m), self._cl_mesh_info, wait_for=wait_list) @@ -756,33 +782,35 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): [self._s_froml_buff_flat[self._s_buff_block_slice_from_l[b]], self._s_elem_block_from_l, PARMES_MPI_REAL], source=self._L_rk, tag=888 + self._L_rk + 19 * b) - for b in xrange(self._s_n_blocks): + for b in xrange(self._s_n_blocks_from_r): self._r_recv[b] = self._comm.Irecv( - [self._s_fromr_buff_flat[self._s_buff_block_slice[b]], - self._s_elem_block, PARMES_MPI_REAL], + [self._s_fromr_buff_flat[self._s_buff_block_slice_from_r[b]], + self._s_elem_block_from_r, PARMES_MPI_REAL], source=self._R_rk, tag=333 + self._R_rk + 17 * b) # Fill and get the left buffer evt_comm_l = self._num_comm_l(wait_evts, dt) - for b in xrange(self._s_n_blocks): + s = int(self._s_buff_width_loc_m * + self.resol_dir[1] * self.resol_dir[2]) + for b in xrange(self._s_n_blocks_to_l): self._evt_get_l[b] = cl.enqueue_copy( self._queue_comm_m, self._s_l_buff, self._cl_s_l_buff, - host_origin=(b * self._s_block_size, 0, 0), - host_pitches=(self._s_l_buff.nbytes, 0), - buffer_origin=(b * self._s_block_size, 0, 0), - buffer_pitches=(self._s_l_buff.nbytes, 0), - region=(self._s_block_size, 1, 1), + host_origin=(b * self._s_block_size_to_l, 0, 0), + host_pitches=(s * SIZEOF_PARMES_REAL, 0), + buffer_origin=(b * self._s_block_size_to_l, 0, 0), + buffer_pitches=(s * SIZEOF_PARMES_REAL, 0), + region=(self._s_block_size_to_l, 1, 1), is_blocking=False, wait_for=[evt_comm_l]) # Send the left buffer ctime = MPI.Wtime() - for b in xrange(self._s_n_blocks): + for b in xrange(self._s_n_blocks_to_l): self._evt_get_l[b].wait() self._l_send[b] = self._comm.Issend( - [self._s_l_buff[self._s_buff_block_slice[b]], - self._s_elem_block, PARMES_MPI_REAL], + [self._s_l_buff[self._s_buff_block_slice_to_l[b]], + self._s_elem_block_to_l, PARMES_MPI_REAL], dest=self._L_rk, tag=333 + self._comm_rank + 17 * b) ctime_send_l = MPI.Wtime() - ctime @@ -833,20 +861,22 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): buffer_origin=self._origin_locr, buffer_pitches=self._pitches_dev, host_pitches=self._pitches_dev, - region=self._s_buffer_region, + region=self._s_buffer_region_on_r, is_blocking=False, wait_for=self.evt_num_remesh) ctime = MPI.Wtime() # Wait MPI transfer of data from left, add them to local data and send back to device - for b in xrange(self._s_n_blocks): + for b in xrange(self._s_n_blocks_to_r): self._r_send[b].Wait() + for b in xrange(self._s_n_blocks_from_l): self._l_recv[b].Wait() evt_get_locl.wait() ctime_wait_l = MPI.Wtime() - ctime calctime = MPI.Wtime() self._s_locl_buff += self._s_froml_buff + print self._comm_rank, self._s_locl_buff.shape, self._s_froml_buff.shape self.profiler['comm_calc_remesh'] += MPI.Wtime() - calctime evt_set_locl = cl.enqueue_copy( self.cl_env.queue, @@ -861,13 +891,15 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): # Wait MPI transfer of data from right, add them to local data and send back to device ctime = MPI.Wtime() - for b in xrange(self._s_n_blocks): + for b in xrange(self._s_n_blocks_to_l): self._l_send[b].Wait() + for b in xrange(self._s_n_blocks_from_r): self._r_recv[b].Wait() evt_get_locr.wait() ctime_wait_r = MPI.Wtime() - ctime calctime = MPI.Wtime() self._s_locr_buff += self._s_fromr_buff + print self._comm_rank, self._s_locr_buff.shape, self._s_fromr_buff.shape self.profiler['comm_calc_remesh'] += MPI.Wtime() - calctime evt_set_locr = cl.enqueue_copy( self.cl_env.queue, @@ -877,7 +909,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): buffer_origin=self._origin_locr, buffer_pitches=self._pitches_dev, host_pitches=self._pitches_dev, - region=self._s_buffer_region, + region=self._s_buffer_region_on_r, is_blocking=False) if CL_PROFILE: diff --git a/HySoP/hysop/operator/advection.py b/HySoP/hysop/operator/advection.py index 4942c6061..9c443c29d 100644 --- a/HySoP/hysop/operator/advection.py +++ b/HySoP/hysop/operator/advection.py @@ -492,8 +492,6 @@ class Advection(Computational): if gpu_rank == 0: s += " Total : {0:9d}".format(total_lmem) + "Bytes" print s - for d in xrange(self.domain.dimension): - dOp = self.advec_dir[d].discreteOperator.setup_gpu() @debug @opapply -- GitLab