From c36236c0b16bbc18b7e9053398b78162f9a9097e Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr>
Date: Tue, 21 Oct 2014 14:56:14 +0200
Subject: [PATCH] Cleanup

---
 .../hysop/gpu/multi_gpu_particle_advection.py | 156 +++++++++---------
 1 file changed, 75 insertions(+), 81 deletions(-)

diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
index b5b6e2215..0862c2ca0 100644
--- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py
+++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
@@ -167,6 +167,26 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             int(self._v_buff_width * PARMES_REAL(0.).nbytes),
             int(self.v_resol_dir[1]), int(self.v_resol_dir[2]))
+        self._v_block_size = 1024 * 1024  # 1MByte
+        while self._v_l_buff.nbytes % self._v_block_size != 0:
+            self._v_block_size /= 2
+        w = "WARNING: block size for pipelined GPU-to-GPU transfer is small, "
+        if self._v_block_size < 256 * 1024:
+            self._v_block_size = self._v_l_buff.nbytes / 4
+            print w + "use blocks of {0} MB (4 blocks velocity)".format(
+                self._v_block_size / (1024. * 1024.))
+        self._v_n_blocks = self._v_l_buff.nbytes / self._v_block_size
+        self._v_elem_block = np.prod(self._v_l_buff.shape) / self._v_n_blocks
+        self._l_recv_v = [None, ] * self._v_n_blocks
+        self._r_recv_v = [None, ] * self._v_n_blocks
+        self._send_to_l_v = [None, ] * self._v_n_blocks
+        self._send_to_r_v = [None, ] * self._v_n_blocks
+        self._evt_l_v = [None, ] * self._v_n_blocks
+        self._evt_r_v = [None, ] * self._v_n_blocks
+        self._v_block_slice = [None, ] * self._v_n_blocks
+        for b in xrange(self._v_n_blocks):
+            self._v_block_slice[b] = slice(
+                b * self._v_elem_block, (b + 1) * self._v_elem_block)
 
         ## Python remeshing formula for the multiscale interpolation
         self._py_ms_formula = self.method[MultiScale]
 
@@ -194,14 +214,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         cl.enqueue_copy(self.cl_env.queue,
                         self._cl_s_l_buff, _s_l_buff).wait()
         self._cl_work_size += 2 * self._s_froml_buff_max.nbytes
-        self._s_buff_size = self._s_buff_width * \
-            self.resol_dir[1] * self.resol_dir[2]
-        self._s_pitches_host = (int(self._s_froml_buff_max[:, 0, 0].nbytes),
-                                int(self._s_froml_buff_max[:, :, 0].nbytes))
-        self._s_buffer_region = (
-            int(self._s_buff_width * PARMES_REAL(0.).nbytes),
-            int(self.resol_dir[1]),
-            int(self.resol_dir[2]))
         self._s_l_buff, evt = cl.enqueue_map_buffer(
             self.cl_env.queue,
             self._cl_s_l_buff,
@@ -224,51 +236,24 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         evt.wait()
         self._s_froml_buff_flat = self._s_froml_buff_max.ravel(order='F')
         self._s_fromr_buff_flat = self._s_fromr_buff_max.ravel(order='F')
-
-        #self._s_block_size = 1024 * 1024  # 1MByte
-        self._v_block_size = 1024 * 1024  # 1MByte
-        #while self._s_l_buff.nbytes % self._s_block_size != 0:
-        #    self._s_block_size /= 2
-        while self._v_l_buff.nbytes % self._v_block_size != 0:
-            self._v_block_size /= 2
-        w = "WARNING: block size for pipelined GPU-to-GPU transfer is small, "
-        #if self._s_block_size < 256 * 1024:
-        #    self._s_block_size = self._s_l_buff.nbytes / 4
-        #    print w + "use blocks of {0} MB (4 blocks scalar)".format(
-        #        self._s_block_size / (1024. * 1024.))
-        if self._v_block_size < 256 * 1024:
-            self._v_block_size = self._v_l_buff.nbytes / 4
-            print w + "use blocks of {0} MB (4 blocks velocity)".format(
-                self._v_block_size / (1024. * 1024.))
-        #self._s_n_blocks = self._s_l_buff.nbytes / self._s_block_size
-        self._v_n_blocks = self._v_l_buff.nbytes / self._v_block_size
-        #self._s_elem_block = np.prod(self._s_l_buff.shape) / self._s_n_blocks
-        self._v_elem_block = np.prod(self._v_l_buff.shape) / self._v_n_blocks
-        #print "MULTI-GPU Communications of size {0}MB, by {1} blocs of {2}MB ({3} width)".format(
-        #    self._s_l_buff.nbytes / (1024. * 1024.),
-        #    self._s_n_blocks,
-        #    self._s_block_size / (1024. * 1024.),
-        #    str((self._s_buff_width, self.resol_dir[1], self.resol_dir[2])))
-        #self._evt_get_l = [None, ] * self._s_n_blocks
-        #self._evt_get_r = [None, ] * self._s_n_blocks
-        #self._l_send = [None, ] * self._s_n_blocks
-        #self._r_send = [None, ] * self._s_n_blocks
-        #self._l_recv = [None, ] * self._s_n_blocks
-        #self._r_recv = [None, ] * self._s_n_blocks
-        self._l_recv_v = [None, ] * self._v_n_blocks
-        self._r_recv_v = [None, ] * self._v_n_blocks
-        self._send_to_l_v = [None, ] * self._v_n_blocks
-        self._send_to_r_v = [None, ] * self._v_n_blocks
-        self._evt_l_v = [None, ] * self._v_n_blocks
-        self._evt_r_v = [None, ] * self._v_n_blocks
-        #self._s_buff_block_slice = [None, ] * self._s_n_blocks
-        self._v_buff_block_slice = [None, ] * self._v_n_blocks
-        #for b in xrange(self._s_n_blocks):
-        #    self._s_buff_block_slice[b] = slice(
-        #        b * self._s_elem_block, (b + 1) * self._s_elem_block)
-        for b in xrange(self._v_n_blocks):
-            self._v_buff_block_slice[b] = slice(
-                b * self._v_elem_block, (b + 1) * self._v_elem_block)
+        # Attribute declarations; values are recomputed at each time step
+        self._s_buff_width_loc_p, self._s_buff_width_loc_m = 0, 0
+        self._s_buff_width_from_l, self._s_buff_width_from_r = 0, 0
+        self._s_froml_buff, self._s_locl_buff = None, None
+        self._s_fromr_buff, self._s_locr_buff = None, None
+        self._s_buffer_region_on_l, self._s_buffer_region_on_r = None, None
+        self._origin_locl, self._origin_locr = None, None
+        self._s_block_size_to_r, self._s_block_size_to_l = None, None
+        self._s_block_size_from_r, self._s_block_size_from_l = None, None
+        self._s_n_blocks_to_r, self._s_n_blocks_to_l = None, None
+        self._s_n_blocks_from_r, self._s_n_blocks_from_l = None, None
+        self._s_elem_block_to_r, self._s_elem_block_to_l = None, None
+        self._s_elem_block_from_r, self._s_elem_block_from_l = None, None
+        self._s_block_slice_to_r, self._s_block_slice_to_l = None, None
+        self._s_block_slice_from_r, self._s_block_slice_from_l = None, None
+        self._r_recv, self._l_recv = None, None
+        self._evt_get_l, self._evt_get_r = None, None
+        self._l_send, self._r_send = None, None
 
         self._queue_comm_m = self.cl_env.create_other_queue()
         self._queue_comm_p = self.cl_env.create_other_queue()
@@ -321,15 +306,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
 
         self._build_exec_list()
 
-    def setup_gpu(self):
-        pass
-        # self._s_locl_buff = \
-        #     self.fields_on_grid[0].host_data_pinned[0].reshape(
-        #         self.resol_dir, order=ORDER)[:self._s_buff_width, :, :]
-        # self._s_locr_buff = \
-        #     self.fields_on_grid[0].host_data_pinned[0].reshape(
-        #         self.resol_dir, order=ORDER)[-self._s_buff_width:, :, :]
-
     def _collect_kernels_cl_src_2k(self):
         pass
 
@@ -481,21 +457,40 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
 
         # Recompute blocks number and block size
         self._s_block_size_to_r, self._s_n_blocks_to_r, \
-            self._s_elem_block_to_r, self._s_buff_block_slice_to_r = \
+            self._s_elem_block_to_r, self._s_block_slice_to_r = \
             self._compute_block_number_and_size(
                 SIZEOF_PARMES_REAL * self._s_buff_width_loc_p *
                 self.resol_dir[1] * self.resol_dir[2])
         self._s_block_size_to_l, self._s_n_blocks_to_l, \
-            self._s_elem_block_to_l, self._s_buff_block_slice_to_l = \
+            self._s_elem_block_to_l, self._s_block_slice_to_l = \
             self._compute_block_number_and_size(
                 SIZEOF_PARMES_REAL * self._s_buff_width_loc_m *
                 self.resol_dir[1] * self.resol_dir[2])
         self._s_block_size_from_r, self._s_n_blocks_from_r, \
-            self._s_elem_block_from_r, self._s_buff_block_slice_from_r = \
+            self._s_elem_block_from_r, self._s_block_slice_from_r = \
             self._compute_block_number_and_size(self._s_fromr_buff.nbytes)
         self._s_block_size_from_l, self._s_n_blocks_from_l, \
-            self._s_elem_block_from_l, self._s_buff_block_slice_from_l = \
+            self._s_elem_block_from_l, self._s_block_slice_from_l = \
             self._compute_block_number_and_size(self._s_froml_buff.nbytes)
+        print "[" + str(self._comm_rank) + \
+            "] Multi-GPU comm: send to L=({0} MB, {1} block),".format(
+                self._s_block_size_to_l * self._s_n_blocks_to_l /
+                (1024. * 1024),
+                self._s_n_blocks_to_l) + \
+            " R=({0} MB, {1} block)".format(
+                self._s_block_size_to_r * self._s_n_blocks_to_r /
+                (1024. * 1024),
+                self._s_n_blocks_to_r,) + \
+            "; recv from L=({0} MB, {1} block),".format(
+                self._s_block_size_from_l * self._s_n_blocks_from_l /
+                (1024. * 1024),
+                self._s_n_blocks_from_l) + \
+            " R=({0} MB, {1} block)".format(
+                self._s_block_size_from_r * self._s_n_blocks_from_r /
+                (1024. * 1024),
+                self._s_n_blocks_from_r,)
+
+        # Event lists
         self._r_recv = [None, ] * self._s_n_blocks_from_r
         self._l_recv = [None, ] * self._s_n_blocks_from_l
         self._evt_get_l = [None, ] * self._s_n_blocks_to_l
@@ -564,20 +559,20 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         ctime = MPI.Wtime()
         for b in xrange(self._v_n_blocks):
             self._l_recv_v[b] = self._comm.Irecv(
-                [self._v_l_buff_flat[self._v_buff_block_slice[b]],
+                [self._v_l_buff_flat[self._v_block_slice[b]],
                  self._v_elem_block, PARMES_MPI_REAL],
                 source=self._L_rk, tag=17 + 19 * self._L_rk + 59 * b)
             self._r_recv_v[b] = self._comm.Irecv(
-                [self._v_r_buff_flat[self._v_buff_block_slice[b]],
+                [self._v_r_buff_flat[self._v_block_slice[b]],
                  self._v_elem_block, PARMES_MPI_REAL],
                 source=self._R_rk, tag=29 + 23 * self._R_rk + 57 * b)
         for b in xrange(self._v_n_blocks):
             self._send_to_r_v[b] = self._comm.Issend(
-                [self._v_r_buff_loc_flat[self._v_buff_block_slice[b]],
+                [self._v_r_buff_loc_flat[self._v_block_slice[b]],
                  self._v_elem_block, PARMES_MPI_REAL],
                 dest=self._R_rk, tag=17 + 19 * self._comm_rank + 59 * b)
             self._send_to_l_v[b] = self._comm.Issend(
-                [self._v_l_buff_loc_flat[self._v_buff_block_slice[b]],
+                [self._v_l_buff_loc_flat[self._v_block_slice[b]],
                  self._v_elem_block, PARMES_MPI_REAL],
                 dest=self._L_rk, tag=29 + 23 * self._comm_rank + 57 * b)
         if CL_PROFILE:
@@ -779,12 +774,12 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         # Prepare the MPI receptions
         for b in xrange(self._s_n_blocks_from_l):
             self._l_recv[b] = self._comm.Irecv(
-                [self._s_froml_buff_flat[self._s_buff_block_slice_from_l[b]],
+                [self._s_froml_buff_flat[self._s_block_slice_from_l[b]],
                  self._s_elem_block_from_l, PARMES_MPI_REAL],
                 source=self._L_rk, tag=888 + self._L_rk + 19 * b)
         for b in xrange(self._s_n_blocks_from_r):
             self._r_recv[b] = self._comm.Irecv(
-                [self._s_fromr_buff_flat[self._s_buff_block_slice_from_r[b]],
+                [self._s_fromr_buff_flat[self._s_block_slice_from_r[b]],
                  self._s_elem_block_from_r, PARMES_MPI_REAL],
                 source=self._R_rk, tag=333 + self._R_rk + 17 * b)
 
@@ -809,7 +804,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         for b in xrange(self._s_n_blocks_to_l):
             self._evt_get_l[b].wait()
             self._l_send[b] = self._comm.Issend(
-                [self._s_l_buff[self._s_buff_block_slice_to_l[b]],
+                [self._s_l_buff[self._s_block_slice_to_l[b]],
                  self._s_elem_block_to_l, PARMES_MPI_REAL],
                 dest=self._L_rk, tag=333 + self._comm_rank + 17 * b)
         ctime_send_l = MPI.Wtime() - ctime
@@ -834,14 +829,13 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         for b in xrange(self._s_n_blocks_to_r):
             self._evt_get_r[b].wait()
             self._r_send[b] = self._comm.Issend(
-                [self._s_r_buff[self._s_buff_block_slice_to_r[b]],
+                [self._s_r_buff[self._s_block_slice_to_r[b]],
                  self._s_elem_block_to_r, PARMES_MPI_REAL],
                 dest=self._R_rk, tag=888 + self._comm_rank + 19 * b)
         ctime_send_r = MPI.Wtime() - ctime
 
         # remesh in-domain particles and get left-right layer
         evt = self._num_comm(wait_evts, dt)
-        self.evt_num_remesh = [evt]
         evt_get_locl = cl.enqueue_copy(
             self.cl_env.queue,
             self.fields_on_grid[0].host_data_pinned[0],
@@ -852,7 +846,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             host_pitches=self._pitches_dev,
             region=self._s_buffer_region_on_l,
             is_blocking=False,
-            wait_for=self.evt_num_remesh)
+            wait_for=[evt])
         evt_get_locr = cl.enqueue_copy(
             self.cl_env.queue,
             self.fields_on_grid[0].host_data_pinned[0],
@@ -863,10 +857,11 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             host_pitches=self._pitches_dev,
             region=self._s_buffer_region_on_r,
             is_blocking=False,
-            wait_for=self.evt_num_remesh)
+            wait_for=[evt])
 
         ctime = MPI.Wtime()
-        # Wait MPI transfer of data from left, add them to local data and send back to device
+        # Wait for MPI transfer of data from left, add it to local
+        # data and send back to device
         for b in xrange(self._s_n_blocks_to_r):
             self._r_send[b].Wait()
         for b in xrange(self._s_n_blocks_from_l):
@@ -876,7 +871,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
 
         calctime = MPI.Wtime()
         self._s_locl_buff += self._s_froml_buff
-        print self._comm_rank, self._s_locl_buff.shape, self._s_froml_buff.shape
         self.profiler['comm_calc_remesh'] += MPI.Wtime() - calctime
         evt_set_locl = cl.enqueue_copy(
             self.cl_env.queue,
@@ -889,7 +883,8 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             region=self._s_buffer_region_on_l,
             is_blocking=False)
 
-        # Wait MPI transfer of data from right, add them to local data and send back to device
+        # Wait for MPI transfer of data from right, add it to local
+        # data and send back to device
         ctime = MPI.Wtime()
         for b in xrange(self._s_n_blocks_to_l):
             self._l_send[b].Wait()
@@ -899,7 +894,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         ctime_wait_r = MPI.Wtime() - ctime
         calctime = MPI.Wtime()
         self._s_locr_buff += self._s_fromr_buff
-        print self._comm_rank, self._s_locr_buff.shape, self._s_fromr_buff.shape
        self.profiler['comm_calc_remesh'] += MPI.Wtime() - calctime
         evt_set_locr = cl.enqueue_copy(
             self.cl_env.queue,
--
GitLab
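
Note on the block sizing moved into the constructor (hunk @@ -167): the GPU-to-GPU transfers are pipelined by cutting each buffer into equally sized blocks. The size starts at 1 MB and is halved until it evenly divides the buffer's byte count; if it drops below 256 KB, the buffer is simply cut into four blocks. A minimal standalone sketch of that policy (the function name and the divisibility assumption in the fallback are mine, not from the patch):

    def compute_blocks(nbytes, n_elems, start=1024 * 1024, floor=256 * 1024):
        """Split a buffer of nbytes bytes / n_elems elements into
        equally sized blocks for pipelined transfers."""
        block_size = start
        # Halve until the block size evenly divides the buffer size
        while nbytes % block_size != 0:
            block_size //= 2
        # Fallback: four blocks per buffer (assumes nbytes % 4 == 0)
        if block_size < floor:
            block_size = nbytes // 4
        n_blocks = nbytes // block_size
        elem_block = n_elems // n_blocks
        slices = [slice(b * elem_block, (b + 1) * elem_block)
                  for b in range(n_blocks)]
        return block_size, n_blocks, elem_block, slices

Note that for an odd byte count the while loop only terminates at a 1-byte block, so the fallback effectively handles every awkward size; the grid buffers here (extents times sizeof(real)) are normally highly divisible.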
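The velocity and scalar exchanges (hunks @@ -564 and @@ -779) post one non-blocking Irecv/Issend pair per block, with the tag encoding the sending rank and the block index so that concurrent blocks between the same pair of processes cannot match each other's messages. A reduced mpi4py sketch of the same pattern, assuming a periodic ring of ranks (the real neighbours come from the domain topology):

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()
    left, right = (rank - 1) % size, (rank + 1) % size  # assumed ring

    n_blocks, elem_block = 4, 4096
    send_r = np.full(n_blocks * elem_block, rank, dtype=np.float64)
    recv_l = np.empty(n_blocks * elem_block, dtype=np.float64)

    reqs = []
    # Post all receives first; the tag encodes (sender rank, block index)
    for b in range(n_blocks):
        blk = slice(b * elem_block, (b + 1) * elem_block)
        reqs.append(comm.Irecv([recv_l[blk], MPI.DOUBLE],
                               source=left, tag=17 + 19 * left + 59 * b))
    # Then the matching synchronous-mode sends to the right neighbour
    for b in range(n_blocks):
        blk = slice(b * elem_block, (b + 1) * elem_block)
        reqs.append(comm.Issend([send_r[blk], MPI.DOUBLE],
                                dest=right, tag=17 + 19 * rank + 59 * b))
    MPI.Request.Waitall(reqs)

Posting the receives before the synchronous-mode sends ensures each Issend finds a matching receive and avoids unexpected-message buffering.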
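In the remeshing path, the patch drops the stored self.evt_num_remesh list and passes the kernel event directly (wait_for=[evt]), chaining the non-blocking rectangular reads on the remesh kernel so the host thread stays free to progress the MPI requests. A stripped-down pyopencl sketch of that event chaining (a plain buffer write stands in for the remeshing kernel, and the rectangular-region arguments of the real copies are omitted):

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    src = np.ones(1024, dtype=np.float32)
    host = np.zeros_like(src)
    dev = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=src.nbytes)

    # Stand-in for the remeshing kernel: a non-blocking write returning an event
    evt = cl.enqueue_copy(queue, dev, src, is_blocking=False)
    # Device-to-host read chained on the kernel event; the host thread is
    # free to call MPI Wait on outstanding requests in the meantime
    evt_get = cl.enqueue_copy(queue, host, dev,
                              is_blocking=False, wait_for=[evt])

    # ... MPI Waits on pending sends/receives would run here ...

    evt_get.wait()  # host now holds valid data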