diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl index 6b503bc21b4f9d50a9ff686ae0aaac3565992871..87a2d4ad8e0caf1bf593e536961e92f5dbdd8283 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl @@ -56,14 +56,14 @@ __kernel void diffusion(__global const float* scal_in, /* // fill tile edges */ tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; } #if CUT_DIR_Y == 1 tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + NB_X + gidZ*NB_X*2]; - tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2]; + tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? 
scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2]; #else tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; #endif /* Synchronize work-group */ @@ -107,14 +107,14 @@ __kernel void diffusion(__global const float* scal_in, /* // fill tile edges */ tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; } #if CUT_DIR_Y == 1 tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + NB_X + gidZ*NB_X*2]; - tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2]; + tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? 
scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2]; #else tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; #endif /* Synchronize work-group */ @@ -147,4 +147,3 @@ __kernel void diffusion(__global const float* scal_in, scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; } } - diff --git a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl index d764a3a9b547dc210c15c51c5bd0a829f7673283..f5543921290be8188563f55b7f783849d9d02f17 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl @@ -44,10 +44,10 @@ __kernel void diffusion(__global const float* scal_in, /* // fill tile edges */ tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; } tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); @@ -90,10 +90,10 @@ __kernel void diffusion(__global const float* scal_in, /* // fill tile edges */ tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = 
"""
@file gpu_diffusion.py

Diffusion on GPU.

Discrete diffusion operator for a GPU-resident vorticity field.
The numerical work is done by an OpenCL kernel named 'diffusion'
(or 'comm_diffusion' when the domain is cut across MPI processes);
inter-process ghost layers are exchanged with MPI derived datatypes.
"""
from parmepy.constants import debug, np, S_DIR, PARMES_MPI_REAL, ORDERMPI
import parmepy.tools.numpywrappers as npw
from parmepy.operator.discrete.discrete import DiscreteOperator
from parmepy.gpu import cl
from parmepy.gpu.gpu_operator import GPUOperator
from parmepy.gpu.gpu_kernel import KernelLauncher
from parmepy.gpu.gpu_discrete import GPUDiscreteField
from parmepy.tools.profiler import FProfiler
from parmepy.mpi.main_var import MPI


class GPUDiffusion(DiscreteOperator, GPUOperator):
    """GPU implementation of the diffusion operator.

    Applies one diffusion step to ``vorticity`` in place: the OpenCL
    kernel writes into ``vorticity_tmp`` and the result is copied back
    into the field's device buffer.  In the multi-process case, ghost
    layers along every cut direction are exchanged over MPI before the
    kernel launch.
    """

    @debug
    def __init__(self, vorticity, viscosity, vorticity_tmp=None,
                 platform_id=None, device_id=None, device_type=None, **kwds):
        """Build the operator.

        @param vorticity : discretisation of the solution field
          (promoted to a GPUDiscreteField if it is not one already).
        @param viscosity : scalar viscosity coefficient.
        @param vorticity_tmp : optional pre-allocated device buffer used
          as kernel output; allocated here when None.
        @param platform_id, device_id, device_type : forwarded to
          GPUOperator to select the OpenCL device.
        @param kwds : forwarded to DiscreteOperator; must NOT contain
          'variables' (set internally from vorticity).
        """
        ## Discretisation of the solution field.
        self.vorticity = vorticity
        ## Viscosity.
        self.viscosity = viscosity
        assert 'variables' not in kwds, 'variables parameter is useless.'
        super(GPUDiffusion, self).__init__(variables=[vorticity],
                                           **kwds)
        self.input = [self.vorticity]
        self.output = [self.vorticity]
        self.direction = 0
        ## Bytes of extra device memory allocated for communication buffers.
        self._cl_work_size = 0

        GPUOperator.__init__(self, platform_id=platform_id,
                             device_id=device_id,
                             device_type=device_type,
                             **kwds)

        ## GPU allocation (count the field's memory only if we promote it here).
        alloc = not isinstance(self.vorticity, GPUDiscreteField)
        GPUDiscreteField.fromField(self.cl_env, self.vorticity,
                                   self.gpu_precision, simple_layout=False)
        if not self.vorticity.gpu_allocated:
            self.vorticity.allocate()
        if alloc:
            self.size_global_alloc += self.vorticity.mem_size

        # Temporary device buffer the kernel writes into.
        if vorticity_tmp is None:
            self.vorticity_tmp = self.cl_env.global_allocation(
                self.vorticity.data[0])
        else:
            self.vorticity_tmp = vorticity_tmp

        topo = self.vorticity.topology
        ## Directions along which the domain is cut across processes.
        self._cutdir_list = np.where(topo.cutdir)[0].tolist()
        self._comm = topo.comm
        self._comm_size = self._comm.Get_size()
        self._comm_rank = self._comm.Get_rank()
        if self._comm_size > 1:
            # One host staging array + one device buffer per cut direction.
            self._to_send = [None] * self.dim
            self._to_recv_buf = [None] * self.dim
            self._to_recv = [None] * self.dim
            self.mpi_type_diff_l = {}
            self.mpi_type_diff_r = {}
            self.profiler += FProfiler('comm_diffusion')
            for d in self._cutdir_list:
                shape = list(self.vorticity.data[0].shape)
                shape_b = list(self.vorticity.data[0].shape)
                start_l = [0, ] * 3
                start_r = [0, ] * 3
                start_r[d] = 1
                # Staging arrays are 2 planes thick along d; each MPI
                # subarray type selects one of the two planes.
                shape[d] = 2
                shape_b[d] = 1
                # _to_send[..., 0] contains [..., 0] data
                # _to_send[..., 1] contains [..., Nz-1] data
                # _to_recv[..., 0] contains [..., Nz] data (right ghosts)
                # _to_recv[..., 1] contains [..., -1] data (left ghosts)
                self._to_send[d] = npw.zeros(tuple(shape))
                self._to_recv[d] = npw.zeros(tuple(shape))
                self.mpi_type_diff_l[d] = PARMES_MPI_REAL.Create_subarray(
                    shape, shape_b, start_l, order=ORDERMPI)
                self.mpi_type_diff_l[d].Commit()
                self.mpi_type_diff_r[d] = PARMES_MPI_REAL.Create_subarray(
                    shape, shape_b, start_r, order=ORDERMPI)
                self.mpi_type_diff_r[d].Commit()
                self._to_recv_buf[d] = cl.Buffer(
                    self.cl_env.ctx, cl.mem_flags.READ_WRITE,
                    size=self._to_recv[d].nbytes)
                # NOTE(review): region covers a single scalar's bytes —
                # presumably only meant to touch/initialise the buffer,
                # not to fill it; confirm against pyopencl rect-copy
                # semantics.
                cl.enqueue_copy(
                    self.cl_env.queue,
                    self._to_recv_buf[d],
                    self._to_recv[d],
                    buffer_origin=(0, 0, 0),
                    host_origin=(0, 0, 0),
                    region=(self._to_recv[d][0, 0, 0].nbytes, )).wait()
                self._cl_work_size += self._to_recv[d].nbytes
            self._compute = self._compute_diffusion_comm
        else:
            self._compute = self._compute_diffusion

        ## Mesh step sizes, padded to 4 components for the kernel.
        self._mesh_size = npw.ones(4, dtype=self.gpu_precision)
        self._mesh_size[:self.dim] = self._reorderVect(topo.mesh.space_step)
        shape = topo.mesh.resolution
        resol = shape.copy()
        self.resol_dir = npw.dim_ones((self.dim,))
        self.resol_dir[:self.dim] = self._reorderVect(shape)
        self._append_size_constants(resol)

        # Kernel configuration: sources, tiling and work-group geometry.
        src, tile_size, nb_part_per_wi, vec, f_space = \
            self._kernel_cfg['diffusion']

        build_options = self._size_constants
        build_options += " -D TILE_SIZE=" + str(tile_size)
        build_options += " -D NB_PART=" + str(nb_part_per_wi)
        build_options += " -D L_WIDTH=" + str(tile_size / nb_part_per_wi)
        for d in xrange(self.dim):
            # CUT_DIR_* tells the kernel which directions need ghost
            # arguments instead of periodic wrap-around.
            build_options += " -D CUT_DIR" + S_DIR[d] + "="
            build_options += str(1 if topo.shape[d] > 1 else 0)
        if self._comm_size > 1:
            # Multi-process run uses the comm_diffusion kernel sources.
            src = [s.replace('diffusion', 'comm_diffusion')
                   for s in src]

        prg = self.cl_env.build_src(src, build_options, vec)

        gwi, lwi = f_space(self.vorticity.data[0].shape,
                           nb_part_per_wi, tile_size)
        self.num_diffusion = KernelLauncher(
            prg.diffusion, self.cl_env.queue, gwi, lwi)

    def _compute_diffusion(self, simulation):
        """Single-process step: run the kernel, copy result back in place."""
        wait_evt = self.vorticity.events
        d_evt = self.num_diffusion(
            self.vorticity.gpu_data[0],
            self.vorticity_tmp,
            self.gpu_precision(self.viscosity * simulation.timeStep),
            self._mesh_size,
            wait_for=wait_evt)
        c_evt = cl.enqueue_copy(self.cl_env.queue, self.vorticity.gpu_data[0],
                                self.vorticity_tmp, wait_for=[d_evt])
        self.vorticity.events.append(c_evt)

    def _compute_diffusion_comm(self, simulation):
        """Multi-process step: exchange ghost planes, then run the kernel.

        For each cut direction: download the two boundary planes from the
        device, swap them with the left/right neighbours via non-blocking
        MPI, upload the received ghosts, and finally launch the kernel
        with one ghost buffer per cut direction (1 or 2 supported).
        """
        # Compute OpenCL transfer parameters.
        tc = MPI.Wtime()
        topo = self.vorticity.topology
        first_cut_dir = topo.cutdir.tolist().index(True)
        wait_evt = []
        for d in self._cutdir_list:
            # Device-side origin of the right boundary plane; host-side
            # origin of its slot (plane 1) in the staging array.
            r_orig = [0, ] * self.dim
            br_orig = [0, ] * self.dim
            r_orig[d] = self.vorticity.data[0].shape[d] - 1
            br_orig[d] = 1
            r_orig = tuple(r_orig)
            br_orig = tuple(br_orig)
            # (dead locals l_sl/r_sl removed: they were built and never read)

            # Rect-copy region: one plane, first component in bytes as
            # required by the pyopencl rectangular copy API.
            region_size = list(self.vorticity.data[0].shape)
            region_size[0] = self._to_send[d][:, 0, 0].nbytes
            region_size[d] = 1
            wait_events = self.vorticity.events
            e_l = cl.enqueue_copy(self.cl_env.queue, self._to_send[d],
                                  self.vorticity.gpu_data[0],
                                  host_origin=(0, 0, 0),
                                  buffer_origin=(0, 0, 0),
                                  region=tuple(region_size),
                                  wait_for=wait_events)
            e_r = cl.enqueue_copy(self.cl_env.queue, self._to_send[d],
                                  self.vorticity.gpu_data[0],
                                  host_origin=br_orig, buffer_origin=r_orig,
                                  region=tuple(region_size),
                                  wait_for=wait_events)

            # MPI exchange: post receives first, wait for the device
            # downloads, then send; tags pair sender rank with direction.
            r_send = []
            r_recv = []
            R_rk = topo.neighbours[1, d - first_cut_dir]
            L_rk = topo.neighbours[0, d - first_cut_dir]
            r_recv.append(self._comm.Irecv(
                [self._to_recv[d], 1, self.mpi_type_diff_l[d]],
                source=R_rk, tag=123 + R_rk))
            r_recv.append(self._comm.Irecv(
                [self._to_recv[d], 1, self.mpi_type_diff_r[d]],
                source=L_rk, tag=456 + L_rk))
            e_l.wait()
            e_r.wait()
            r_send.append(self._comm.Issend(
                [self._to_send[d], 1, self.mpi_type_diff_l[d]],
                dest=L_rk, tag=123 + self._comm_rank))
            r_send.append(self._comm.Issend(
                [self._to_send[d], 1, self.mpi_type_diff_r[d]],
                dest=R_rk, tag=456 + self._comm_rank))
            for r in r_send + r_recv:
                r.Wait()

            # _to_recv[..., 0] contains [..., Nz] data (right ghosts)
            # _to_recv[..., 1] contains [..., -1] data (left ghosts)
            wait_evt.append(cl.enqueue_copy(self.cl_env.queue,
                                            self._to_recv_buf[d],
                                            self._to_recv[d]))
        self.profiler['comm_diffusion'] += MPI.Wtime() - tc

        # Kernel signature grows one ghost buffer per cut direction.
        if len(self._cutdir_list) == 1:
            d_evt = self.num_diffusion(
                self.vorticity.gpu_data[0],
                self._to_recv_buf[self._cutdir_list[0]],
                self.vorticity_tmp,
                self.gpu_precision(self.viscosity * simulation.timeStep),
                self._mesh_size,
                wait_for=wait_evt)
        if len(self._cutdir_list) == 2:
            d_evt = self.num_diffusion(
                self.vorticity.gpu_data[0],
                self._to_recv_buf[self._cutdir_list[0]],
                self._to_recv_buf[self._cutdir_list[1]],
                self.vorticity_tmp,
                self.gpu_precision(self.viscosity * simulation.timeStep),
                self._mesh_size,
                wait_for=wait_evt)
        c_evt = cl.enqueue_copy(self.cl_env.queue, self.vorticity.gpu_data[0],
                                self.vorticity_tmp, wait_for=[d_evt])
        self.vorticity.events.append(c_evt)

    def apply(self, simulation):
        """Apply one diffusion step (dispatches on process count)."""
        self._compute(simulation)