From 4fd0c854f88e2246a255222a222c41e88f8a6efa Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr> Date: Sat, 6 Sep 2014 12:09:32 +0200 Subject: [PATCH] Fix multi-GPU multiscale advection: ghosts are taken into account in computing velocity line indices --- .../cl_src/kernels/comm_MS_advection_noVec.cl | 44 +++++++----- .../comm_advection_MS_and_remeshing_noVec.cl | 44 +++++++----- .../hysop/gpu/multi_gpu_particle_advection.py | 72 ++++++++++--------- 3 files changed, 89 insertions(+), 71 deletions(-) diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl index e57be6ae2..2f7193be4 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl @@ -15,7 +15,8 @@ __kernel void buff_advec(__global const float* gvelo, int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ int i; /* Particle index in 1D problem */ int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ - float p,v,c, hY, hZ; + float p,v,c; + float2 hY, hZ; int i_ind, i_indY, i_indZ; @@ -25,32 +26,37 @@ __kernel void buff_advec(__global const float* gvelo, __local float* loc_ptr; - hY = (gidY * mesh->dx.y) * inv_v_dx_y; - hZ = (gidZ * mesh->dx.z) * inv_v_dx_z; - i_indY = convert_int_rtn(hY); - i_indZ = convert_int_rtn(hZ); - hY = hY - convert_float(i_indY); - hZ = hZ - convert_float(i_indZ); + hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y; + hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z; + i_indY = convert_int_rtn(hY.s0); + i_indZ = convert_int_rtn(hZ.s0); + hY.s0 = hY.s0 - convert_float(i_indY); + hZ.s0 = hZ.s0 - convert_float(i_indZ); + hY.s1 = (1.0-hY.s0); + hZ.s1 = (1.0-hZ.s0); + + i_indY = i_indY + V_GHOSTS_NB; + i_indZ = i_indZ + V_GHOSTS_NB; for(i=gidX; i<V_NB_I; i+=(WI_NB)){ - velocity_cache[noBC_id(i)] = (1.0-hY)*(1.0-hZ) * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; - velocity_cache[noBC_id(i)] += (1.0-hY)*hZ * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; - velocity_cache[noBC_id(i)] += hY*(1.0-hZ) * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II]; - velocity_cache[noBC_id(i)] += hY*hZ * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; } for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){ - buff_l_loc[i] = (1.0-hY)*(1.0-hZ)*buffer_l[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; - buff_l_loc[i] += (1.0-hY)*hZ*buffer_l[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; - buff_l_loc[i] += hY*(1.0-hZ)*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; - buff_l_loc[i] += hY*hZ*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; + buff_l_loc[i] = hY.s1*hZ.s1*buffer_l[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; + buff_l_loc[i] += hY.s1*hZ.s0*buffer_l[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; + buff_l_loc[i] += hY.s0*hZ.s1*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; + buff_l_loc[i] += hY.s0*hZ.s0*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; } for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){ - buff_r_loc[i] = (1.0-hY)*(1.0-hZ)*buffer_r[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; - buff_r_loc[i] += (1.0-hY)*hZ*buffer_r[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; - buff_r_loc[i] += hY*(1.0-hZ)*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; - buff_r_loc[i] += hY*hZ*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; + buff_r_loc[i] = hY.s1*hZ.s1*buffer_r[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; + buff_r_loc[i] += hY.s1*hZ.s0*buffer_r[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; + buff_r_loc[i] += hY.s0*hZ.s1*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; + buff_r_loc[i] += hY.s0*hZ.s0*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; } /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl index 957c0d6b6..87127b396 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl @@ -18,7 +18,8 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ int i; /* Particle index in 1D problem */ int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ - float p,v,c, hY, hZ,s,y,w; + float p,v,c,s,y,w; + float2 hY, hZ; int i_ind, i_indY, i_indZ; int ind, index; @@ -46,32 +47,37 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, - hY = (gidY * mesh->dx.y) * inv_v_dx_y; - hZ = (gidZ * mesh->dx.z) * inv_v_dx_z; - i_indY = convert_int_rtn(hY); - i_indZ = convert_int_rtn(hZ); - hY = hY - convert_float(i_indY); - hZ = hZ - convert_float(i_indZ); + hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y; + hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z; + i_indY = convert_int_rtn(hY.s0); + i_indZ = convert_int_rtn(hZ.s0); + hY.s0 = hY.s0 - convert_float(i_indY); + hZ.s0 = hZ.s0 - convert_float(i_indZ); + hY.s1 = (1.0-hY.s0); + hZ.s1 = (1.0-hZ.s0); + + i_indY = i_indY + V_GHOSTS_NB; + i_indZ = i_indZ + V_GHOSTS_NB; for(i=gidX; i<V_NB_I; i+=(WI_NB)){ - velocity_cache[noBC_id(i)] = (1.0-hY)*(1.0-hZ) * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; - velocity_cache[noBC_id(i)] += (1.0-hY)*hZ * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; - velocity_cache[noBC_id(i)] += hY*(1.0-hZ) * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II]; - velocity_cache[noBC_id(i)] += hY*hZ * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; } for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){ - v_l_buff_loc[i] = (1.0-hY)*(1.0-hZ)*v_l_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; - v_l_buff_loc[i] += (1.0-hY)*hZ*v_l_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; - v_l_buff_loc[i] += hY*(1.0-hZ)*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; - v_l_buff_loc[i] += hY*hZ*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; + v_l_buff_loc[i] = hY.s1*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; + v_l_buff_loc[i] += hY.s1*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; + v_l_buff_loc[i] += hY.s0*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; + v_l_buff_loc[i] += hY.s0*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; } for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){ - v_r_buff_loc[i] = (1.0-hY)*(1.0-hZ)*v_r_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; - v_r_buff_loc[i] += (1.0-hY)*hZ*v_r_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; - v_r_buff_loc[i] += hY*(1.0-hZ)*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; - v_r_buff_loc[i] += hY*hZ*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; + v_r_buff_loc[i] = hY.s1*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; + v_r_buff_loc[i] += hY.s1*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; + v_r_buff_loc[i] += hY.s0*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; + v_r_buff_loc[i] += hY.s0*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; } /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py index 9b2f783ae..1162ddb27 100644 --- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py +++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py @@ -25,8 +25,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): __metaclass__ = ABCMeta @debug - def __init__(self, platform_id=None, device_id=None, device_type=None, - user_src=None, max_cfl=5, **kwds): + def __init__(self, max_velocity=None, max_dt=None, **kwds): """ Create a Advection operator. Work on a given field (scalar or vector) at a given velocity to compute @@ -34,31 +33,23 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): OpenCL kernels are build once per dimension in order to handle directional splitting with resolution non uniform in directions. - @param velocity : Velocity field - @param fields_on_grid : Advected fields - @param d : Direction to advect - @param platform_id : OpenCL platform id (default = 0). - @param device_id : OpenCL device id (default = 0). - @param device_type : OpenCL device type (default = 'gpu'). - @param method : the method to use. {'m4prime', 'm6prime', 'm8prime', - 'l6star'} - @param user_src : User OpenCL sources. - @param splittingConfig : Directional splitting configuration - (parmepy.numerics.splitting.Splitting.__init__) + @param max_velocity : maximum velocity estimation for computing communications buffer sizes. + The estimation may be global or by components. + @param max_dt : maimum time step estimation. + @remark : Buffer widths are computed from max_velocity, max_dt and the mesh sizes: + - velocity buffers : |max_velocity * max_dt / v_dx| + 1 + - scalar buffers : |max_velocity * max_dt / s_dx| + 1 + remesh_stencil/2 """ fields_topo = kwds['fields_on_grid'][0].topology direction = kwds['direction'] self._cut_dir = fields_topo.cutdir self._is_cut_dir = self._cut_dir[direction] - super(MultiGPUParticleAdvection, self).__init__( - platform_id=platform_id, device_id=device_id, - device_type=device_type, user_src=user_src, - **kwds) - try: - self.max_cfl = max_cfl[self.direction] - except TypeError: - self.max_cfl = max_cfl + super(MultiGPUParticleAdvection, self).__init__(**kwds) + + msg = "max_dt and _max_velocity must be given to advection " + msg += "for computing communication buffer sizes." + assert max_dt is not None and max_velocity is not None msg = "The Multi-GPU works only with the RK2 TimeIntegrator" assert self.method[TimeIntegrator] == RK2, msg @@ -108,12 +99,27 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): # mesh step self._space_step = msh.space_step self._v_space_step = v_msh.space_step + + # Maximum cfl for velocity and scalar + try: + self.max_cfl_s = int(max_velocity[self.direction] * max_dt / + self._space_step[self.direction]) + 1 + self.max_cfl_v = int(max_velocity[self.direction] * max_dt / + self._v_space_step[self.direction]) + 1 + except TypeError: + self.max_cfl_s = int(max_velocity * max_dt / + self._space_step[self.direction]) + 1 + self.max_cfl_v = int(max_velocity * max_dt / + self._v_space_step[self.direction]) + 1 + + + # Slice self._sl_dim = slice(self.dim, 2 * self.dim) self._cl_work_size = 0 if self._is_cut_dir: #Advection variables - self._v_buff_width = self.max_cfl + self._v_buff_width = self.max_cfl_v self._v_r_buff = npw.zeros((self._v_buff_width, self.v_resol_dir[1], self.v_resol_dir[2])) @@ -128,12 +134,12 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._cl_v_l_buff, self._v_l_buff).wait() self._cl_work_size += 2 * self._v_l_buff.nbytes - self._v_buff_size = self.max_cfl * \ + self._v_buff_size = self._v_buff_width * \ self.v_resol_dir[1] * self.v_resol_dir[2] self._v_pitches_host = (int(self._v_l_buff[:, 0, 0].nbytes), int(self._v_l_buff[:, :, 0].nbytes)) self._v_buffer_region = ( - int(self.max_cfl * PARMES_REAL(0.).nbytes), + int(self._v_buff_width * PARMES_REAL(0.).nbytes), int(self.v_resol_dir[1]), int(self.v_resol_dir[2])) @@ -147,7 +153,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._py_remesh = self.method[Remesh]() ## Number of weights self._nb_w = len(self._py_remesh.weights) - self._s_buff_width = self.max_cfl + self._nb_w / 2 + self._s_buff_width = self.max_cfl_s + self._nb_w / 2 self._s_l_buff = npw.zeros((self._s_buff_width, self.resol_dir[1], self.resol_dir[2])) @@ -240,7 +246,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): build_options += " -D V_STOP_INDEX=" + str(self._v_stop_index) build_options += " -D START_INDEX=" + str(self._start_index) build_options += " -D STOP_INDEX=" + str(self._stop_index) - build_options += " -D V_BUFF_WIDTH=" + str(self.max_cfl) + build_options += " -D V_BUFF_WIDTH=" + str(self._v_buff_width) build_options += " -D FORMULA=" + self.method[Remesh].__name__.upper() build_options += " -D PART_NB_PER_WI=" build_options += str(self.resol_dir[0] / WINb) @@ -272,7 +278,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): build_options += " -D V_STOP_INDEX=" + str(self._v_stop_index) build_options += " -D START_INDEX=" + str(self._start_index) build_options += " -D STOP_INDEX=" + str(self._stop_index) - build_options += " -D V_BUFF_WIDTH=" + str(self.max_cfl) + build_options += " -D V_BUFF_WIDTH=" + str(self._v_buff_width) prg = self.cl_env.build_src( src, build_options, 1) self.num_advec = KernelLauncher( @@ -302,33 +308,33 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): def _exchange_velocity_buffers(self): ghosts = self.velocity_topo.ghosts()[self.direction] if self._is_cut_dir and self.direction == 0: - velo_sl = (slice(self.v_resol_dir[0] - self.max_cfl - ghosts, + velo_sl = (slice(self.v_resol_dir[0] - self._v_buff_width - ghosts, self.v_resol_dir[0] - ghosts), slice(None), slice(None),) self._v_r_buff_loc[...] = self.velocity.data[0][velo_sl] - velo_sl = (slice(0 + ghosts, self.max_cfl + ghosts), + velo_sl = (slice(0 + ghosts, self._v_buff_width + ghosts), slice(None), slice(None)) self._v_l_buff_loc[...] = self.velocity.data[0][velo_sl] if self._is_cut_dir and self.direction == 1: velo_sl = (slice(None), - slice(self.v_resol_dir[0] - self.max_cfl - ghosts, + slice(self.v_resol_dir[0] - self._v_buff_width - ghosts, self.v_resol_dir[0] - ghosts), slice(None)) self._v_r_buff_loc[...] = \ self.velocity.data[1][velo_sl].swapaxes(0, 1) velo_sl = (slice(None), - slice(0 + ghosts, self.max_cfl + ghosts), + slice(0 + ghosts, self._v_buff_width + ghosts), slice(None)) self._v_l_buff_loc[...] = \ self.velocity.data[1][velo_sl].swapaxes(0, 1) if self._is_cut_dir and self.direction == 2: velo_sl = (slice(None), slice(None), - slice(self.v_resol_dir[0] - self.max_cfl - ghosts, + slice(self.v_resol_dir[0] - self._v_buff_width - ghosts, self.v_resol_dir[0] - ghosts)) self._v_r_buff_loc[...] = \ self.velocity.data[2][velo_sl].swapaxes(0, 1).swapaxes(0, 2) velo_sl = (slice(None), slice(None), - slice(0 + ghosts, self.max_cfl + ghosts)) + slice(0 + ghosts, self._v_buff_width + ghosts)) self._v_l_buff_loc[...] = \ self.velocity.data[2][velo_sl].swapaxes(0, 1).swapaxes(0, 2) -- GitLab