From 026ba0757ba921e8e83705b0e0d05d753eaa175b Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr>
Date: Wed, 22 Oct 2014 08:58:28 +0200
Subject: [PATCH] Multi-GPU communications optimisation on 1k advection-remeshing

---
 .../comm_advection_MS_and_remeshing_noVec.cl  | 34 +++++++------
 .../comm_advection_and_remeshing_noVec.cl     | 50 ++++++++++---------
 .../hysop/gpu/multi_gpu_particle_advection.py |  4 ++
 3 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
index 90b055bad..8e9ea780a 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
@@ -6,6 +6,7 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
                                       __global float* v_l_buff,
                                       __global const float* pscal,
                                       __global float* s_l_buff,
+                                      int used_width,
                                       float dt,
                                       float inv_v_dx_y, float inv_v_dx_z,
                                       __constant struct AdvectionMeshInfo* mesh)
@@ -26,7 +27,7 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
   float* loc_ptr;

   // Initialize buffers
-  for (i=0; i<BUFF_WIDTH; i++)
+  for (i=0; i<used_width; i++)
     s_l_buff_loc[i] = 0.0;

   barrier(CLK_LOCAL_MEM_FENCE);
@@ -94,28 +95,28 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
   index = ind - REMESH_SHIFT;
   w = REMESH(alpha)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(beta)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(gamma)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(delta)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -123,14 +124,14 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 #if REMESH_SHIFT > 1
   index = index + 1;
   w = REMESH(eta)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(zeta)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -139,14 +140,14 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 #if REMESH_SHIFT > 2
   index = index + 1;
   w = REMESH(theta)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(iota)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -155,14 +156,14 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 #if REMESH_SHIFT > 3
   index = index + 1;
   w = REMESH(kappa)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(mu)(y);
-  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -173,8 +174,8 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
   barrier(CLK_LOCAL_MEM_FENCE);

   // Store buffers
-  for (i=0; i<BUFF_WIDTH; i++)
-    s_l_buff[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_l_buff_loc[i];
+  for (i=0; i<used_width; i++)
+    s_l_buff[i + gidY*used_width + gidZ*used_width*NB_II] = s_l_buff_loc[i];
 }
@@ -182,6 +183,7 @@ __kernel void buff_advec_and_remesh_r(__global const float* gvelo,
                                       __global float* v_r_buff,
                                       __global const float* pscal,
                                       __global float* s_r_buff,
+                                      int used_width,
                                       float dt,
                                       float inv_v_dx_y, float inv_v_dx_z,
                                       __constant struct AdvectionMeshInfo* mesh)
@@ -202,7 +204,7 @@ __kernel void buff_advec_and_remesh_r(__global const float* gvelo,
   float* loc_ptr;

   // Initialize buffers
-  for(i=0; i<BUFF_WIDTH; i++)
+  for(i=0; i<used_width; i++)
     s_r_buff_loc[i] = 0.0;

   barrier(CLK_LOCAL_MEM_FENCE);
@@ -349,8 +351,8 @@ __kernel void buff_advec_and_remesh_r(__global const float* gvelo,
   barrier(CLK_LOCAL_MEM_FENCE);

   // Store buffers
-  for(i=0;i<BUFF_WIDTH;i++)
-    s_r_buff[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_r_buff_loc[i];
+  for(i=0;i<used_width;i++)
+    s_r_buff[i + gidY*used_width + gidZ*used_width*NB_II] = s_r_buff_loc[i];
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl
index ae019d2ad..1648c70e4 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl
@@ -2,10 +2,11 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
-                              __global float* v_buffer_l,
-                              __global const float* pscal,
-                              __global float* s_buffer_l,
-                              float dt, __constant struct AdvectionMeshInfo* mesh)
+                                      __global float* v_buffer_l,
+                                      __global const float* pscal,
+                                      __global float* s_buffer_l,
+                                      int used_width,
+                                      float dt, __constant struct AdvectionMeshInfo* mesh)
 {
   int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */
   int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */
@@ -21,7 +22,7 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
   float* loc_ptr;

   // Initialize buffers
-  for (i=0;i<BUFF_WIDTH;i++)
+  for (i=0;i<used_width;i++)
     s_buff_l_loc[i] = 0.0;

   for(i=0; i<V_BUFF_WIDTH; i++)
@@ -66,28 +67,28 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
   index = ind - REMESH_SHIFT;
   w = REMESH(alpha)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(beta)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(gamma)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(delta)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -95,14 +96,14 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 #if REMESH_SHIFT > 1
   index = index + 1;
   w = REMESH(eta)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(zeta)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -111,14 +112,14 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 #if REMESH_SHIFT > 2
   index = index + 1;
   w = REMESH(theta)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(iota)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -127,14 +128,14 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 #if REMESH_SHIFT > 3
   index = index + 1;
   w = REMESH(kappa)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);

   index = index + 1;
   w = REMESH(mu)(y);
-  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+  if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-used_width+1);
     w = w * s; (*loc_ptr) += w;}
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -146,8 +147,8 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
   barrier(CLK_LOCAL_MEM_FENCE);

   // Store buffers
-  for(i=0;i<BUFF_WIDTH;i++)
-    s_buffer_l[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_l_loc[i];
+  for(i=0;i<used_width;i++)
+    s_buffer_l[i + gidY*used_width + gidZ*used_width*NB_II] = s_buff_l_loc[i];
 }
@@ -158,10 +159,11 @@ __kernel void buff_advec_and_remesh_l(__global const float* gvelo,
 __kernel void buff_advec_and_remesh_r(__global const float* gvelo,
-                              __global float* v_buffer_r,
-                              __global const float* pscal,
-                              __global float* s_buffer_r,
-                              float dt, __constant struct AdvectionMeshInfo* mesh)
+                                      __global float* v_buffer_r,
+                                      __global const float* pscal,
+                                      __global float* s_buffer_r,
+                                      int used_width,
+                                      float dt, __constant struct AdvectionMeshInfo* mesh)
 {
   int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */
   int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */
@@ -177,7 +179,7 @@ __kernel void buff_advec_and_remesh_r(__global const float* gvelo,
   float* loc_ptr;

   // Initialize buffers
-  for(i=0;i<BUFF_WIDTH;i++)
+  for(i=0;i<used_width;i++)
     s_buff_r_loc[i] = 0.0;

   for(i=0;i<V_BUFF_WIDTH;i++)
@@ -301,8 +303,8 @@ __kernel void buff_advec_and_remesh_r(__global const float* gvelo,
   /* Synchronize work-group */
   barrier(CLK_LOCAL_MEM_FENCE);

-  for(i=0;i<BUFF_WIDTH;i++)
-    s_buffer_r[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_r_loc[i];
+  for(i=0;i<used_width;i++)
+    s_buffer_r[i + gidY*used_width + gidZ*used_width*NB_II] = s_buff_r_loc[i];
 }
diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
index 0862c2ca0..f0dda3135 100644
--- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py
+++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
@@ -697,6 +697,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             self._cl_v_l_buff,
             self.fields_on_part[self.fields_on_grid[0]][0],
             self._cl_s_l_buff,
+            PARMES_INTEGER(self._s_buff_width_loc_m),
             self.gpu_precision(dt),
             self.gpu_precision(1. / self._v_mesh_size[1]),
             self.gpu_precision(1. / self._v_mesh_size[2]),
@@ -709,6 +710,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             self._cl_v_r_buff,
             self.fields_on_part[self.fields_on_grid[0]][0],
             self._cl_s_r_buff,
+            PARMES_INTEGER(self._s_buff_width_loc_p),
             self.gpu_precision(dt),
             self.gpu_precision(1. / self._v_mesh_size[1]),
             self.gpu_precision(1. / self._v_mesh_size[2]),
@@ -734,6 +736,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             self._cl_v_l_buff,
             self.fields_on_part[self.fields_on_grid[0]][0],
             self._cl_s_l_buff,
+            PARMES_INTEGER(self._s_buff_width_loc_m),
             self.gpu_precision(dt),
             self._cl_mesh_info,
             wait_for=wait_list)
@@ -744,6 +747,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             self._cl_v_r_buff,
             self.fields_on_part[self.fields_on_grid[0]][0],
             self._cl_s_r_buff,
+            PARMES_INTEGER(self._s_buff_width_loc_p),
             self.gpu_precision(dt),
             self._cl_mesh_info,
             wait_for=wait_list)
-- 
GitLab
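
Note on the new `used_width` argument: the kernels now receive the scalar buffer width at run time (the host passes `self._s_buff_width_loc_m` for the left buffer and `self._s_buff_width_loc_p` for the right one) instead of relying on the compile-time `BUFF_WIDTH` constant, so the store stride `i + gidY*used_width + gidZ*used_width*NB_II` also depends on that runtime value. The sketch below is not part of the patch; it only illustrates, with hypothetical sizes and plain NumPy, how a host-side reader could interpret a buffer laid out with that stride. The names `nb_ii` and `nb_z` stand in for the kernel's `NB_II` and the Z extent, and the assumption that `PARMES_INTEGER` maps to a 32-bit integer (matching the kernel's `int` parameter) is mine, not stated in the patch.

```python
import numpy as np

# Illustration only -- these sizes are hypothetical, not taken from the patch.
used_width = 4   # runtime buffer width (what the host passes as `used_width`)
nb_ii = 8        # stands in for the kernel's NB_II (Y extent)
nb_z = 6         # Z extent covered by gidZ

# A flat buffer filled the way the kernels' store loops write it:
#   s_buff[i + gidY*used_width + gidZ*used_width*NB_II] = value
flat = np.arange(used_width * nb_ii * nb_z, dtype=np.float32)

# Because the stride is the runtime `used_width`, the host can recover the
# (Z, Y, width) layout with a reshape that uses the same value:
cube = flat.reshape(nb_z, nb_ii, used_width)

gidZ, gidY, i = 5, 3, 2
assert cube[gidZ, gidY, i] == flat[i + gidY * used_width + gidZ * used_width * nb_ii]
```

Passing the width as a kernel argument rather than a compile-time constant also lets the left and right buffers use different widths, which appears to be the point of the change: the Python side passes `_s_buff_width_loc_m` and `_s_buff_width_loc_p` separately instead of sharing a single `BUFF_WIDTH`.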