From 4fd0c854f88e2246a255222a222c41e88f8a6efa Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr>
Date: Sat, 6 Sep 2014 12:09:32 +0200
Subject: [PATCH] Fix multi-GPU multiscale advection: ghosts are taken into
 account in computing velocity line indices

---
 .../cl_src/kernels/comm_MS_advection_noVec.cl | 44 +++++++-----
 .../comm_advection_MS_and_remeshing_noVec.cl  | 44 +++++++-----
 .../hysop/gpu/multi_gpu_particle_advection.py | 72 ++++++++++---------
 3 files changed, 89 insertions(+), 71 deletions(-)

diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl
index e57be6ae2..2f7193be4 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl
@@ -15,7 +15,8 @@ __kernel void buff_advec(__global const float* gvelo,
   int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
   int i;			/* Particle index in 1D problem */
   int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
-  float p,v,c, hY, hZ;
+  float p,v,c;
+  float2 hY, hZ;
   int i_ind, i_indY, i_indZ;
 
 
@@ -25,32 +26,37 @@ __kernel void buff_advec(__global const float* gvelo,
   __local float* loc_ptr;
 
 
-  hY = (gidY * mesh->dx.y) * inv_v_dx_y;
-  hZ = (gidZ * mesh->dx.z) * inv_v_dx_z;
-  i_indY = convert_int_rtn(hY);
-  i_indZ = convert_int_rtn(hZ);
-  hY = hY - convert_float(i_indY);
-  hZ = hZ - convert_float(i_indZ);
+  hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y;
+  hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z;
+  i_indY = convert_int_rtn(hY.s0);
+  i_indZ = convert_int_rtn(hZ.s0);
+  hY.s0 = hY.s0 - convert_float(i_indY);
+  hZ.s0 = hZ.s0 - convert_float(i_indZ);
+  hY.s1 = (1.0-hY.s0);
+  hZ.s1 = (1.0-hZ.s0);
+
+  i_indY = i_indY + V_GHOSTS_NB;
+  i_indZ = i_indZ + V_GHOSTS_NB;
 
   for(i=gidX; i<V_NB_I; i+=(WI_NB)){
-    velocity_cache[noBC_id(i)] = (1.0-hY)*(1.0-hZ) * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II];
-    velocity_cache[noBC_id(i)] += (1.0-hY)*hZ * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
-    velocity_cache[noBC_id(i)] += hY*(1.0-hZ) * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II];
-    velocity_cache[noBC_id(i)] += hY*hZ * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
   }
 
   for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){
-    buff_l_loc[i] = (1.0-hY)*(1.0-hZ)*buffer_l[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
-    buff_l_loc[i] += (1.0-hY)*hZ*buffer_l[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
-    buff_l_loc[i] += hY*(1.0-hZ)*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
-    buff_l_loc[i] += hY*hZ*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
+    buff_l_loc[i] = hY.s1*hZ.s1*buffer_l[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
+    buff_l_loc[i] += hY.s1*hZ.s0*buffer_l[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
+    buff_l_loc[i] += hY.s0*hZ.s1*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
+    buff_l_loc[i] += hY.s0*hZ.s0*buffer_l[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
   }
 
   for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){
-    buff_r_loc[i] = (1.0-hY)*(1.0-hZ)*buffer_r[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
-    buff_r_loc[i] += (1.0-hY)*hZ*buffer_r[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
-    buff_r_loc[i] += hY*(1.0-hZ)*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
-    buff_r_loc[i] += hY*hZ*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
+    buff_r_loc[i] = hY.s1*hZ.s1*buffer_r[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
+    buff_r_loc[i] += hY.s1*hZ.s0*buffer_r[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
+    buff_r_loc[i] += hY.s0*hZ.s1*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
+    buff_r_loc[i] += hY.s0*hZ.s0*buffer_r[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
   }
 
   /* Synchronize work-group */
diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
index 957c0d6b6..87127b396 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
@@ -18,7 +18,8 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
   int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
   int i;			/* Particle index in 1D problem */
   int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
-  float p,v,c, hY, hZ,s,y,w;
+  float p,v,c,s,y,w;
+  float2 hY, hZ;
   int i_ind, i_indY, i_indZ;
   int ind, index;
 
@@ -46,32 +47,37 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
 
 
 
-  hY = (gidY * mesh->dx.y) * inv_v_dx_y;
-  hZ = (gidZ * mesh->dx.z) * inv_v_dx_z;
-  i_indY = convert_int_rtn(hY);
-  i_indZ = convert_int_rtn(hZ);
-  hY = hY - convert_float(i_indY);
-  hZ = hZ - convert_float(i_indZ);
+  hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y;
+  hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z;
+  i_indY = convert_int_rtn(hY.s0);
+  i_indZ = convert_int_rtn(hZ.s0);
+  hY.s0 = hY.s0 - convert_float(i_indY);
+  hZ.s0 = hZ.s0 - convert_float(i_indZ);
+  hY.s1 = (1.0-hY.s0);
+  hZ.s1 = (1.0-hZ.s0);
+
+  i_indY = i_indY + V_GHOSTS_NB;
+  i_indZ = i_indZ + V_GHOSTS_NB;
 
   for(i=gidX; i<V_NB_I; i+=(WI_NB)){
-    velocity_cache[noBC_id(i)] = (1.0-hY)*(1.0-hZ) * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II];
-    velocity_cache[noBC_id(i)] += (1.0-hY)*hZ * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
-    velocity_cache[noBC_id(i)] += hY*(1.0-hZ) * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II];
-    velocity_cache[noBC_id(i)] += hY*hZ * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
   }
 
   for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){
-    v_l_buff_loc[i] = (1.0-hY)*(1.0-hZ)*v_l_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
-    v_l_buff_loc[i] += (1.0-hY)*hZ*v_l_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
-    v_l_buff_loc[i] += hY*(1.0-hZ)*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
-    v_l_buff_loc[i] += hY*hZ*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
+    v_l_buff_loc[i] = hY.s1*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
+    v_l_buff_loc[i] += hY.s1*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
+    v_l_buff_loc[i] += hY.s0*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
+    v_l_buff_loc[i] += hY.s0*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
   }
 
   for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){
-    v_r_buff_loc[i] = (1.0-hY)*(1.0-hZ)*v_r_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
-    v_r_buff_loc[i] += (1.0-hY)*hZ*v_r_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
-    v_r_buff_loc[i] += hY*(1.0-hZ)*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
-    v_r_buff_loc[i] += hY*hZ*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
+    v_r_buff_loc[i] = hY.s1*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
+    v_r_buff_loc[i] += hY.s1*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
+    v_r_buff_loc[i] += hY.s0*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
+    v_r_buff_loc[i] += hY.s0*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
   }
 
   /* Synchronize work-group */
diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
index 9b2f783ae..1162ddb27 100644
--- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py
+++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
@@ -25,8 +25,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
     __metaclass__ = ABCMeta
 
     @debug
-    def __init__(self, platform_id=None, device_id=None, device_type=None,
-                 user_src=None, max_cfl=5, **kwds):
+    def __init__(self, max_velocity=None, max_dt=None, **kwds):
         """
         Create a Advection operator.
         Work on a given field (scalar or vector) at a given velocity to compute
@@ -34,31 +33,23 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         OpenCL kernels are build once per dimension in order to handle
         directional splitting with resolution non uniform in directions.
 
-        @param velocity : Velocity field
-        @param fields_on_grid : Advected fields
-        @param d : Direction to advect
-        @param platform_id : OpenCL platform id (default = 0).
-        @param device_id : OpenCL device id (default = 0).
-        @param device_type : OpenCL device type (default = 'gpu').
-        @param method : the method to use. {'m4prime', 'm6prime', 'm8prime',
-        'l6star'}
-        @param user_src : User OpenCL sources.
-        @param splittingConfig : Directional splitting configuration
-        (parmepy.numerics.splitting.Splitting.__init__)
+        @param max_velocity : maximum velocity estimation for computing communications buffer sizes.
+        The estimation may be global or by components.
+        @param max_dt : maimum time step estimation.
+        @remark : Buffer widths are computed from max_velocity, max_dt and the mesh sizes:
+          - velocity buffers : |max_velocity * max_dt / v_dx| + 1
+          - scalar buffers : |max_velocity * max_dt / s_dx| + 1 + remesh_stencil/2
         """
         fields_topo = kwds['fields_on_grid'][0].topology
         direction = kwds['direction']
         self._cut_dir = fields_topo.cutdir
         self._is_cut_dir = self._cut_dir[direction]
 
-        super(MultiGPUParticleAdvection, self).__init__(
-            platform_id=platform_id, device_id=device_id,
-            device_type=device_type, user_src=user_src,
-            **kwds)
-        try:
-            self.max_cfl = max_cfl[self.direction]
-        except TypeError:
-            self.max_cfl = max_cfl
+        super(MultiGPUParticleAdvection, self).__init__(**kwds)
+
+        msg = "max_dt and _max_velocity must be given to advection "
+        msg += "for computing communication buffer sizes."
+        assert max_dt is not None and max_velocity is not None
 
         msg = "The Multi-GPU works only with the RK2 TimeIntegrator"
         assert self.method[TimeIntegrator] == RK2, msg
@@ -108,12 +99,27 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         # mesh step
         self._space_step = msh.space_step
         self._v_space_step = v_msh.space_step
+
+        # Maximum cfl for velocity and scalar
+        try:
+            self.max_cfl_s = int(max_velocity[self.direction] * max_dt /
+                                 self._space_step[self.direction]) + 1
+            self.max_cfl_v = int(max_velocity[self.direction] * max_dt /
+                                 self._v_space_step[self.direction]) + 1
+        except TypeError:
+            self.max_cfl_s = int(max_velocity * max_dt /
+                                 self._space_step[self.direction]) + 1
+            self.max_cfl_v = int(max_velocity * max_dt /
+                                 self._v_space_step[self.direction]) + 1
+
+
+
         # Slice
         self._sl_dim = slice(self.dim, 2 * self.dim)
         self._cl_work_size = 0
         if self._is_cut_dir:
             #Advection variables
-            self._v_buff_width = self.max_cfl
+            self._v_buff_width = self.max_cfl_v
             self._v_r_buff = npw.zeros((self._v_buff_width,
                                         self.v_resol_dir[1],
                                         self.v_resol_dir[2]))
@@ -128,12 +134,12 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
                             self._cl_v_l_buff, self._v_l_buff).wait()
             self._cl_work_size += 2 * self._v_l_buff.nbytes
 
-            self._v_buff_size = self.max_cfl * \
+            self._v_buff_size = self._v_buff_width * \
                 self.v_resol_dir[1] * self.v_resol_dir[2]
             self._v_pitches_host = (int(self._v_l_buff[:, 0, 0].nbytes),
                                     int(self._v_l_buff[:, :, 0].nbytes))
             self._v_buffer_region = (
-                int(self.max_cfl * PARMES_REAL(0.).nbytes),
+                int(self._v_buff_width * PARMES_REAL(0.).nbytes),
                 int(self.v_resol_dir[1]),
                 int(self.v_resol_dir[2]))
 
@@ -147,7 +153,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
             self._py_remesh = self.method[Remesh]()
             ## Number of weights
             self._nb_w = len(self._py_remesh.weights)
-            self._s_buff_width = self.max_cfl + self._nb_w / 2
+            self._s_buff_width = self.max_cfl_s + self._nb_w / 2
             self._s_l_buff = npw.zeros((self._s_buff_width,
                                         self.resol_dir[1],
                                         self.resol_dir[2]))
@@ -240,7 +246,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         build_options += " -D V_STOP_INDEX=" + str(self._v_stop_index)
         build_options += " -D START_INDEX=" + str(self._start_index)
         build_options += " -D STOP_INDEX=" + str(self._stop_index)
-        build_options += " -D V_BUFF_WIDTH=" + str(self.max_cfl)
+        build_options += " -D V_BUFF_WIDTH=" + str(self._v_buff_width)
         build_options += " -D FORMULA=" + self.method[Remesh].__name__.upper()
         build_options += " -D PART_NB_PER_WI="
         build_options += str(self.resol_dir[0] / WINb)
@@ -272,7 +278,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         build_options += " -D V_STOP_INDEX=" + str(self._v_stop_index)
         build_options += " -D START_INDEX=" + str(self._start_index)
         build_options += " -D STOP_INDEX=" + str(self._stop_index)
-        build_options += " -D V_BUFF_WIDTH=" + str(self.max_cfl)
+        build_options += " -D V_BUFF_WIDTH=" + str(self._v_buff_width)
         prg = self.cl_env.build_src(
             src, build_options, 1)
         self.num_advec = KernelLauncher(
@@ -302,33 +308,33 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
     def _exchange_velocity_buffers(self):
         ghosts = self.velocity_topo.ghosts()[self.direction]
         if self._is_cut_dir and self.direction == 0:
-            velo_sl = (slice(self.v_resol_dir[0] - self.max_cfl - ghosts,
+            velo_sl = (slice(self.v_resol_dir[0] - self._v_buff_width - ghosts,
                              self.v_resol_dir[0] - ghosts),
                        slice(None), slice(None),)
             self._v_r_buff_loc[...] = self.velocity.data[0][velo_sl]
-            velo_sl = (slice(0 + ghosts, self.max_cfl + ghosts),
+            velo_sl = (slice(0 + ghosts, self._v_buff_width + ghosts),
                        slice(None), slice(None))
             self._v_l_buff_loc[...] = self.velocity.data[0][velo_sl]
         if self._is_cut_dir and self.direction == 1:
             velo_sl = (slice(None),
-                       slice(self.v_resol_dir[0] - self.max_cfl - ghosts,
+                       slice(self.v_resol_dir[0] - self._v_buff_width - ghosts,
                              self.v_resol_dir[0] - ghosts),
                        slice(None))
             self._v_r_buff_loc[...] = \
                 self.velocity.data[1][velo_sl].swapaxes(0, 1)
             velo_sl = (slice(None),
-                       slice(0 + ghosts, self.max_cfl + ghosts),
+                       slice(0 + ghosts, self._v_buff_width + ghosts),
                        slice(None))
             self._v_l_buff_loc[...] = \
                 self.velocity.data[1][velo_sl].swapaxes(0, 1)
         if self._is_cut_dir and self.direction == 2:
             velo_sl = (slice(None), slice(None),
-                       slice(self.v_resol_dir[0] - self.max_cfl - ghosts,
+                       slice(self.v_resol_dir[0] - self._v_buff_width - ghosts,
                              self.v_resol_dir[0] - ghosts))
             self._v_r_buff_loc[...] = \
                 self.velocity.data[2][velo_sl].swapaxes(0, 1).swapaxes(0, 2)
             velo_sl = (slice(None), slice(None),
-                       slice(0 + ghosts, self.max_cfl + ghosts))
+                       slice(0 + ghosts, self._v_buff_width + ghosts))
             self._v_l_buff_loc[...] = \
                 self.velocity.data[2][velo_sl].swapaxes(0, 1).swapaxes(0, 2)
 
-- 
GitLab