From c32b73d5120c95b1dcb4af583d9f1a7dbb549c5a Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr>
Date: Thu, 19 Feb 2015 08:20:52 +0100
Subject: [PATCH] ok gpu multiresolution filter (ghosts exchange)

---
 .../cl_src/kernels/fine_to_coarse_filter.cl   |  28 ++-
 HySoP/hysop/gpu/config_default.py             |  10 +-
 HySoP/hysop/gpu/multiresolution_filter.py     | 196 ++++++++++++++++--
 .../gpu/tests/test_multiresolutionfilter.py   |  22 +-
 .../discrete/multiresolution_filter.py        |  60 ++----
 5 files changed, 246 insertions(+), 70 deletions(-)

diff --git a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl
index a4e6f753e..40328dcff 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl
@@ -26,6 +26,7 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in,
   float4 d;
   __local float line[WG*PTS_PER_CELL_X];
   __local float result[NB_OUT_X][L_STENCIL][L_STENCIL];
+  __private float p_res[L_STENCIL][L_STENCIL][L_STENCIL];
   ///// IDEE : calculer les points de la cellule ouput calculee par un w-i dans les registres et ensuite l'ajouter à la memoire locale.
   ///// Utiliser un tableau prive de taille [L_STENCIL][L_STENCIL][L_STENCIL].
 
@@ -46,6 +47,11 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in,
       // Each work-item is computing a coarse cell (looping in 3D over PTS_PER_CELL thanks to pt_x, pt_y and pt_z indices)
       // global fine grid data are cached line by line in the X direction
       coord_out = ((float4)(b_id*WG+lid, iy_c, iz_c, 0.0)) * dx_out;
+      // Initialize the register corresponding to the current cell
+      for (pt_z=0;pt_z<PTS_PER_CELL_Z;pt_z++)
+	for (pt_y=0;pt_y<PTS_PER_CELL_Y;pt_y++)
+	  for (pt_x=0;pt_x<PTS_PER_CELL_X;pt_x++)
+	    p_res[pt_x][pt_y][pt_z] = 0.0;
 
       // Loop over PTS_PER_CELL_Z: fine grid points in the curent cell
       for (pt_z=0;pt_z<PTS_PER_CELL_Z;pt_z++)
@@ -63,19 +69,25 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in,
 		{
 		  coord_in = ((float4)(b_id*(WG*PTS_PER_CELL_X) + lid*PTS_PER_CELL_X + pt_x, iy_f+pt_y, iz_f+pt_z, 0.0)) * dx_in;
 		  d = (coord_in  - coord_out) / dx_out;
-		  result[GHOSTS_OUT_X+b_id*WG+lid][0][0] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid][0][1] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid][1][0] += scale_factor * (1.0 - d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid][1][1] += scale_factor * (1.0 - d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid+1][0][0] += scale_factor * (d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid+1][0][1] += scale_factor * (d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid+1][1][0] += scale_factor * (d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
-		  result[GHOSTS_OUT_X+b_id*WG+lid+1][1][1] += scale_factor * (d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[0][0][0] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[0][0][1] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[0][1][0] += scale_factor * (1.0 - d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[0][1][1] += scale_factor * (1.0 - d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[1][0][0] += scale_factor * (d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[1][0][1] += scale_factor * (d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[1][1][0] += scale_factor * (d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x];
+		  p_res[1][1][1] += scale_factor * (d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x];
 
 		  barrier(CLK_LOCAL_MEM_FENCE);
 		}
 	    }
 	}
+      // Store the registers results in local memory
+      for (pt_z=0;pt_z<PTS_PER_CELL_Z;pt_z++)
+	for (pt_y=0;pt_y<PTS_PER_CELL_Y;pt_y++)
+	  for (pt_x=0;pt_x<PTS_PER_CELL_X;pt_x++)
+	    result[GHOSTS_OUT_X+b_id*WG+lid+pt_x][pt_y][pt_z] += p_res[pt_x][pt_y][pt_z];
+
     }
 
   // Write result in output array
diff --git a/HySoP/hysop/gpu/config_default.py b/HySoP/hysop/gpu/config_default.py
index 58755aede..4f9c8393c 100644
--- a/HySoP/hysop/gpu/config_default.py
+++ b/HySoP/hysop/gpu/config_default.py
@@ -221,8 +221,12 @@ kernels_config[3][FLOAT_GPU]['advec_MS_and_remesh_comm'] = \
      False, 1, advection_and_remeshing_index_space)
 
 
+def fine_to_coarse_filter_index_space(size, pts_per_cell):
+    wg = size[0] / (2 * pts_per_cell[0])
+    return ((wg, size[1] / pts_per_cell[1], size[2] / pts_per_cell[2]),
+            (wg, 1, 1))
+
+
 kernels_config[3][FLOAT_GPU]['fine_to_coarse_filter'] = \
     (["common.cl", "kernels/fine_to_coarse_filter.cl"],
-     1,
-     lambda size, pts_per_cell: ((32, size[1] / pts_per_cell[1], size[2] / pts_per_cell[2]),
-                                 (32, 1, 1)))
+     1, fine_to_coarse_filter_index_space)
diff --git a/HySoP/hysop/gpu/multiresolution_filter.py b/HySoP/hysop/gpu/multiresolution_filter.py
index d207e677e..b0fd39802 100644
--- a/HySoP/hysop/gpu/multiresolution_filter.py
+++ b/HySoP/hysop/gpu/multiresolution_filter.py
@@ -6,6 +6,7 @@ from hysop.operator.discrete.discrete import get_extra_args_from_method
 from hysop.gpu.gpu_discrete import GPUDiscreteField
 from hysop.gpu.gpu_kernel import KernelLauncher
 from hysop.methods_keys import Remesh
+from hysop.gpu import cl
 
 
 class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
@@ -21,11 +22,11 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         self._cl_work_size = 0
         self._mesh_in = self.field_in.topology.mesh
         self._mesh_out = self.field_out.topology.mesh
-        gh_out = self.field_out.topology.ghosts()
+        self.gh_out = self.field_out.topology.ghosts()
         gh_in = self.field_in.topology.ghosts()
         #print gh_in, gh_out
         resol_in = self._mesh_in.resolution - 2 * gh_in
-        resol_out = self._mesh_out.resolution - 2 * gh_out
+        resol_out = self._mesh_out.resolution - 2 * self.gh_out
         pts_per_cell = resol_in / resol_out
         assert np.all(pts_per_cell >= 1), "This operator is fine grid to coarse one"
         self.scale_factor = np.prod(self._mesh_in.space_step) / \
@@ -82,22 +83,36 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         self.resol_out[:self.dim] = self._reorderVect(shape_out)
         self._append_size_constants(resol_in, prefix='NB_IN')
         self._append_size_constants(resol_out, prefix='NB_OUT')
-        self._append_size_constants(topo_out.ghosts(), prefix='GHOSTS_OUT')
+        self._append_size_constants(self.gh_out, prefix='GHOSTS_OUT')
         self._append_size_constants(pts_per_cell, prefix='PTS_PER_CELL')
 
+        # Ghosts temp arrays
+        self.gh_x = npw.zeros((4 * self.gh_out[0], shape_out[1], shape_out[2]))
+        self.gh_y = npw.zeros((shape_out[0], 4 * self.gh_out[1], shape_out[2]))
+        self.gh_z = npw.zeros((shape_out[0], shape_out[1], 4 * self.gh_out[2]))
+        print self.gh_x.shape, self.gh_y.shape, self.gh_z.shape
+        self._pitches_host_x = (int(self.gh_x[:, 0, 0].nbytes),
+                                int(self.gh_x[:, :, 0].nbytes))
+        self._pitches_host_y = (int(self.gh_y[:, 0, 0].nbytes),
+                                int(self.gh_y[:, :, 0].nbytes))
+        self._pitches_host_z = (int(self.gh_z[:, 0, 0].nbytes),
+                                int(self.gh_z[:, :, 0].nbytes))
+        self._pitches_buff = (int(self.field_out.data[0][:, 0, 0].nbytes),
+                              int(self.field_out.data[0][:, :, 0].nbytes))
+
         src, vec, f_space = \
             self._kernel_cfg['fine_to_coarse_filter']
         build_options = self._size_constants
         self._rmsh = self.method[Remesh]()
+        gwi, lwi = f_space(self.field_out.data[0].shape -
+                           2 * topo_out.ghosts(), pts_per_cell)
         build_options += " -D L_STENCIL=" + str(len(self._rmsh.weights))
         build_options += " -D SHIFT_STENCIL=" + str(self._rmsh.shift)
-        build_options += " -D WG=32"
+        build_options += " -D WG=" + str(lwi[0])
         build_options += " -D FORMULA=" + self.method[Remesh].__name__.upper()
         print build_options
         prg = self.cl_env.build_src(src, build_options, vec)
-        gwi, lwi = f_space(self.field_out.data[0].shape -
-                           2 * topo_out.ghosts(), pts_per_cell)
-        self.num_diffusion = KernelLauncher(
+        self.fine_to_coarse = KernelLauncher(
             prg.coarse_to_fine_filter, self.cl_env.queue, gwi, lwi)
         print gwi, lwi
 
@@ -105,11 +120,162 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         evts = []
         for iy in xrange(len(self._rmsh.weights)):
             for iz in xrange(len(self._rmsh.weights)):
-                evts.append(self.num_diffusion(self.field_in.gpu_data[0],
-                                               self.field_out.gpu_data[0],
-                                               self.scale_factor,
-                                               self._mesh_size_in,
-                                               self._mesh_size_out,
-                                               self._domain_origin,
-                                               np.int32(iy), np.int32(iz),
-                                               wait_for=evts))
+                evts.append(self.fine_to_coarse(self.field_in.gpu_data[0],
+                                                self.field_out.gpu_data[0],
+                                                self.scale_factor,
+                                                self._mesh_size_in,
+                                                self._mesh_size_out,
+                                                self._domain_origin,
+                                                np.int32(iy), np.int32(iz),
+                                                wait_for=evts))
+                self.field_out.events.append(evts[-1])
+        # Get ghosts values and in-domain layer
+        # X-direction
+        get_gh_xl = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.gh_x, self.field_out.gpu_data[0],
+            host_origin=(0, 0, 0),
+            buffer_origin=(0, 0, 0),
+            host_pitches=self._pitches_host_x,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_x[:2, 0, 0].nbytes,
+                    self.gh_x.shape[1],
+                    self.gh_x.shape[2]),
+            wait_for=evts)
+        get_gh_xr = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.gh_x, self.field_out.gpu_data[0],
+            host_origin=(self.gh_x[:2, 0, 0].nbytes, 0, 0),
+            buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes -
+                           self.gh_x[:2, 0, 0].nbytes, 0, 0),
+            host_pitches=self._pitches_host_x,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_x[:2, 0, 0].nbytes,
+                    self.gh_x.shape[1],
+                    self.gh_x.shape[2]),
+            wait_for=evts)
+        get_gh_xl.wait()
+        get_gh_xr.wait()
+        # Add ghosts contributions in domain layer
+        self.gh_x[2, :, :] += self.gh_x[0, :, :]
+        self.gh_x[1, :, :] += self.gh_x[-1, :, :]
+        set_gh_xl = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.field_out.gpu_data[0], self.gh_x,
+            host_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0),
+            buffer_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0),
+            host_pitches=self._pitches_host_x,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_x[:1, 0, 0].nbytes,
+                    self.gh_x.shape[1],
+                    self.gh_x.shape[2]),
+            wait_for=evts)
+        set_gh_xr = cl.enqueue_copy(
+            self.cl_env.queue,
+             self.field_out.gpu_data[0], self.gh_x,
+            host_origin=(self.gh_x[:3, 0, 0].nbytes, 0, 0),
+            buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes -
+                           self.gh_x[:1, 0, 0].nbytes, 0, 0),
+            host_pitches=self._pitches_host_x,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_x[:1, 0, 0].nbytes,
+                    self.gh_x.shape[1],
+                    self.gh_x.shape[2]),
+            wait_for=evts)
+        set_gh_xl.wait()
+        set_gh_xr.wait()
+
+        # Y-direction
+        get_gh_yl = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.gh_y, self.field_out.gpu_data[0],
+            host_origin=(0, 0, 0),
+            buffer_origin=(0, 0, 0),
+            host_pitches=self._pitches_host_y,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]),
+            wait_for=evts)
+        get_gh_yr = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.gh_y, self.field_out.gpu_data[0],
+            host_origin=(0, 2, 0),
+            buffer_origin=(0, self.field_out.data[0].shape[1] - 2, 0),
+            host_pitches=self._pitches_host_y,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]),
+            wait_for=evts)
+        get_gh_yl.wait()
+        get_gh_yr.wait()
+        # Add ghosts contributions in domain layer
+        self.gh_y[:, 2, :] += self.gh_y[:, 0, :]
+        self.gh_y[:, 1, :] += self.gh_y[:, -1, :]
+        set_gh_yl = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.field_out.gpu_data[0], self.gh_y,
+            host_origin=(0, 1, 0),
+            buffer_origin=(0, 1, 0),
+            host_pitches=self._pitches_host_y,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]),
+            wait_for=evts)
+        set_gh_yr = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.field_out.gpu_data[0], self.gh_y,
+            host_origin=(0, 3, 0),
+            buffer_origin=(0, self.field_out.data[0].shape[1] - 1, 0),
+            host_pitches=self._pitches_host_y,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]),
+            wait_for=evts)
+        set_gh_yl.wait()
+        set_gh_yr.wait()
+
+        # Z-direction
+        get_gh_zl = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.gh_z, self.field_out.gpu_data[0],
+            host_origin=(0, 0, 0),
+            buffer_origin=(0, 0, 0),
+            host_pitches=self._pitches_host_z,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2),
+            wait_for=evts)
+        get_gh_zr = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.gh_z, self.field_out.gpu_data[0],
+            host_origin=(0, 0, 2),
+            buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 2),
+            host_pitches=self._pitches_host_z,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2),
+            wait_for=evts)
+        get_gh_zl.wait()
+        get_gh_zr.wait()
+        # Add ghosts contributions in domain layer
+        self.gh_z[:, :, 2] += self.gh_z[:, :, 0]
+        self.gh_z[:, :, 1] += self.gh_z[:, :, -1]
+        set_gh_zl = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.field_out.gpu_data[0], self.gh_z,
+            host_origin=(0, 0, 1),
+            buffer_origin=(0, 0, 1),
+            host_pitches=self._pitches_host_z,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1),
+            wait_for=evts)
+        set_gh_zr = cl.enqueue_copy(
+            self.cl_env.queue,
+            self.field_out.gpu_data[0], self.gh_z,
+            host_origin=(0, 0, 3),
+            buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 1),
+            host_pitches=self._pitches_host_z,
+            buffer_pitches=self._pitches_buff,
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1),
+            wait_for=evts)
+        set_gh_zl.wait()
+        set_gh_zr.wait()
+
+
+    def get_profiling_info(self):
+        for p in self.fine_to_coarse.profile:
+            self.profiler += p
diff --git a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py
index 9ab6c48d2..004bc89d0 100644
--- a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py
+++ b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py
@@ -1,3 +1,5 @@
+import hysop.gpu
+hysop.gpu.CL_PROFILE = True
 from hysop.problem.simulation import Simulation
 from hysop.tools.parameters import Discretization
 from hysop.domain.box import Box
@@ -15,8 +17,8 @@ simu = Simulation(tinit=0., tend=0.1, nbIter=1)
 
 
 def func(res, x, y, z, t=0):
-    res[0][...] = np.sin(2. * np.pi * x) * \
-                  np.cos(2. * np.pi * y) * np.sin(4. * np.pi * z)
+    res[0][...] = np.cos(2. * np.pi * x) * \
+                  np.cos(2. * np.pi * y) * np.cos(4. * np.pi * z)
     return res
 
 
@@ -62,9 +64,21 @@ def test_filter():
     # print np.where(f_out.data[0][topo_coarse.mesh.iCompute]>0.0001)
     # print valid[0][topo_coarse.mesh.iCompute][32+4:32+12,2:6,2:6]
     # print f_out.data[0][topo_coarse.mesh.iCompute][32+4:32+12,2:6,2:6]
-    e = np.max(np.abs(valid[0][topo_coarse.mesh.iCompute][1:-1,1:-1,1:-1] -
-                      f_out[0][topo_coarse.mesh.iCompute][1:-1,1:-1,1:-1]))
+    # err = valid[0] - f_out[0]
+    # print "MAX X", np.max(f_out[0][0,:,:]), np.max(f_out[0][-1,:,:])
+    #print "MAX Y", np.max(f_out[0][:,0,:]), np.max(f_out[0][:,-1,:])
+    #print "MAX vY", np.max(valid[0][:,1,:]), np.max(valid[0][:,-2,:])
+    # print "MAX Z", np.max(f_out[0][:,:,0]), np.max(f_out[0][:,:,-1])
+    # print np.where(err[:2,:,:] > 0.0001)
+    # print err[:3,-4:,-4:]
+    e = np.max(np.abs(valid[0][topo_coarse.mesh.iCompute] -
+                      f_out[0][topo_coarse.mesh.iCompute]))
     print e
+    ## PB DE CUMLUL DES GHPSTS DANS LES COINS DE DIRECTION EN DIRECTIO?
+    #print np.where(np.abs(valid[0][topo_coarse.mesh.iCompute][:,0,1:-1] - \
+    #             f_out[0][topo_coarse.mesh.iCompute][:,0,1:-1]) > 0.001)
+    op.profiler.summarize()
+    print op.profiler
 
 
 if __name__ == '__main__':
diff --git a/HySoP/hysop/operator/discrete/multiresolution_filter.py b/HySoP/hysop/operator/discrete/multiresolution_filter.py
index 3565e7a38..b463b0a1c 100644
--- a/HySoP/hysop/operator/discrete/multiresolution_filter.py
+++ b/HySoP/hysop/operator/discrete/multiresolution_filter.py
@@ -107,82 +107,62 @@ class FilterFineToCoarse(DiscreteOperator):
             self._bc_from_ghosts.append((
                 slice(self._mesh_out.iCompute[0].stop,
                       self._mesh_out.iCompute[0].stop + gh_out[0], None),
-                self._mesh_out.iCompute[1],
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None), slice(None)))
             self._bc_to_compute.append((
                 slice(self._mesh_out.iCompute[0].start,
                       self._mesh_out.iCompute[0].start + gh_out[0], None),
-                self._mesh_out.iCompute[1],
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None), slice(None)))
             # Left X-dir ghosts
             self._bc_from_ghosts.append((
                 slice(self._mesh_out.iCompute[0].start - gh_out[0],
                       self._mesh_out.iCompute[0].start, None),
-                self._mesh_out.iCompute[1],
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None), slice(None)))
             self._bc_to_compute.append((
                 slice(self._mesh_out.iCompute[0].stop - gh_out[0],
                       self._mesh_out.iCompute[0].stop, None),
-                self._mesh_out.iCompute[1],
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None), slice(None)))
         if gh_out[1] > 0:
             # Right Y-dir ghosts
             self._bc_from_ghosts.append((
-                self._mesh_out.iCompute[0],
+                slice(None),
                 slice(self._mesh_out.iCompute[1].stop,
                       self._mesh_out.iCompute[1].stop + gh_out[1], None),
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None)))
             self._bc_to_compute.append((
-                self._mesh_out.iCompute[0],
+                slice(None),
                 slice(self._mesh_out.iCompute[1].start,
                       self._mesh_out.iCompute[1].start + gh_out[1], None),
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None)))
             # Left Y-dir ghosts
             self._bc_from_ghosts.append((
-                self._mesh_out.iCompute[0],
+                slice(None),
                 slice(self._mesh_out.iCompute[1].start - gh_out[1],
                       self._mesh_out.iCompute[1].start, None),
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None)))
             self._bc_to_compute.append((
-                self._mesh_out.iCompute[0],
+                slice(None),
                 slice(self._mesh_out.iCompute[1].stop - gh_out[1],
                       self._mesh_out.iCompute[1].stop, None),
-                self._mesh_out.iCompute[2]
-            ))
+                slice(None)))
         if gh_out[2] > 0:
             # Right Z-dir ghosts
             self._bc_from_ghosts.append((
-                self._mesh_out.iCompute[0],
-                self._mesh_out.iCompute[1],
+                slice(None), slice(None),
                 slice(self._mesh_out.iCompute[2].stop,
-                      self._mesh_out.iCompute[2].stop + gh_out[2], None)
-            ))
+                      self._mesh_out.iCompute[2].stop + gh_out[2], None)))
             self._bc_to_compute.append((
-                self._mesh_out.iCompute[0],
-                self._mesh_out.iCompute[1],
+                slice(None), slice(None),
                 slice(self._mesh_out.iCompute[2].start,
-                      self._mesh_out.iCompute[2].start + gh_out[2], None)
-            ))
+                      self._mesh_out.iCompute[2].start + gh_out[2], None)))
             # Left Z-dir ghosts
             self._bc_from_ghosts.append((
-                self._mesh_out.iCompute[0],
-                self._mesh_out.iCompute[1],
+                slice(None), slice(None),
                 slice(self._mesh_out.iCompute[2].start - gh_out[2],
-                      self._mesh_out.iCompute[2].start, None)
-            ))
+                      self._mesh_out.iCompute[2].start, None)))
             self._bc_to_compute.append((
-                self._mesh_out.iCompute[0],
-                self._mesh_out.iCompute[1],
+                slice(None), slice(None),
                 slice(self._mesh_out.iCompute[2].stop - gh_out[2],
-                      self._mesh_out.iCompute[2].stop, None)
-            ))
+                      self._mesh_out.iCompute[2].stop, None)))
 
     @debug
     @profile
-- 
GitLab