diff --git a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl index a4e6f753edfa66b5aa0bbb61425b6623681c6199..40328dcff62af3414576f71e3958c3022643f5ee 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl @@ -26,6 +26,7 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in, float4 d; __local float line[WG*PTS_PER_CELL_X]; __local float result[NB_OUT_X][L_STENCIL][L_STENCIL]; + __private float p_res[L_STENCIL][L_STENCIL][L_STENCIL]; ///// IDEE : calculer les points de la cellule ouput calculee par un w-i dans les registres et ensuite l'ajouter à la memoire locale. ///// Utiliser un tableau prive de taille [L_STENCIL][L_STENCIL][L_STENCIL]. @@ -46,6 +47,11 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in, // Each work-item is computing a coarse cell (looping in 3D over PTS_PER_CELL thanks to pt_x, pt_y and pt_z indices) // global fine grid data are cached line by line in the X direction coord_out = ((float4)(b_id*WG+lid, iy_c, iz_c, 0.0)) * dx_out; + // Initialize the register corresponding to the current cell + for (pt_z=0;pt_z<PTS_PER_CELL_Z;pt_z++) + for (pt_y=0;pt_y<PTS_PER_CELL_Y;pt_y++) + for (pt_x=0;pt_x<PTS_PER_CELL_X;pt_x++) + p_res[pt_x][pt_y][pt_z] = 0.0; // Loop over PTS_PER_CELL_Z: fine grid points in the curent cell for (pt_z=0;pt_z<PTS_PER_CELL_Z;pt_z++) @@ -63,19 +69,25 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in, { coord_in = ((float4)(b_id*(WG*PTS_PER_CELL_X) + lid*PTS_PER_CELL_X + pt_x, iy_f+pt_y, iz_f+pt_z, 0.0)) * dx_in; d = (coord_in - coord_out) / dx_out; - result[GHOSTS_OUT_X+b_id*WG+lid][0][0] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid][0][1] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid][1][0] += scale_factor * (1.0 - d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid][1][1] += scale_factor * (1.0 - d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid+1][0][0] += scale_factor * (d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid+1][0][1] += scale_factor * (d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid+1][1][0] += scale_factor * (d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; - result[GHOSTS_OUT_X+b_id*WG+lid+1][1][1] += scale_factor * (d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[0][0][0] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[0][0][1] += scale_factor * (1.0 - d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[0][1][0] += scale_factor * (1.0 - d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[0][1][1] += scale_factor * (1.0 - d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[1][0][0] += scale_factor * (d.x) * (1.0 - d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[1][0][1] += scale_factor * (d.x) * (1.0 - d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[1][1][0] += scale_factor * (d.x) * (d.y) * (1.0 - d.z) * line[lid*PTS_PER_CELL_X+pt_x]; + p_res[1][1][1] += scale_factor * (d.x) * (d.y) * (d.z) * line[lid*PTS_PER_CELL_X+pt_x]; barrier(CLK_LOCAL_MEM_FENCE); } } } + // Store the registers results in local memory + for (pt_z=0;pt_z<PTS_PER_CELL_Z;pt_z++) + for (pt_y=0;pt_y<PTS_PER_CELL_Y;pt_y++) + for (pt_x=0;pt_x<PTS_PER_CELL_X;pt_x++) + result[GHOSTS_OUT_X+b_id*WG+lid+pt_x][pt_y][pt_z] += p_res[pt_x][pt_y][pt_z]; + } // Write result in output array diff --git a/HySoP/hysop/gpu/config_default.py b/HySoP/hysop/gpu/config_default.py index 58755aedecd4f78fbfeedc104c04cc073e564947..4f9c8393cd0af2edaf032ae81b6ac04c43de32cf 100644 --- a/HySoP/hysop/gpu/config_default.py +++ b/HySoP/hysop/gpu/config_default.py @@ -221,8 +221,12 @@ kernels_config[3][FLOAT_GPU]['advec_MS_and_remesh_comm'] = \ False, 1, advection_and_remeshing_index_space) +def fine_to_coarse_filter_index_space(size, pts_per_cell): + wg = size[0] / (2 * pts_per_cell[0]) + return ((wg, size[1] / pts_per_cell[1], size[2] / pts_per_cell[2]), + (wg, 1, 1)) + + kernels_config[3][FLOAT_GPU]['fine_to_coarse_filter'] = \ (["common.cl", "kernels/fine_to_coarse_filter.cl"], - 1, - lambda size, pts_per_cell: ((32, size[1] / pts_per_cell[1], size[2] / pts_per_cell[2]), - (32, 1, 1))) + 1, fine_to_coarse_filter_index_space) diff --git a/HySoP/hysop/gpu/multiresolution_filter.py b/HySoP/hysop/gpu/multiresolution_filter.py index d207e677ed38e06bfc1e2f7bbf926147a55c7464..b0fd3980230e086ce379d8b65a78a19b13b2b3fd 100644 --- a/HySoP/hysop/gpu/multiresolution_filter.py +++ b/HySoP/hysop/gpu/multiresolution_filter.py @@ -6,6 +6,7 @@ from hysop.operator.discrete.discrete import get_extra_args_from_method from hysop.gpu.gpu_discrete import GPUDiscreteField from hysop.gpu.gpu_kernel import KernelLauncher from hysop.methods_keys import Remesh +from hysop.gpu import cl class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): @@ -21,11 +22,11 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): self._cl_work_size = 0 self._mesh_in = self.field_in.topology.mesh self._mesh_out = self.field_out.topology.mesh - gh_out = self.field_out.topology.ghosts() + self.gh_out = self.field_out.topology.ghosts() gh_in = self.field_in.topology.ghosts() #print gh_in, gh_out resol_in = self._mesh_in.resolution - 2 * gh_in - resol_out = self._mesh_out.resolution - 2 * gh_out + resol_out = self._mesh_out.resolution - 2 * self.gh_out pts_per_cell = resol_in / resol_out assert np.all(pts_per_cell >= 1), "This operator is fine grid to coarse one" self.scale_factor = np.prod(self._mesh_in.space_step) / \ @@ -82,22 +83,36 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): self.resol_out[:self.dim] = self._reorderVect(shape_out) self._append_size_constants(resol_in, prefix='NB_IN') self._append_size_constants(resol_out, prefix='NB_OUT') - self._append_size_constants(topo_out.ghosts(), prefix='GHOSTS_OUT') + self._append_size_constants(self.gh_out, prefix='GHOSTS_OUT') self._append_size_constants(pts_per_cell, prefix='PTS_PER_CELL') + # Ghosts temp arrays + self.gh_x = npw.zeros((4 * self.gh_out[0], shape_out[1], shape_out[2])) + self.gh_y = npw.zeros((shape_out[0], 4 * self.gh_out[1], shape_out[2])) + self.gh_z = npw.zeros((shape_out[0], shape_out[1], 4 * self.gh_out[2])) + print self.gh_x.shape, self.gh_y.shape, self.gh_z.shape + self._pitches_host_x = (int(self.gh_x[:, 0, 0].nbytes), + int(self.gh_x[:, :, 0].nbytes)) + self._pitches_host_y = (int(self.gh_y[:, 0, 0].nbytes), + int(self.gh_y[:, :, 0].nbytes)) + self._pitches_host_z = (int(self.gh_z[:, 0, 0].nbytes), + int(self.gh_z[:, :, 0].nbytes)) + self._pitches_buff = (int(self.field_out.data[0][:, 0, 0].nbytes), + int(self.field_out.data[0][:, :, 0].nbytes)) + src, vec, f_space = \ self._kernel_cfg['fine_to_coarse_filter'] build_options = self._size_constants self._rmsh = self.method[Remesh]() + gwi, lwi = f_space(self.field_out.data[0].shape - + 2 * topo_out.ghosts(), pts_per_cell) build_options += " -D L_STENCIL=" + str(len(self._rmsh.weights)) build_options += " -D SHIFT_STENCIL=" + str(self._rmsh.shift) - build_options += " -D WG=32" + build_options += " -D WG=" + str(lwi[0]) build_options += " -D FORMULA=" + self.method[Remesh].__name__.upper() print build_options prg = self.cl_env.build_src(src, build_options, vec) - gwi, lwi = f_space(self.field_out.data[0].shape - - 2 * topo_out.ghosts(), pts_per_cell) - self.num_diffusion = KernelLauncher( + self.fine_to_coarse = KernelLauncher( prg.coarse_to_fine_filter, self.cl_env.queue, gwi, lwi) print gwi, lwi @@ -105,11 +120,162 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): evts = [] for iy in xrange(len(self._rmsh.weights)): for iz in xrange(len(self._rmsh.weights)): - evts.append(self.num_diffusion(self.field_in.gpu_data[0], - self.field_out.gpu_data[0], - self.scale_factor, - self._mesh_size_in, - self._mesh_size_out, - self._domain_origin, - np.int32(iy), np.int32(iz), - wait_for=evts)) + evts.append(self.fine_to_coarse(self.field_in.gpu_data[0], + self.field_out.gpu_data[0], + self.scale_factor, + self._mesh_size_in, + self._mesh_size_out, + self._domain_origin, + np.int32(iy), np.int32(iz), + wait_for=evts)) + self.field_out.events.append(evts[-1]) + # Get ghosts values and in-domain layer + # X-direction + get_gh_xl = cl.enqueue_copy( + self.cl_env.queue, + self.gh_x, self.field_out.gpu_data[0], + host_origin=(0, 0, 0), + buffer_origin=(0, 0, 0), + host_pitches=self._pitches_host_x, + buffer_pitches=self._pitches_buff, + region=(self.gh_x[:2, 0, 0].nbytes, + self.gh_x.shape[1], + self.gh_x.shape[2]), + wait_for=evts) + get_gh_xr = cl.enqueue_copy( + self.cl_env.queue, + self.gh_x, self.field_out.gpu_data[0], + host_origin=(self.gh_x[:2, 0, 0].nbytes, 0, 0), + buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes - + self.gh_x[:2, 0, 0].nbytes, 0, 0), + host_pitches=self._pitches_host_x, + buffer_pitches=self._pitches_buff, + region=(self.gh_x[:2, 0, 0].nbytes, + self.gh_x.shape[1], + self.gh_x.shape[2]), + wait_for=evts) + get_gh_xl.wait() + get_gh_xr.wait() + # Add ghosts contributions in domain layer + self.gh_x[2, :, :] += self.gh_x[0, :, :] + self.gh_x[1, :, :] += self.gh_x[-1, :, :] + set_gh_xl = cl.enqueue_copy( + self.cl_env.queue, + self.field_out.gpu_data[0], self.gh_x, + host_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0), + buffer_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0), + host_pitches=self._pitches_host_x, + buffer_pitches=self._pitches_buff, + region=(self.gh_x[:1, 0, 0].nbytes, + self.gh_x.shape[1], + self.gh_x.shape[2]), + wait_for=evts) + set_gh_xr = cl.enqueue_copy( + self.cl_env.queue, + self.field_out.gpu_data[0], self.gh_x, + host_origin=(self.gh_x[:3, 0, 0].nbytes, 0, 0), + buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes - + self.gh_x[:1, 0, 0].nbytes, 0, 0), + host_pitches=self._pitches_host_x, + buffer_pitches=self._pitches_buff, + region=(self.gh_x[:1, 0, 0].nbytes, + self.gh_x.shape[1], + self.gh_x.shape[2]), + wait_for=evts) + set_gh_xl.wait() + set_gh_xr.wait() + + # Y-direction + get_gh_yl = cl.enqueue_copy( + self.cl_env.queue, + self.gh_y, self.field_out.gpu_data[0], + host_origin=(0, 0, 0), + buffer_origin=(0, 0, 0), + host_pitches=self._pitches_host_y, + buffer_pitches=self._pitches_buff, + region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]), + wait_for=evts) + get_gh_yr = cl.enqueue_copy( + self.cl_env.queue, + self.gh_y, self.field_out.gpu_data[0], + host_origin=(0, 2, 0), + buffer_origin=(0, self.field_out.data[0].shape[1] - 2, 0), + host_pitches=self._pitches_host_y, + buffer_pitches=self._pitches_buff, + region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]), + wait_for=evts) + get_gh_yl.wait() + get_gh_yr.wait() + # Add ghosts contributions in domain layer + self.gh_y[:, 2, :] += self.gh_y[:, 0, :] + self.gh_y[:, 1, :] += self.gh_y[:, -1, :] + set_gh_yl = cl.enqueue_copy( + self.cl_env.queue, + self.field_out.gpu_data[0], self.gh_y, + host_origin=(0, 1, 0), + buffer_origin=(0, 1, 0), + host_pitches=self._pitches_host_y, + buffer_pitches=self._pitches_buff, + region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]), + wait_for=evts) + set_gh_yr = cl.enqueue_copy( + self.cl_env.queue, + self.field_out.gpu_data[0], self.gh_y, + host_origin=(0, 3, 0), + buffer_origin=(0, self.field_out.data[0].shape[1] - 1, 0), + host_pitches=self._pitches_host_y, + buffer_pitches=self._pitches_buff, + region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]), + wait_for=evts) + set_gh_yl.wait() + set_gh_yr.wait() + + # Z-direction + get_gh_zl = cl.enqueue_copy( + self.cl_env.queue, + self.gh_z, self.field_out.gpu_data[0], + host_origin=(0, 0, 0), + buffer_origin=(0, 0, 0), + host_pitches=self._pitches_host_z, + buffer_pitches=self._pitches_buff, + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2), + wait_for=evts) + get_gh_zr = cl.enqueue_copy( + self.cl_env.queue, + self.gh_z, self.field_out.gpu_data[0], + host_origin=(0, 0, 2), + buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 2), + host_pitches=self._pitches_host_z, + buffer_pitches=self._pitches_buff, + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2), + wait_for=evts) + get_gh_zl.wait() + get_gh_zr.wait() + # Add ghosts contributions in domain layer + self.gh_z[:, :, 2] += self.gh_z[:, :, 0] + self.gh_z[:, :, 1] += self.gh_z[:, :, -1] + set_gh_zl = cl.enqueue_copy( + self.cl_env.queue, + self.field_out.gpu_data[0], self.gh_z, + host_origin=(0, 0, 1), + buffer_origin=(0, 0, 1), + host_pitches=self._pitches_host_z, + buffer_pitches=self._pitches_buff, + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1), + wait_for=evts) + set_gh_zr = cl.enqueue_copy( + self.cl_env.queue, + self.field_out.gpu_data[0], self.gh_z, + host_origin=(0, 0, 3), + buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 1), + host_pitches=self._pitches_host_z, + buffer_pitches=self._pitches_buff, + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1), + wait_for=evts) + set_gh_zl.wait() + set_gh_zr.wait() + + + def get_profiling_info(self): + for p in self.fine_to_coarse.profile: + self.profiler += p diff --git a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py index 9ab6c48d2a94d396347d4ee69964e62eeeb435ec..004bc89d01fbcdda9f9c9502662045b670de4fde 100644 --- a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py +++ b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py @@ -1,3 +1,5 @@ +import hysop.gpu +hysop.gpu.CL_PROFILE = True from hysop.problem.simulation import Simulation from hysop.tools.parameters import Discretization from hysop.domain.box import Box @@ -15,8 +17,8 @@ simu = Simulation(tinit=0., tend=0.1, nbIter=1) def func(res, x, y, z, t=0): - res[0][...] = np.sin(2. * np.pi * x) * \ - np.cos(2. * np.pi * y) * np.sin(4. * np.pi * z) + res[0][...] = np.cos(2. * np.pi * x) * \ + np.cos(2. * np.pi * y) * np.cos(4. * np.pi * z) return res @@ -62,9 +64,21 @@ def test_filter(): # print np.where(f_out.data[0][topo_coarse.mesh.iCompute]>0.0001) # print valid[0][topo_coarse.mesh.iCompute][32+4:32+12,2:6,2:6] # print f_out.data[0][topo_coarse.mesh.iCompute][32+4:32+12,2:6,2:6] - e = np.max(np.abs(valid[0][topo_coarse.mesh.iCompute][1:-1,1:-1,1:-1] - - f_out[0][topo_coarse.mesh.iCompute][1:-1,1:-1,1:-1])) + # err = valid[0] - f_out[0] + # print "MAX X", np.max(f_out[0][0,:,:]), np.max(f_out[0][-1,:,:]) + #print "MAX Y", np.max(f_out[0][:,0,:]), np.max(f_out[0][:,-1,:]) + #print "MAX vY", np.max(valid[0][:,1,:]), np.max(valid[0][:,-2,:]) + # print "MAX Z", np.max(f_out[0][:,:,0]), np.max(f_out[0][:,:,-1]) + # print np.where(err[:2,:,:] > 0.0001) + # print err[:3,-4:,-4:] + e = np.max(np.abs(valid[0][topo_coarse.mesh.iCompute] - + f_out[0][topo_coarse.mesh.iCompute])) print e + ## PB DE CUMLUL DES GHPSTS DANS LES COINS DE DIRECTION EN DIRECTIO? + #print np.where(np.abs(valid[0][topo_coarse.mesh.iCompute][:,0,1:-1] - \ + # f_out[0][topo_coarse.mesh.iCompute][:,0,1:-1]) > 0.001) + op.profiler.summarize() + print op.profiler if __name__ == '__main__': diff --git a/HySoP/hysop/operator/discrete/multiresolution_filter.py b/HySoP/hysop/operator/discrete/multiresolution_filter.py index 3565e7a38582aef907306cb66333bc8e707d4c1b..b463b0a1c63988b50cefeba15d1ca935d9cec855 100644 --- a/HySoP/hysop/operator/discrete/multiresolution_filter.py +++ b/HySoP/hysop/operator/discrete/multiresolution_filter.py @@ -107,82 +107,62 @@ class FilterFineToCoarse(DiscreteOperator): self._bc_from_ghosts.append(( slice(self._mesh_out.iCompute[0].stop, self._mesh_out.iCompute[0].stop + gh_out[0], None), - self._mesh_out.iCompute[1], - self._mesh_out.iCompute[2] - )) + slice(None), slice(None))) self._bc_to_compute.append(( slice(self._mesh_out.iCompute[0].start, self._mesh_out.iCompute[0].start + gh_out[0], None), - self._mesh_out.iCompute[1], - self._mesh_out.iCompute[2] - )) + slice(None), slice(None))) # Left X-dir ghosts self._bc_from_ghosts.append(( slice(self._mesh_out.iCompute[0].start - gh_out[0], self._mesh_out.iCompute[0].start, None), - self._mesh_out.iCompute[1], - self._mesh_out.iCompute[2] - )) + slice(None), slice(None))) self._bc_to_compute.append(( slice(self._mesh_out.iCompute[0].stop - gh_out[0], self._mesh_out.iCompute[0].stop, None), - self._mesh_out.iCompute[1], - self._mesh_out.iCompute[2] - )) + slice(None), slice(None))) if gh_out[1] > 0: # Right Y-dir ghosts self._bc_from_ghosts.append(( - self._mesh_out.iCompute[0], + slice(None), slice(self._mesh_out.iCompute[1].stop, self._mesh_out.iCompute[1].stop + gh_out[1], None), - self._mesh_out.iCompute[2] - )) + slice(None))) self._bc_to_compute.append(( - self._mesh_out.iCompute[0], + slice(None), slice(self._mesh_out.iCompute[1].start, self._mesh_out.iCompute[1].start + gh_out[1], None), - self._mesh_out.iCompute[2] - )) + slice(None))) # Left Y-dir ghosts self._bc_from_ghosts.append(( - self._mesh_out.iCompute[0], + slice(None), slice(self._mesh_out.iCompute[1].start - gh_out[1], self._mesh_out.iCompute[1].start, None), - self._mesh_out.iCompute[2] - )) + slice(None))) self._bc_to_compute.append(( - self._mesh_out.iCompute[0], + slice(None), slice(self._mesh_out.iCompute[1].stop - gh_out[1], self._mesh_out.iCompute[1].stop, None), - self._mesh_out.iCompute[2] - )) + slice(None))) if gh_out[2] > 0: # Right Z-dir ghosts self._bc_from_ghosts.append(( - self._mesh_out.iCompute[0], - self._mesh_out.iCompute[1], + slice(None), slice(None), slice(self._mesh_out.iCompute[2].stop, - self._mesh_out.iCompute[2].stop + gh_out[2], None) - )) + self._mesh_out.iCompute[2].stop + gh_out[2], None))) self._bc_to_compute.append(( - self._mesh_out.iCompute[0], - self._mesh_out.iCompute[1], + slice(None), slice(None), slice(self._mesh_out.iCompute[2].start, - self._mesh_out.iCompute[2].start + gh_out[2], None) - )) + self._mesh_out.iCompute[2].start + gh_out[2], None))) # Left Z-dir ghosts self._bc_from_ghosts.append(( - self._mesh_out.iCompute[0], - self._mesh_out.iCompute[1], + slice(None), slice(None), slice(self._mesh_out.iCompute[2].start - gh_out[2], - self._mesh_out.iCompute[2].start, None) - )) + self._mesh_out.iCompute[2].start, None))) self._bc_to_compute.append(( - self._mesh_out.iCompute[0], - self._mesh_out.iCompute[1], + slice(None), slice(None), slice(self._mesh_out.iCompute[2].stop - gh_out[2], - self._mesh_out.iCompute[2].stop, None) - )) + self._mesh_out.iCompute[2].stop, None))) @debug @profile