diff --git a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl index 40328dcff62af3414576f71e3958c3022643f5ee..064a7502b22bd1d990ef78d552b29cd808f26a92 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl @@ -27,8 +27,6 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in, __local float line[WG*PTS_PER_CELL_X]; __local float result[NB_OUT_X][L_STENCIL][L_STENCIL]; __private float p_res[L_STENCIL][L_STENCIL][L_STENCIL]; - ///// IDEE : calculer les points de la cellule ouput calculee par un w-i dans les registres et ensuite l'ajouter à la memoire locale. - ///// Utiliser un tableau prive de taille [L_STENCIL][L_STENCIL][L_STENCIL]. // Fill local arrays // Output data diff --git a/HySoP/hysop/gpu/multiresolution_filter.py b/HySoP/hysop/gpu/multiresolution_filter.py index b0fd3980230e086ce379d8b65a78a19b13b2b3fd..1d6d6f018d7db76041b92c5d348ddc8fc0c6fc70 100644 --- a/HySoP/hysop/gpu/multiresolution_filter.py +++ b/HySoP/hysop/gpu/multiresolution_filter.py @@ -131,6 +131,7 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): self.field_out.events.append(evts[-1]) # Get ghosts values and in-domain layer # X-direction + s_gh = self.gh_out[0] get_gh_xl = cl.enqueue_copy( self.cl_env.queue, self.gh_x, self.field_out.gpu_data[0], @@ -138,47 +139,49 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): buffer_origin=(0, 0, 0), host_pitches=self._pitches_host_x, buffer_pitches=self._pitches_buff, - region=(self.gh_x[:2, 0, 0].nbytes, + region=(self.gh_x[:2 * s_gh, 0, 0].nbytes, self.gh_x.shape[1], self.gh_x.shape[2]), wait_for=evts) get_gh_xr = cl.enqueue_copy( self.cl_env.queue, self.gh_x, self.field_out.gpu_data[0], - host_origin=(self.gh_x[:2, 0, 0].nbytes, 0, 0), + host_origin=(self.gh_x[:2 * s_gh, 0, 0].nbytes, 0, 0), buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes - - self.gh_x[:2, 0, 0].nbytes, 0, 0), + self.gh_x[:2 * s_gh, 0, 0].nbytes, 0, 0), host_pitches=self._pitches_host_x, buffer_pitches=self._pitches_buff, - region=(self.gh_x[:2, 0, 0].nbytes, + region=(self.gh_x[:2 * s_gh, 0, 0].nbytes, self.gh_x.shape[1], self.gh_x.shape[2]), wait_for=evts) get_gh_xl.wait() get_gh_xr.wait() # Add ghosts contributions in domain layer - self.gh_x[2, :, :] += self.gh_x[0, :, :] - self.gh_x[1, :, :] += self.gh_x[-1, :, :] + self.gh_x[2 * s_gh:3 * s_gh, :, :] += \ + self.gh_x[0 * s_gh:1 * s_gh, :, :] + self.gh_x[1 * s_gh:2 * s_gh, :, :] += \ + self.gh_x[3 * s_gh:4 * s_gh, :, :] set_gh_xl = cl.enqueue_copy( self.cl_env.queue, self.field_out.gpu_data[0], self.gh_x, - host_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0), - buffer_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0), + host_origin=(self.gh_x[:1 * s_gh, 0, 0].nbytes, 0, 0), + buffer_origin=(self.gh_x[:1 * s_gh, 0, 0].nbytes, 0, 0), host_pitches=self._pitches_host_x, buffer_pitches=self._pitches_buff, - region=(self.gh_x[:1, 0, 0].nbytes, + region=(self.gh_x[:1 * s_gh, 0, 0].nbytes, self.gh_x.shape[1], self.gh_x.shape[2]), wait_for=evts) set_gh_xr = cl.enqueue_copy( self.cl_env.queue, - self.field_out.gpu_data[0], self.gh_x, - host_origin=(self.gh_x[:3, 0, 0].nbytes, 0, 0), + self.field_out.gpu_data[0], self.gh_x, + host_origin=(self.gh_x[:3 * s_gh, 0, 0].nbytes, 0, 0), buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes - - self.gh_x[:1, 0, 0].nbytes, 0, 0), + self.gh_x[:1 * s_gh, 0, 0].nbytes, 0, 0), host_pitches=self._pitches_host_x, buffer_pitches=self._pitches_buff, - region=(self.gh_x[:1, 0, 0].nbytes, + region=(self.gh_x[:1 * s_gh, 0, 0].nbytes, self.gh_x.shape[1], self.gh_x.shape[2]), wait_for=evts) @@ -186,6 +189,7 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): set_gh_xr.wait() # Y-direction + s_gh = self.gh_out[1] get_gh_yl = cl.enqueue_copy( self.cl_env.queue, self.gh_y, self.field_out.gpu_data[0], @@ -193,44 +197,47 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): buffer_origin=(0, 0, 0), host_pitches=self._pitches_host_y, buffer_pitches=self._pitches_buff, - region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]), + region=(self.gh_y[:, 0, 0].nbytes, 2 * s_gh, self.gh_y.shape[2]), wait_for=evts) get_gh_yr = cl.enqueue_copy( self.cl_env.queue, self.gh_y, self.field_out.gpu_data[0], - host_origin=(0, 2, 0), - buffer_origin=(0, self.field_out.data[0].shape[1] - 2, 0), + host_origin=(0, 2 * s_gh, 0), + buffer_origin=(0, self.field_out.data[0].shape[1] - 2 * s_gh, 0), host_pitches=self._pitches_host_y, buffer_pitches=self._pitches_buff, - region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]), + region=(self.gh_y[:, 0, 0].nbytes, 2 * s_gh, self.gh_y.shape[2]), wait_for=evts) get_gh_yl.wait() get_gh_yr.wait() # Add ghosts contributions in domain layer - self.gh_y[:, 2, :] += self.gh_y[:, 0, :] - self.gh_y[:, 1, :] += self.gh_y[:, -1, :] + self.gh_y[:, 2 * s_gh:3 * s_gh, :] += \ + self.gh_y[:, 0 * s_gh:1 * s_gh, :] + self.gh_y[:, 1 * s_gh:2 * s_gh, :] += \ + self.gh_y[:, 3 * s_gh:4 * s_gh, :] set_gh_yl = cl.enqueue_copy( self.cl_env.queue, self.field_out.gpu_data[0], self.gh_y, - host_origin=(0, 1, 0), - buffer_origin=(0, 1, 0), + host_origin=(0, 1 * s_gh, 0), + buffer_origin=(0, 1 * s_gh, 0), host_pitches=self._pitches_host_y, buffer_pitches=self._pitches_buff, - region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]), + region=(self.gh_y[:, 0, 0].nbytes, 1 * s_gh, self.gh_y.shape[2]), wait_for=evts) set_gh_yr = cl.enqueue_copy( self.cl_env.queue, self.field_out.gpu_data[0], self.gh_y, - host_origin=(0, 3, 0), - buffer_origin=(0, self.field_out.data[0].shape[1] - 1, 0), + host_origin=(0, 3 * s_gh, 0), + buffer_origin=(0, self.field_out.data[0].shape[1] - 1 * s_gh, 0), host_pitches=self._pitches_host_y, buffer_pitches=self._pitches_buff, - region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]), + region=(self.gh_y[:, 0, 0].nbytes, 1 * s_gh, self.gh_y.shape[2]), wait_for=evts) set_gh_yl.wait() set_gh_yr.wait() # Z-direction + s_gh = self.gh_out[2] get_gh_zl = cl.enqueue_copy( self.cl_env.queue, self.gh_z, self.field_out.gpu_data[0], @@ -238,44 +245,45 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator): buffer_origin=(0, 0, 0), host_pitches=self._pitches_host_z, buffer_pitches=self._pitches_buff, - region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2), + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2 * s_gh), wait_for=evts) get_gh_zr = cl.enqueue_copy( self.cl_env.queue, self.gh_z, self.field_out.gpu_data[0], - host_origin=(0, 0, 2), - buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 2), + host_origin=(0, 0, 2 * s_gh), + buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 2 * s_gh), host_pitches=self._pitches_host_z, buffer_pitches=self._pitches_buff, - region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2), + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2 * s_gh), wait_for=evts) get_gh_zl.wait() get_gh_zr.wait() # Add ghosts contributions in domain layer - self.gh_z[:, :, 2] += self.gh_z[:, :, 0] - self.gh_z[:, :, 1] += self.gh_z[:, :, -1] + self.gh_z[:, :, 2 * s_gh:3 * s_gh] += \ + self.gh_z[:, :, 0 * s_gh:1 * s_gh] + self.gh_z[:, :, 1 * s_gh:2 * s_gh] += \ + self.gh_z[:, :, 3 * s_gh:4 * s_gh] set_gh_zl = cl.enqueue_copy( self.cl_env.queue, self.field_out.gpu_data[0], self.gh_z, - host_origin=(0, 0, 1), - buffer_origin=(0, 0, 1), + host_origin=(0, 0, 1 * s_gh), + buffer_origin=(0, 0, 1 * s_gh), host_pitches=self._pitches_host_z, buffer_pitches=self._pitches_buff, - region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1), + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1 * s_gh), wait_for=evts) set_gh_zr = cl.enqueue_copy( self.cl_env.queue, self.field_out.gpu_data[0], self.gh_z, - host_origin=(0, 0, 3), - buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 1), + host_origin=(0, 0, 3 * s_gh), + buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 1 * s_gh), host_pitches=self._pitches_host_z, buffer_pitches=self._pitches_buff, - region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1), + region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1 * s_gh), wait_for=evts) set_gh_zl.wait() set_gh_zr.wait() - def get_profiling_info(self): for p in self.fine_to_coarse.profile: self.profiler += p diff --git a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py index 004bc89d01fbcdda9f9c9502662045b670de4fde..7647c2a2868ceecaf7ebbe1e3734041e180b47aa 100644 --- a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py +++ b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py @@ -27,7 +27,7 @@ def test_filter(): f = Field(box, formula=func) f_py = Field(box, formula=func) d_fine = Discretization([513, 513, 513]) - d_coarse = Discretization([257, 257, 257], ghosts=[1, 1, 1]) + d_coarse = Discretization([257, 257, 257], ghosts=[2, 2, 2]) op = MultiresolutionFilter(d_in=d_fine, d_out=d_coarse, variables={f: d_coarse}, method={Remesh: Rmsh_Linear,