Cleaning up. A GPU version with the L21 scheme is still needed.

9d094637 · Jean-Matthieu Etancelin · c32b73d5 · 9d094637 · 9d094637 · 9d094637
Commit 9d094637 authored 10 years ago by Jean-Matthieu Etancelin
--- a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl
@@ -27,8 +27,6 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in,
  __local float line[WG*PTS_PER_CELL_X];
  __local float result[NB_OUT_X][L_STENCIL][L_STENCIL];
  __private float p_res[L_STENCIL][L_STENCIL][L_STENCIL];
-  ///// IDEE : calculer les points de la cellule ouput calculee par un w-i dans les registres et ensuite l'ajouter à la memoire locale.
-  ///// Utiliser un tableau prive de taille [L_STENCIL][L_STENCIL][L_STENCIL].

  // Fill local arrays
  // Output data

--- a/HySoP/hysop/gpu/multiresolution_filter.py
+++ b/HySoP/hysop/gpu/multiresolution_filter.py
@@ -131,6 +131,7 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
                self.field_out.events.append(evts[-1])
        # Get ghosts values and in-domain layer
        # X-direction
+        s_gh = self.gh_out[0]
        get_gh_xl = cl.enqueue_copy(
            self.cl_env.queue,
            self.gh_x, self.field_out.gpu_data[0],
@@ -138,47 +139,49 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
            buffer_origin=(0, 0, 0),
            host_pitches=self._pitches_host_x,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_x[:2, 0, 0].nbytes,
+            region=(self.gh_x[:2 * s_gh, 0, 0].nbytes,
                    self.gh_x.shape[1],
                    self.gh_x.shape[2]),
            wait_for=evts)
        get_gh_xr = cl.enqueue_copy(
            self.cl_env.queue,
            self.gh_x, self.field_out.gpu_data[0],
-            host_origin=(self.gh_x[:2, 0, 0].nbytes, 0, 0),
+            host_origin=(self.gh_x[:2 * s_gh, 0, 0].nbytes, 0, 0),
            buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes -
-                           self.gh_x[:2, 0, 0].nbytes, 0, 0),
+                           self.gh_x[:2 * s_gh, 0, 0].nbytes, 0, 0),
            host_pitches=self._pitches_host_x,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_x[:2, 0, 0].nbytes,
+            region=(self.gh_x[:2 * s_gh, 0, 0].nbytes,
                    self.gh_x.shape[1],
                    self.gh_x.shape[2]),
            wait_for=evts)
        get_gh_xl.wait()
        get_gh_xr.wait()
        # Add ghosts contributions in domain layer
-        self.gh_x[2, :, :] += self.gh_x[0, :, :]
-        self.gh_x[1, :, :] += self.gh_x[-1, :, :]
+        self.gh_x[2 * s_gh:3 * s_gh, :, :] += \
+            self.gh_x[0 * s_gh:1 * s_gh, :, :]
+        self.gh_x[1 * s_gh:2 * s_gh, :, :] += \
+            self.gh_x[3 * s_gh:4 * s_gh, :, :]
        set_gh_xl = cl.enqueue_copy(
            self.cl_env.queue,
            self.field_out.gpu_data[0], self.gh_x,
-            host_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0),
-            buffer_origin=(self.gh_x[:1, 0, 0].nbytes, 0, 0),
+            host_origin=(self.gh_x[:1 * s_gh, 0, 0].nbytes, 0, 0),
+            buffer_origin=(self.gh_x[:1 * s_gh, 0, 0].nbytes, 0, 0),
            host_pitches=self._pitches_host_x,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_x[:1, 0, 0].nbytes,
+            region=(self.gh_x[:1 * s_gh, 0, 0].nbytes,
                    self.gh_x.shape[1],
                    self.gh_x.shape[2]),
            wait_for=evts)
        set_gh_xr = cl.enqueue_copy(
            self.cl_env.queue,
-             self.field_out.gpu_data[0], self.gh_x,
-            host_origin=(self.gh_x[:3, 0, 0].nbytes, 0, 0),
+            self.field_out.gpu_data[0], self.gh_x,
+            host_origin=(self.gh_x[:3 * s_gh, 0, 0].nbytes, 0, 0),
            buffer_origin=(self.field_out.data[0][:, 0, 0].nbytes -
-                           self.gh_x[:1, 0, 0].nbytes, 0, 0),
+                           self.gh_x[:1 * s_gh, 0, 0].nbytes, 0, 0),
            host_pitches=self._pitches_host_x,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_x[:1, 0, 0].nbytes,
+            region=(self.gh_x[:1 * s_gh, 0, 0].nbytes,
                    self.gh_x.shape[1],
                    self.gh_x.shape[2]),
            wait_for=evts)
@@ -186,6 +189,7 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
        set_gh_xr.wait()

        # Y-direction
+        s_gh = self.gh_out[1]
        get_gh_yl = cl.enqueue_copy(
            self.cl_env.queue,
            self.gh_y, self.field_out.gpu_data[0],
@@ -193,44 +197,47 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
            buffer_origin=(0, 0, 0),
            host_pitches=self._pitches_host_y,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]),
+            region=(self.gh_y[:, 0, 0].nbytes, 2 * s_gh, self.gh_y.shape[2]),
            wait_for=evts)
        get_gh_yr = cl.enqueue_copy(
            self.cl_env.queue,
            self.gh_y, self.field_out.gpu_data[0],
-            host_origin=(0, 2, 0),
-            buffer_origin=(0, self.field_out.data[0].shape[1] - 2, 0),
+            host_origin=(0, 2 * s_gh, 0),
+            buffer_origin=(0, self.field_out.data[0].shape[1] - 2 * s_gh, 0),
            host_pitches=self._pitches_host_y,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_y[:, 0, 0].nbytes, 2, self.gh_y.shape[2]),
+            region=(self.gh_y[:, 0, 0].nbytes, 2 * s_gh, self.gh_y.shape[2]),
            wait_for=evts)
        get_gh_yl.wait()
        get_gh_yr.wait()
        # Add ghosts contributions in domain layer
-        self.gh_y[:, 2, :] += self.gh_y[:, 0, :]
-        self.gh_y[:, 1, :] += self.gh_y[:, -1, :]
+        self.gh_y[:, 2 * s_gh:3 * s_gh, :] += \
+            self.gh_y[:, 0 * s_gh:1 * s_gh, :]
+        self.gh_y[:, 1 * s_gh:2 * s_gh, :] += \
+            self.gh_y[:, 3 * s_gh:4 * s_gh, :]
        set_gh_yl = cl.enqueue_copy(
            self.cl_env.queue,
            self.field_out.gpu_data[0], self.gh_y,
-            host_origin=(0, 1, 0),
-            buffer_origin=(0, 1, 0),
+            host_origin=(0, 1 * s_gh, 0),
+            buffer_origin=(0, 1 * s_gh, 0),
            host_pitches=self._pitches_host_y,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]),
+            region=(self.gh_y[:, 0, 0].nbytes, 1 * s_gh, self.gh_y.shape[2]),
            wait_for=evts)
        set_gh_yr = cl.enqueue_copy(
            self.cl_env.queue,
            self.field_out.gpu_data[0], self.gh_y,
-            host_origin=(0, 3, 0),
-            buffer_origin=(0, self.field_out.data[0].shape[1] - 1, 0),
+            host_origin=(0, 3 * s_gh, 0),
+            buffer_origin=(0, self.field_out.data[0].shape[1] - 1 * s_gh, 0),
            host_pitches=self._pitches_host_y,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_y[:, 0, 0].nbytes, 1, self.gh_y.shape[2]),
+            region=(self.gh_y[:, 0, 0].nbytes, 1 * s_gh, self.gh_y.shape[2]),
            wait_for=evts)
        set_gh_yl.wait()
        set_gh_yr.wait()

        # Z-direction
+        s_gh = self.gh_out[2]
        get_gh_zl = cl.enqueue_copy(
            self.cl_env.queue,
            self.gh_z, self.field_out.gpu_data[0],
@@ -238,44 +245,45 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
            buffer_origin=(0, 0, 0),
            host_pitches=self._pitches_host_z,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2),
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2 * s_gh),
            wait_for=evts)
        get_gh_zr = cl.enqueue_copy(
            self.cl_env.queue,
            self.gh_z, self.field_out.gpu_data[0],
-            host_origin=(0, 0, 2),
-            buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 2),
+            host_origin=(0, 0, 2 * s_gh),
+            buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 2 * s_gh),
            host_pitches=self._pitches_host_z,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2),
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 2 * s_gh),
            wait_for=evts)
        get_gh_zl.wait()
        get_gh_zr.wait()
        # Add ghosts contributions in domain layer
-        self.gh_z[:, :, 2] += self.gh_z[:, :, 0]
-        self.gh_z[:, :, 1] += self.gh_z[:, :, -1]
+        self.gh_z[:, :, 2 * s_gh:3 * s_gh] += \
+            self.gh_z[:, :, 0 * s_gh:1 * s_gh]
+        self.gh_z[:, :, 1 * s_gh:2 * s_gh] += \
+            self.gh_z[:, :, 3 * s_gh:4 * s_gh]
        set_gh_zl = cl.enqueue_copy(
            self.cl_env.queue,
            self.field_out.gpu_data[0], self.gh_z,
-            host_origin=(0, 0, 1),
-            buffer_origin=(0, 0, 1),
+            host_origin=(0, 0, 1 * s_gh),
+            buffer_origin=(0, 0, 1 * s_gh),
            host_pitches=self._pitches_host_z,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1),
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1 * s_gh),
            wait_for=evts)
        set_gh_zr = cl.enqueue_copy(
            self.cl_env.queue,
            self.field_out.gpu_data[0], self.gh_z,
-            host_origin=(0, 0, 3),
-            buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 1),
+            host_origin=(0, 0, 3 * s_gh),
+            buffer_origin=(0, 0, self.field_out.data[0].shape[2] - 1 * s_gh),
            host_pitches=self._pitches_host_z,
            buffer_pitches=self._pitches_buff,
-            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1),
+            region=(self.gh_z[:, 0, 0].nbytes, self.gh_z.shape[1], 1 * s_gh),
            wait_for=evts)
        set_gh_zl.wait()
        set_gh_zr.wait()

-
    def get_profiling_info(self):
        for p in self.fine_to_coarse.profile:
            self.profiler += p
--- a/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py
+++ b/HySoP/hysop/gpu/tests/test_multiresolutionfilter.py
@@ -27,7 +27,7 @@ def test_filter():
    f = Field(box, formula=func)
    f_py = Field(box, formula=func)
    d_fine = Discretization([513, 513, 513])
-    d_coarse = Discretization([257, 257, 257], ghosts=[1, 1, 1])
+    d_coarse = Discretization([257, 257, 257], ghosts=[2, 2, 2])
    op = MultiresolutionFilter(d_in=d_fine, d_out=d_coarse,
                               variables={f: d_coarse},
                               method={Remesh: Rmsh_Linear,