Commit a8f7d453 authored by Jean-Matthieu Etancelin

(Multi-)GPU multiresolution filter OK with L2_1. Bug in Linear. Needs merge with the CPU version.

parent 38d0c2d2
-from hysop.constants import debug, np
+from hysop.constants import debug, np, HYSOP_MPI_REAL, ORDERMPI
 import hysop.tools.numpywrappers as npw
-from hysop.operator.discrete.discrete import DiscreteOperator
-#from hysop.operator.discrete.multiresolution_filter import FilterFineToCoarse
 from hysop.gpu.gpu_operator import GPUOperator
-from hysop.operator.discrete.discrete import get_extra_args_from_method
+from hysop.operator.discrete.discrete import DiscreteOperator, get_extra_args_from_method
 from hysop.gpu.gpu_discrete import GPUDiscreteField
 from hysop.gpu.gpu_kernel import KernelLauncher
 from hysop.methods_keys import Remesh
 from hysop.gpu import cl
 from hysop.methods import Rmsh_Linear, L2_1
@@ -36,7 +35,8 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         resol_in = self._mesh_in.resolution - 2 * gh_in
         resol_out = self._mesh_out.resolution - 2 * self.gh_out
         pts_per_cell = resol_in / resol_out
-        assert np.all(pts_per_cell >= 1), "This operator is fine grid to coarse one"
+        assert np.all(pts_per_cell >= 1), \
+            "This operator goes from a fine grid to a coarse one"
         self.scale_factor = np.prod(self._mesh_in.space_step) / \
             np.prod(self._mesh_out.space_step)
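For intuition, here is a minimal standalone sketch of the coarsening ratio and scale factor computed above, in plain numpy; the 256^3 fine / 64^3 coarse resolutions and the unit box are assumptions for illustration only.

import numpy as np

# Hypothetical example: fine grid 256^3, coarse grid 64^3, unit box.
resol_in = np.array([256, 256, 256])   # fine resolution without ghosts
resol_out = np.array([64, 64, 64])     # coarse resolution without ghosts
pts_per_cell = resol_in / resol_out    # 4 fine points per coarse cell
space_step_in = 1. / resol_in          # fine mesh spacing
space_step_out = 1. / resol_out        # coarse mesh spacing
# Ratio of fine to coarse cell volumes, used to rescale accumulated values:
scale_factor = np.prod(space_step_in) / np.prod(space_step_out)
assert np.all(pts_per_cell >= 1)
print pts_per_cell, scale_factor       # [4 4 4] 0.015625 (= 1/64)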
@@ -70,8 +70,8 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         self._comm = topo_in.comm
         self._comm_size = self._comm.Get_size()
         self._comm_rank = self._comm.Get_rank()
-        if self._comm_size > 1:
-            raise RuntimeError('MPI version is not yet implemented')
+        # if self._comm_size > 1:
+        #     raise RuntimeError('MPI version is not yet implemented')
         self._mesh_size_in = npw.ones(4, dtype=self.gpu_precision)
         self._mesh_size_in[:self.dim] = \
@@ -94,7 +94,24 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         self._append_size_constants(self.gh_out, prefix='GHOSTS_OUT')
         self._append_size_constants(pts_per_cell, prefix='PTS_PER_CELL')
         # multi-gpu ghosts buffers for communication
+        if self._comm_size == 1:
+            self._exchange_ghosts = self._exchange_ghosts_local
+        else:
+            self._exchange_ghosts = self._exchange_ghosts_mpi
+        self._gh_from_l = [None] * self.dim
+        self._gh_from_r = [None] * self.dim
+        self._gh_to_l = [None] * self.dim
+        self._gh_to_r = [None] * self.dim
+        # self._mpi_to_l = [None] * self.dim
+        # self._mpi_to_r = [None] * self.dim
+        for d in self._cutdir_list:
+            shape = list(self.field_out.data[0].shape)
+            shape[d] = self.gh_out[d]
+            self._gh_from_l[d] = npw.zeros(tuple(shape))
+            self._gh_from_r[d] = npw.zeros(tuple(shape))
+            self._gh_to_l[d] = npw.zeros(tuple(shape))
+            self._gh_to_r[d] = npw.zeros(tuple(shape))
         # # Ghosts temp arrays for the second version of ghosts exchange
         # self.gh_x = npw.zeros((4 * self.gh_out[0], shape_out[1], shape_out[2]))
@@ -146,27 +163,76 @@ class GPUFilterFineToCoarse(DiscreteOperator, GPUOperator):
         #  - rect-copy of only needed data
         # The first one runs much faster than the second because it maps
         # the device buffer into host pinned memory.
-        # The second version is kept in comments
+        # The second version is kept in comments (for the sequential case)
+        self._exchange_ghosts()
+
+    def _exchange_ghosts_local_d(self, d):
+        s_gh = self.gh_out[d]
+        sl = [slice(None) for _ in xrange(self.dim)]
+        sl_gh = [slice(None) for _ in xrange(self.dim)]
+        sl[d] = slice(1 * s_gh, 2 * s_gh)
+        sl_gh[d] = slice(-1 * s_gh, None)
+        self.field_out.data[0][tuple(sl)] += \
+            self.field_out.data[0][tuple(sl_gh)]
+        sl[d] = slice(-2 * s_gh, -1 * s_gh)
+        sl_gh[d] = slice(0, 1 * s_gh)
+        self.field_out.data[0][tuple(sl)] += \
+            self.field_out.data[0][tuple(sl_gh)]
+
     def _exchange_ghosts_local(self):
         self.field_out.toHost()
         self.field_out.wait()
-        s_gh = self.gh_out[0]
-        self.field_out.data[0][1 * s_gh:2 * s_gh, :, :] += \
-            self.field_out.data[0][-1 * s_gh:, :, :]
-        self.field_out.data[0][-2 * s_gh:-1 * s_gh, :, :] += \
-            self.field_out.data[0][:1 * s_gh, :, :]
-        s_gh = self.gh_out[1]
-        self.field_out.data[0][:, 1 * s_gh:2 * s_gh, :] += \
-            self.field_out.data[0][:, -1 * s_gh:, :]
-        self.field_out.data[0][:, -2 * s_gh:-1 * s_gh, :] += \
-            self.field_out.data[0][:, :1 * s_gh, :]
-        s_gh = self.gh_out[2]
-        self.field_out.data[0][:, :, 1 * s_gh:2 * s_gh] += \
-            self.field_out.data[0][:, :, -1 * s_gh:]
-        self.field_out.data[0][:, :, -2 * s_gh:-1 * s_gh] += \
-            self.field_out.data[0][:, :, :1 * s_gh]
+        for d in xrange(self.dim):
+            self._exchange_ghosts_local_d(d)
         self.field_out.toDevice()
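The loop above replaces the hardcoded X/Y/Z copies: along each direction the ghost layers are folded back onto the opposite in-domain band, i.e. a periodic accumulation. A minimal 1-D sketch in plain numpy, with the ghost width and line length assumed for illustration:

import numpy as np

g = 2                                 # ghost width, assumed
f = np.arange(12, dtype=np.float64)   # one grid line with g ghosts on each side
# Values accumulated in the right ghost layer wrap onto the first
# in-domain band [g, 2g) ...
f[1 * g:2 * g] += f[-1 * g:]
# ... and the left ghost layer wraps onto the last in-domain band.
f[-2 * g:-1 * g] += f[:1 * g]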
+    def _exchange_ghosts_mpi_d(self, d):
+        s_gh = self.gh_out[d]
+        sl_l = [slice(None) for _ in xrange(self.dim)]
+        sl_gh_l = [slice(None) for _ in xrange(self.dim)]
+        sl_r = [slice(None) for _ in xrange(self.dim)]
+        sl_gh_r = [slice(None) for _ in xrange(self.dim)]
+        sl_l[d] = slice(1 * s_gh, 2 * s_gh)
+        sl_gh_r[d] = slice(-1 * s_gh, None)
+        sl_r[d] = slice(-2 * s_gh, -1 * s_gh)
+        sl_gh_l[d] = slice(0, 1 * s_gh)
+        first_cut_dir = self.field_out.topology.cutdir.tolist().index(True)
+        self._gh_to_l[d][...] = self.field_out.data[0][tuple(sl_gh_l)]
+        self._gh_to_r[d][...] = self.field_out.data[0][tuple(sl_gh_r)]
+        R_rk = self.field_out.topology.neighbours[1, d - first_cut_dir]
+        L_rk = self.field_out.topology.neighbours[0, d - first_cut_dir]
+        recv_r = self._comm.Irecv(
+            [self._gh_from_r[d], self._gh_from_r[d].size,
+             HYSOP_MPI_REAL],
+            source=R_rk, tag=1234 + R_rk + 19 * d)
+        recv_l = self._comm.Irecv(
+            [self._gh_from_l[d], self._gh_from_l[d].size,
+             HYSOP_MPI_REAL],
+            source=L_rk, tag=4321 + L_rk + 17 * d)
+        send_l = self._comm.Issend(
+            [self._gh_to_l[d], self._gh_to_l[d].size, HYSOP_MPI_REAL],
+            dest=L_rk, tag=1234 + self._comm_rank + 19 * d)
+        send_r = self._comm.Issend(
+            [self._gh_to_r[d], self._gh_to_r[d].size, HYSOP_MPI_REAL],
+            dest=R_rk, tag=4321 + self._comm_rank + 17 * d)
+        send_r.wait()
+        recv_l.wait()
+        self.field_out.data[0][tuple(sl_l)] += self._gh_from_l[d]
+        send_l.wait()
+        recv_r.wait()
+        self.field_out.data[0][tuple(sl_r)] += self._gh_from_r[d]
+
+    def _exchange_ghosts_mpi(self):
+        self.field_out.toHost()
+        self.field_out.wait()
+        for d in xrange(self.dim):
+            if d in self._cutdir_list:
+                self._exchange_ghosts_mpi_d(d)
+            else:
+                self._exchange_ghosts_local_d(d)
+        self.field_out.toDevice()
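The MPI path posts both receives before the matching Issend calls, and the tag scheme pairs each send with the receive posted on the neighbouring rank: a send is tagged with the sender's rank, a receive with the expected source's rank. A self-contained mpi4py sketch of the same handshake; the periodic ring of ranks, the flat 4-element buffers, and the dropped per-direction tag offsets are all assumptions for illustration:

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
L_rk = (rank - 1) % size                # left neighbour on a periodic ring
R_rk = (rank + 1) % size                # right neighbour

gh_to_l = np.full(4, rank, dtype='d')   # ghost band to send left
gh_to_r = np.full(4, rank, dtype='d')   # ghost band to send right
gh_from_l = np.empty(4, dtype='d')
gh_from_r = np.empty(4, dtype='d')

# Receives are posted first; tags carry the rank of the sending side so
# that each Issend below matches exactly one Irecv on the neighbour.
recv_r = comm.Irecv([gh_from_r, MPI.DOUBLE], source=R_rk, tag=1234 + R_rk)
recv_l = comm.Irecv([gh_from_l, MPI.DOUBLE], source=L_rk, tag=4321 + L_rk)
send_l = comm.Issend([gh_to_l, MPI.DOUBLE], dest=L_rk, tag=1234 + rank)
send_r = comm.Issend([gh_to_r, MPI.DOUBLE], dest=R_rk, tag=4321 + rank)
MPI.Request.Waitall([recv_r, recv_l, send_l, send_r])
# gh_from_l / gh_from_r would now be added onto the in-domain bands.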
         # # Get ghosts values and in-domain layer
         # # X-direction
         # s_gh = self.gh_out[0]
@@ -57,13 +57,13 @@ def test_filter_linear():
     op_py.apply(simu)
     f_out.toHost()
     op.discrete_op.cl_env.queue.finish()
-    # e = np.abs(valid[0][topo_coarse.mesh.iCompute] -
-    #            f_out[0][topo_coarse.mesh.iCompute])
-    # print np.max(e), np.where(e>1e-6)[0].shape
-    print np.allclose(valid[0][topo_coarse.mesh.iCompute],
+    e = np.abs(valid[0][topo_coarse.mesh.iCompute] -
+               f_out[0][topo_coarse.mesh.iCompute])
+    print np.max(e), np.where(e>1e-6)[0].shape
+    assert np.allclose(valid[0][topo_coarse.mesh.iCompute],
                        f_out[0][topo_coarse.mesh.iCompute])
-    op.profiler.summarize()
-    print op.profiler
+    # op.profiler.summarize()
+    # print op.profiler
@@ -93,7 +93,7 @@ def test_filter_L2_1():
                  if not t is topo_coarse][0]
     f.initialize(topo=topo_fine)
     #f_py.initialize(topo=topo_fine)
-    #f_in = f.discreteFields[topo_fine]
+    f_in = f.discreteFields[topo_fine]
     f_out = f.discreteFields[topo_coarse]
     #valid = f_py.discreteFields[topo_coarse]
     valid = [npw.zeros(f_out[0].shape), ]
@@ -107,14 +107,13 @@ def test_filter_L2_1():
     op.discrete_op.cl_env.queue.finish()
-    # e = np.abs(valid[0][topo_coarse.mesh.iCompute] -
-    #            f_out[0][topo_coarse.mesh.iCompute])
-    # print np.max(e), np.where(e>1e-6)[0].shape
-    print np.allclose(valid[0][topo_coarse.mesh.iCompute],
+    #print np.max(e), np.where(e>1e-6)#[0].shape
+    assert np.allclose(valid[0][topo_coarse.mesh.iCompute],
                        f_out[0][topo_coarse.mesh.iCompute])
-    op.profiler.summarize()
-    print op.profiler
+    # op.profiler.summarize()
+    # print op.profiler
 if __name__ == '__main__':
-    test_filter_linear()
+    #test_filter_linear()
     test_filter_L2_1()
-from hysop.constants import debug, np
+from hysop.constants import debug, np, HYSOP_MPI_REAL, ORDERMPI
 from hysop.operator.discrete.discrete import DiscreteOperator
 from hysop.tools.profiler import profile
 import hysop.tools.numpywrappers as npw
@@ -17,24 +17,42 @@ class FilterFineToCoarse(DiscreteOperator):
         self.output = [self.field_out]
         self._mesh_in = self.field_in[0].topology.mesh
         self._mesh_out = self.field_out[0].topology.mesh
-        gh_out = self.field_out[0].topology.ghosts()
+        self.gh_out = self.field_out[0].topology.ghosts()
         gh_in = self.field_in[0].topology.ghosts()
-        #print gh_in, gh_out
+        #print gh_in, self.gh_out
         resol_in = self._mesh_in.resolution - 2 * gh_in
-        resol_out = self._mesh_out.resolution - 2 * gh_out
+        resol_out = self._mesh_out.resolution - 2 * self.gh_out
         pts_per_cell = resol_in / resol_out
         assert np.all(pts_per_cell >= 1), \
             "This operator goes from a fine grid to a coarse one"
         self.scale_factor = np.prod(self._mesh_in.space_step) / \
             np.prod(self._mesh_out.space_step)
         #print pts_per_cell, self.scale_factor
         #print self._mesh_in.coords
+        # multi-gpu ghosts buffers for communication
+        self._cutdir_list = np.where(self.field_in[0].topology.cutdir)[0].tolist()
+        self._comm = self.field_in[0].topology.comm
+        self._comm_size = self._comm.Get_size()
+        self._comm_rank = self._comm.Get_rank()
+        if self._comm_size == 1:
+            self._exchange_ghosts = self._exchange_ghosts_local
+        else:
+            self._exchange_ghosts = self._exchange_ghosts_mpi
+        self._gh_from_l = [None] * self._dim
+        self._gh_from_r = [None] * self._dim
+        self._gh_to_l = [None] * self._dim
+        self._gh_to_r = [None] * self._dim
+        for d in self._cutdir_list:
+            shape = list(self.field_out[0].data[0].shape)
+            shape[d] = self.gh_out[d]
+            self._gh_from_l[d] = npw.zeros(tuple(shape))
+            self._gh_from_r[d] = npw.zeros(tuple(shape))
+            self._gh_to_l[d] = npw.zeros(tuple(shape))
+            self._gh_to_r[d] = npw.zeros(tuple(shape))
         in_coords = (self._mesh_in.coords - self._mesh_in.origin) / \
             self._mesh_out.space_step
         self.floor_coords = np.array([np.floor(c) for c in in_coords])
         self.dist_coords = in_coords - self.floor_coords
         self._work_weight = np.array([npw.zeros_like(c) for c in self.dist_coords])
         #print type(self.dist_coords), self.dist_coords.shape, self.dist_coords[0].shape
         #print type(self._work_weight), self._work_weight.shape, self._work_weight[0].shape
         # Slices to serialize concurrent access in the coarse grid:
         # several points of the fine grid lie in the same coarse cell,
         # and the serialization avoids concurrent access.
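A sketch of that serialization idea in plain numpy, 1-D, with 4 fine points per coarse cell assumed: fine points are visited in strided groups so that within one group no two points target the same coarse cell, and each group becomes a safe vectorized accumulation.

import numpy as np

ppc = 4                                  # fine points per coarse cell, assumed
fine = np.arange(16, dtype=np.float64)   # 16 fine values -> 4 coarse cells
coarse = np.zeros(4)
for offset in xrange(ppc):               # one pass per intra-cell offset
    # fine[offset::ppc] picks exactly one point of every coarse cell,
    # so this strided slice never writes twice to the same coarse index
    # (interpolation weights are omitted here for brevity).
    coarse += fine[offset::ppc]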
@@ -66,12 +84,12 @@ class FilterFineToCoarse(DiscreteOperator):
             self._rmsh_method = Rmsh_Linear()
         if isinstance(self._rmsh_method, Rmsh_Linear):
             # Linear interpolation
-            assert np.all(gh_out >= 1), "Output data must have at least one " \
-                " ghosts in all directions (" + str(gh_out) + " given)"
+            assert np.all(self.gh_out >= 1), "Output data must have at least " \
+                "one ghost in all directions (" + str(self.gh_out) + " given)"
         elif isinstance(self._rmsh_method, L2_1):
             # L2_1 interpolation
-            assert np.all(gh_out >= 2), "Output data must have at least one " \
-                " ghosts in all directions (" + str(gh_out) + " given)"
+            assert np.all(self.gh_out >= 2), "Output data must have at least " \
+                "two ghosts in all directions (" + str(self.gh_out) + " given)"
         else:
             raise ValueError("The multiresolution filter is implemented for " +
                              "Linear or L2_1 interpolation ("
@@ -102,66 +120,66 @@ class FilterFineToCoarse(DiscreteOperator):
         self._rwork = npw.zeros(resol_out)
         self._bc_from_ghosts = []
         self._bc_to_compute = []
-        if gh_out[0] > 0:
+        if self.gh_out[0] > 0:
             # Right X-dir ghosts
             self._bc_from_ghosts.append((
                 slice(self._mesh_out.iCompute[0].stop,
-                      self._mesh_out.iCompute[0].stop + gh_out[0], None),
+                      self._mesh_out.iCompute[0].stop + self.gh_out[0], None),
                 slice(None), slice(None)))
             self._bc_to_compute.append((
                 slice(self._mesh_out.iCompute[0].start,
-                      self._mesh_out.iCompute[0].start + gh_out[0], None),
+                      self._mesh_out.iCompute[0].start + self.gh_out[0], None),
                 slice(None), slice(None)))
             # Left X-dir ghosts
             self._bc_from_ghosts.append((
-                slice(self._mesh_out.iCompute[0].start - gh_out[0],
+                slice(self._mesh_out.iCompute[0].start - self.gh_out[0],
                       self._mesh_out.iCompute[0].start, None),
                 slice(None), slice(None)))
             self._bc_to_compute.append((
-                slice(self._mesh_out.iCompute[0].stop - gh_out[0],
+                slice(self._mesh_out.iCompute[0].stop - self.gh_out[0],
                       self._mesh_out.iCompute[0].stop, None),
                 slice(None), slice(None)))
-        if gh_out[1] > 0:
+        if self.gh_out[1] > 0:
             # Right Y-dir ghosts
             self._bc_from_ghosts.append((
                 slice(None),
                 slice(self._mesh_out.iCompute[1].stop,
-                      self._mesh_out.iCompute[1].stop + gh_out[1], None),
+                      self._mesh_out.iCompute[1].stop + self.gh_out[1], None),
                 slice(None)))
             self._bc_to_compute.append((
                 slice(None),
                 slice(self._mesh_out.iCompute[1].start,
-                      self._mesh_out.iCompute[1].start + gh_out[1], None),
+                      self._mesh_out.iCompute[1].start + self.gh_out[1], None),
                 slice(None)))
             # Left Y-dir ghosts
             self._bc_from_ghosts.append((
                 slice(None),
-                slice(self._mesh_out.iCompute[1].start - gh_out[1],
+                slice(self._mesh_out.iCompute[1].start - self.gh_out[1],
                       self._mesh_out.iCompute[1].start, None),
                 slice(None)))
             self._bc_to_compute.append((
                 slice(None),
-                slice(self._mesh_out.iCompute[1].stop - gh_out[1],
+                slice(self._mesh_out.iCompute[1].stop - self.gh_out[1],
                       self._mesh_out.iCompute[1].stop, None),
                 slice(None)))
-        if gh_out[2] > 0:
+        if self.gh_out[2] > 0:
             # Right Z-dir ghosts
             self._bc_from_ghosts.append((
                 slice(None), slice(None),
                 slice(self._mesh_out.iCompute[2].stop,
-                      self._mesh_out.iCompute[2].stop + gh_out[2], None)))
+                      self._mesh_out.iCompute[2].stop + self.gh_out[2], None)))
             self._bc_to_compute.append((
                 slice(None), slice(None),
                 slice(self._mesh_out.iCompute[2].start,
-                      self._mesh_out.iCompute[2].start + gh_out[2], None)))
+                      self._mesh_out.iCompute[2].start + self.gh_out[2], None)))
             # Left Z-dir ghosts
             self._bc_from_ghosts.append((
                 slice(None), slice(None),
-                slice(self._mesh_out.iCompute[2].start - gh_out[2],
+                slice(self._mesh_out.iCompute[2].start - self.gh_out[2],
                       self._mesh_out.iCompute[2].start, None)))
             self._bc_to_compute.append((
                 slice(None), slice(None),
-                slice(self._mesh_out.iCompute[2].stop - gh_out[2],
+                slice(self._mesh_out.iCompute[2].stop - self.gh_out[2],
                       self._mesh_out.iCompute[2].stop, None)))
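To make the pairing concrete: a small plain-numpy sketch, 1-D and with ghost width and domain size assumed, of how one from-ghosts slice lines up with its to-compute counterpart for the right boundary.

import numpy as np

g = 2                                   # ghost width, assumed
start, stop = g, g + 10                 # iCompute bounds: 10 in-domain points
data = np.zeros(10 + 2 * g)
data[stop:stop + g] = 1.                # contributions landed in right ghosts
bc_from = slice(stop, stop + g, None)   # from-ghosts slice, as built above
bc_to = slice(start, start + g, None)   # to-compute slice it accumulates into
data[bc_to] += data[bc_from]            # periodic fold of the ghost layer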
     @debug
@@ -222,3 +240,65 @@ class FilterFineToCoarse(DiscreteOperator):
                 #print fr, to
                 #print v_out[d][fr].shape, v_out[d][to].shape
                 v_out[d][to] += v_out[d][fr]
+        self._exchange_ghosts()
+
+    def _exchange_ghosts_local_d(self, d):
+        s_gh = self.gh_out[d]
+        sl = [slice(None) for _ in xrange(self._dim)]
+        sl_gh = [slice(None) for _ in xrange(self._dim)]
+        sl[d] = slice(1 * s_gh, 2 * s_gh)
+        sl_gh[d] = slice(-1 * s_gh, None)
+        for v_out in self.field_out:
+            v_out.data[0][tuple(sl)] += v_out.data[0][tuple(sl_gh)]
+        sl[d] = slice(-2 * s_gh, -1 * s_gh)
+        sl_gh[d] = slice(0, 1 * s_gh)
+        for v_out in self.field_out:
+            v_out.data[0][tuple(sl)] += v_out.data[0][tuple(sl_gh)]
+
+    def _exchange_ghosts_local(self):
+        for d in xrange(self._dim):
+            self._exchange_ghosts_local_d(d)
+
+    def _exchange_ghosts_mpi_d(self, d):
+        s_gh = self.gh_out[d]
+        sl_l = [slice(None) for _ in xrange(self._dim)]
+        sl_gh_l = [slice(None) for _ in xrange(self._dim)]
+        sl_r = [slice(None) for _ in xrange(self._dim)]
+        sl_gh_r = [slice(None) for _ in xrange(self._dim)]
+        sl_l[d] = slice(1 * s_gh, 2 * s_gh)
+        sl_gh_r[d] = slice(-1 * s_gh, None)
+        sl_r[d] = slice(-2 * s_gh, -1 * s_gh)
+        sl_gh_l[d] = slice(0, 1 * s_gh)
+        for v_out in self.field_out:
+            first_cut_dir = v_out.topology.cutdir.tolist().index(True)
+            self._gh_to_l[d][...] = v_out.data[0][tuple(sl_gh_l)]
+            self._gh_to_r[d][...] = v_out.data[0][tuple(sl_gh_r)]
+            R_rk = v_out.topology.neighbours[1, d - first_cut_dir]
+            L_rk = v_out.topology.neighbours[0, d - first_cut_dir]
+            recv_r = self._comm.Irecv(
+                [self._gh_from_r[d], self._gh_from_r[d].size,
+                 HYSOP_MPI_REAL],
+                source=R_rk, tag=1234 + R_rk + 19 * d)
+            recv_l = self._comm.Irecv(
+                [self._gh_from_l[d], self._gh_from_l[d].size,
+                 HYSOP_MPI_REAL],
+                source=L_rk, tag=4321 + L_rk + 17 * d)
+            send_l = self._comm.Issend(
+                [self._gh_to_l[d], self._gh_to_l[d].size, HYSOP_MPI_REAL],
+                dest=L_rk, tag=1234 + self._comm_rank + 19 * d)
+            send_r = self._comm.Issend(
+                [self._gh_to_r[d], self._gh_to_r[d].size, HYSOP_MPI_REAL],
+                dest=R_rk, tag=4321 + self._comm_rank + 17 * d)
+            send_r.wait()
+            recv_l.wait()
+            v_out.data[0][tuple(sl_l)] += self._gh_from_l[d]
+            send_l.wait()
+            recv_r.wait()
+            v_out.data[0][tuple(sl_r)] += self._gh_from_r[d]
+
+    def _exchange_ghosts_mpi(self):
+        for d in xrange(self._dim):
+            if d in self._cutdir_list:
+                self._exchange_ghosts_mpi_d(d)
+            else:
+                self._exchange_ghosts_local_d(d)