From 77d76eee68688fe603f3c57076186ff46d7c83f4 Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr>
Date: Fri, 16 Mar 2012 09:27:43 +0000
Subject: [PATCH] Fix GPU code. There is still a bug in the sequential (numpy) code.

---
 parmepy/examples/mainJM.py                    |  11 +-
 parmepy/examples/main_Rotating_2D.py          |   6 +-
 parmepy/examples/main_Shear_2D.py             |  71 ++++
 parmepy/new_ParMePy/Operator/AdvectionDOp.py  |  19 +-
 parmepy/new_ParMePy/Operator/RemeshingDOp.py  |  53 +--
 .../Problem/DiscreteTransportProblem.py       |   2 +-
 .../new_ParMePy/Utils/GPUParticularSolver.py  |  16 +-
 parmepy/new_ParMePy/Utils/Printer.py          |  13 +-
 parmepy/new_ParMePy/Utils/gpu_src.cl          | 319 ++++++++++--------
 9 files changed, 294 insertions(+), 216 deletions(-)
 create mode 100644 parmepy/examples/main_Shear_2D.py

diff --git a/parmepy/examples/mainJM.py b/parmepy/examples/mainJM.py
index 018181072..eec917abb 100644
--- a/parmepy/examples/mainJM.py
+++ b/parmepy/examples/mainJM.py
@@ -17,14 +17,21 @@ def scalaire(x):
 def run():
     # Parameter
     dim = 3
-    nb = 32
+    nb = 128
     boxLength = [1., 1., 1.]
     boxMin = [0., 0., 0.]
     nbElem = [nb, nb, nb]
+
+    # dim = 2
+    # nb = 64
+    # boxLength = [1., 1.]
+    # boxMin = [0., 0.]
+    # nbElem = [nb, nb]
+
     timeStep = 0.02
     FinalTime = 1.
     outputFilePrefix = './res/RK2_'
-    outputModulo = 1
+    outputModulo = 2
 
     t0 = time.time()
 
diff --git a/parmepy/examples/main_Rotating_2D.py b/parmepy/examples/main_Rotating_2D.py
index bb4313c19..742a5d5cf 100644
--- a/parmepy/examples/main_Rotating_2D.py
+++ b/parmepy/examples/main_Rotating_2D.py
@@ -18,7 +18,7 @@ def scalaire(x):
 
 def run():
     dim = 2
-    nb = 100
+    nb = 128
     boxLength = [2., 2.]
     boxMin = [-1., -1.]
     nbElem = [nb, nb]
@@ -48,9 +48,9 @@ def run():
     RemeshingMethod = pp.Utils.M6Prime(grid=box.discreteDomain, gvalues=scal.discreteVariable)
     ODESolver = pp.Utils.RK2(InterpolationMethod, box.discreteDomain.applyConditions)
     ## cpu
-    solver = pp.Utils.ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod)
+    #solver = pp.Utils.ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod)
     ## gpu
-    #solver = pp.GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod, device_type='gpu')
+    solver = pp.GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod, device_type='gpu')
     cTspPb.setSolver(solver)
     cTspPb.setPrinter(pp.Utils.Printer(frequency=outputModulo, outputPrefix=outputFilePrefix, problem=cTspPb))
 
diff --git a/parmepy/examples/main_Shear_2D.py b/parmepy/examples/main_Shear_2D.py
new file mode 100644
index 000000000..5fe230160
--- /dev/null
+++ b/parmepy/examples/main_Shear_2D.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+import time
+import new_ParMePy as pp
+import math
+
+
+def vitesse(x):
+    r = math.sqrt(x[0] * x[0] + x[1] * x[1])
+    c = math.cos(3 * math.pi * r / 2.)
+    return [-c * x[1], c * x[0]]
+
+
+def scalaire(x):
+    r = math.sqrt(x[0] * x[0] + x[1] * x[1])
+    if r < 1:
+        return (1. - r * r) ** 6
+    else:
+        return 0.
+
+
+def run():
+    dim = 2
+    nb = 128
+    boxLength = [2., 2.]
+    boxMin = [-1., -1.]
+    nbElem = [nb, nb]
+
+    timeStep = 0.02
+    FinalTime = 1.
+    outputFilePrefix = './res/RK2_'
+    outputModulo = 1
+
+    t0 = time.time()
+
+    box = pp.Domain.Box(dimension=dim,
+                        length=boxLength,
+                        minPosition=boxMin)
+    velo = pp.Variable.AnalyticalVariable(domain=box,
+                                          dimension=dim,
+                                          formula=vitesse)
+    scal = pp.Variable.AnalyticalVariable(domain=box,
+                                          dimension=1,
+                                          formula=scalaire, scalar=True)
+    advec = pp.Operator.Advection(velo, scal)
+    cTspPb = pp.Problem.ContinuousTransportProblem(advection=advec)
+
+    cTspPb.discretize(dSpec=[nbElem])
+
+    InterpolationMethod = pp.Utils.Linear(grid=box.discreteDomain, gvalues=velo.discreteVariable)
+    RemeshingMethod = pp.Utils.M6Prime(grid=box.discreteDomain, gvalues=scal.discreteVariable)
+    ODESolver = pp.Utils.RK2(InterpolationMethod, box.discreteDomain.applyConditions)
+    ## cpu
+    #solver = pp.Utils.ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod)
+    ## gpu
+    solver = pp.GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod, device_type='gpu')
+    cTspPb.setSolver(solver)
+    cTspPb.setPrinter(pp.Utils.Printer(frequency=outputModulo, outputPrefix=outputFilePrefix, problem=cTspPb))
+
+    t1 = time.time()
+
+    cTspPb.solve(T=FinalTime, dt=timeStep)
+
+    tf = time.time()
+
+    print "Total time : ", tf - t0, "sec (CPU)"
+    print "Init time : ", t1 - t0, "sec (CPU)"
+    print "Solving time : ", tf - t1, "sec (CPU)"
+
+
+if __name__ == "__main__":
+    run()
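
The velocity field in this new example is a differential rotation: the angular velocity cos(3*pi*r/2) changes sign at r = 1/3 and vanishes at r = 1, so the level sets of the scalar bump are sheared against each other. The field is divergence-free, so the scalar should only be transported. A minimal symbolic check of that property (sympy is an assumption here, not a project dependency):

    import sympy as sp

    x, y = sp.symbols('x y', real=True)
    r = sp.sqrt(x**2 + y**2)
    c = sp.cos(3 * sp.pi * r / 2)
    u, v = -c * y, c * x  # the field returned by vitesse() above
    assert sp.simplify(sp.diff(u, x) + sp.diff(v, y)) == 0
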
diff --git a/parmepy/new_ParMePy/Operator/AdvectionDOp.py b/parmepy/new_ParMePy/Operator/AdvectionDOp.py
index 9af9d2d96..b9ea41436 100644
--- a/parmepy/new_ParMePy/Operator/AdvectionDOp.py
+++ b/parmepy/new_ParMePy/Operator/AdvectionDOp.py
@@ -56,9 +56,6 @@ class AdvectionDOp(DiscreteOperator):
         @param dt : time step for advection.
         @param splittingDirection : direction to advect.
         """
-        #for i in self.velocity.domain.explore():
-        #    self.resultPosition.values[i] = self.velocity.domain.applyConditions(self.numMethod.integrate(self.velocity.domain.points[i], self.velocity.values[i], t, dt, splittingDirection))
-            #self.resultScalar.values[i] = self.scalar.values[i]
         if self.gpu_kernel is None:
             c_time = time.time()
             ## ------------------- NumPy Optimisation
@@ -67,14 +64,22 @@ class AdvectionDOp(DiscreteOperator):
             ## -------------------
             self.compute_time += (time.time() - c_time)
         else:
-            evt = self.gpu_kernel(self.gpu_queue, self.gpu_shape, None,
+            shape = list(self.gpu_shape)
+            nb_ligne = shape.pop(splittingDirection)
+            evt = self.gpu_kernel(self.gpu_queue, tuple(shape), None,
                 self.velocity.gpu_mem_object,
                 self.scalar.gpu_mem_object,
                 self.resultPosition.gpu_mem_object,
                 self.resultScalar.gpu_mem_object,
-                np.float32(t), np.float32(dt), np.uint(splittingDirection),
-                np.concatenate((self.velocity.domain.min, [1.])).astype(np.float32),
-                np.concatenate((self.velocity.domain.max, [1.])).astype(np.float32))
+                np.float32(t), np.float32(dt), np.uint32(splittingDirection), np.uint32(nb_ligne),
+                self.velocity.domain.min.astype(np.float32),
+                self.velocity.domain.max.astype(np.float32),
+                self.velocity.domain.elementNumber.astype(np.uint32),
+                self.velocity.domain.elementSize.astype(np.float32)
+                )
+            # self.gpu_queue.finish()
+            # cl.enqueue_copy(self.gpu_queue, self.resultPosition.values, self.resultPosition.gpu_mem_object)
+            # print self.resultPosition.values
             self.gpu_queue.finish()
             self.queued_time += (evt.profile.submit - evt.profile.queued)
             self.submit_time += (evt.profile.start - evt.profile.submit)
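
The integration kernel is no longer enqueued with one work-item per grid point: the extent along the splitting direction is popped out of the global shape and passed as nb_ligne, so each work-item advances a whole line of particles and the loop over the line lives inside the kernel. A minimal sketch of this launch geometry (hypothetical helper name):

    def launch_shape(gpu_shape, splitting_direction):
        # One work-item per grid line orthogonal to the splitting
        # direction; the popped extent is the number of points the
        # kernel iterates over along its line.
        shape = list(gpu_shape)
        nb_ligne = shape.pop(splitting_direction)
        return tuple(shape), nb_ligne

    assert launch_shape((64, 32, 16), 1) == ((64, 16), 32)
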
diff --git a/parmepy/new_ParMePy/Operator/RemeshingDOp.py b/parmepy/new_ParMePy/Operator/RemeshingDOp.py
index 9efb9a8b3..745c927ec 100644
--- a/parmepy/new_ParMePy/Operator/RemeshingDOp.py
+++ b/parmepy/new_ParMePy/Operator/RemeshingDOp.py
@@ -65,31 +65,6 @@ class RemeshingDOp(DiscreteOperator):
         """
         Apply Remeshing operator.
         """
-        #reset scalar
-        #for i in self.resultScalar.domain.explore():
-        #    self.resultScalar.values[i] = 0.
-        #### Remaillage tag:
-        # if isinstance(self.numMethod, Lambda2):
-        #     for i in self.resultScalar.domain.explore():
-        #         # Particules Taguées Centrées (tag = 1, bloc Type = 1)
-        #         if self.ptag.values[i] == 1 and self.pbloc.values[i] == 1:
-        #             ip = tuple([(i[d] + 1) % self.resultScalar.domain.elementNumber[d] if d == splittingDirection else i[d] for d in xrange(len(i))])
-        #             self.numMethod.remesh_Tag_Center(self.ppos.values[i], self.pscal.values[i], self.ppos.values[ip], self.pscal.values[ip], splittingDirection)
-        #         # Particules Taguées Gauche (tag = 1, bloc Type = 0)
-        #         elif self.ptag.values[i] == 1 and self.pbloc.values[i] == 0:
-        #             ip = tuple([(i[d] + 1) % self.resultScalar.domain.elementNumber[d] if d == splittingDirection else i[d] for d in xrange(len(i))])
-        #             self.numMethod.remesh_Tag_Left(self.ppos.values[i], self.pscal.values[i], self.ppos.values[ip], self.pscal.values[ip], splittingDirection)
-        #         # Particules non-Taguées Centrées (tag = 0, bloc Type = 1)
-        #         elif self.ptag.values[i] == 0 and self.pbloc.values[i] == 1:
-        #             self.numMethod.remesh_noTag_Center(self.ppos.values[i], self.pscal.values[i], splittingDirection)
-        #         # Particules non-Taguées Gauche (tag = 0, bloc Type = 0)
-        #         elif self.ptag.values[i] == 0 and self.pbloc.values[i] == 0:
-        #             self.numMethod.remesh_noTag_Left(self.ppos.values[i], self.pscal.values[i], splittingDirection)
-        #         elif self.ptag.values[i] != 2:
-        #             raise Exception("Bad tag or bloc type : (tag = {0}, type = {1})".format(self.ptag.values[i], self.pbloc.values[i]))
-        # else:
-        #for i in self.resultScalar.domain.explore():
-        #    self.numMethod.remesh(self.ppos.values[i], self.pscal.values[i], splittingDirection)
         c_time = time.time()
         if self.gpu_kernel is None:
             c_time = time.time()
@@ -99,26 +74,24 @@ class RemeshingDOp(DiscreteOperator):
             ## -------------------
             self.compute_time += (time.time() - c_time)
         else:
-            evt = self.gpu_kernel[0](self.gpu_queue, self.gpu_shape, None,
-                self.ppos.gpu_mem_object, self.pscal.gpu_mem_object, self.resultScalar.gpu_mem_object, self.gpu_buffer,
-                np.float32(t), np.float32(dt), np.int32(splittingDirection),
-                np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32),
-                np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32))
+            shape = list(self.gpu_shape)
+            nb_ligne = shape.pop(splittingDirection)
+            evt = self.gpu_kernel(self.gpu_queue, tuple(shape), None,
+                self.ppos.gpu_mem_object, self.pscal.gpu_mem_object, self.resultScalar.gpu_mem_object,
+                np.float32(t), np.float32(dt), np.uint32(splittingDirection), np.uint32(nb_ligne),
+                self.resultScalar.domain.min.astype(np.float32),
+                self.resultScalar.domain.max.astype(np.float32),
+                self.resultScalar.domain.elementNumber.astype(np.uint32),
+                self.resultScalar.domain.elementSize.astype(np.float32)
+                )
+            # self.gpu_queue.finish()
+            # cl.enqueue_copy(self.gpu_queue, self.resultScalar.values, self.resultScalar.gpu_mem_object)
+            # print self.resultScalar.values
             self.gpu_queue.finish()
             self.queued_time += (evt.profile.submit - evt.profile.queued)
             self.submit_time += (evt.profile.start - evt.profile.submit)
             self.compute_time += (evt.profile.end - evt.profile.start)
             #print "Remeshing M'6:", self.compute_time * 1e-9
-            evt = self.gpu_kernel[1](self.gpu_queue, self.gpu_shape, None,
-                self.resultScalar.gpu_mem_object, self.gpu_buffer,
-                np.float32(t), np.float32(dt), np.int32(splittingDirection),
-                np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32),
-                np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32))
-            self.gpu_queue.finish()
-            self.queued_time += (evt.profile.submit - evt.profile.queued)
-            self.submit_time += (evt.profile.start - evt.profile.submit)
-            self.compute_time += (evt.profile.end - evt.profile.start)
-            #print "Remeshing reduce:", self.compute_time * 1e-9
 
     def __str__(self):
         """ToString method"""
diff --git a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py
index e42a7471a..224d311d0 100644
--- a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py
+++ b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py
@@ -47,7 +47,7 @@ class DiscreteTransportProblem(DiscreteProblem):
         @param dt : Simulation time step.
         """
         ite = 0
-        if self.printer != None:
+        if self.printer is not None:
             while self.t < T:
                 [[op.applyOperator(self.t, dt * i[1], i[0]) for op in self.operators] for i in self.splittingStep]
                 self.t += dt
diff --git a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py
index c9183718d..69aaac6e0 100644
--- a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py
+++ b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py
@@ -121,13 +121,15 @@ class GPUParticularSolver(ParticularSolver):
         discreteProblem.advecOp.gpu_kernel = self.prg.integrate
         remesh.gpu_queue = self.queue
         remesh.gpu_shape = self.gpu_shape
-        remesh.gpu_kernel = (self.prg.remeshing, self.prg.remeshing_reduce)
-        remesh.gpu_buffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=self.gscal.values.nbytes * 8)
-        self.gpu_used_mem += remesh.gpu_buffer.size
-        print "Allocation remeshing buffer on gpu, size:", remesh.gpu_buffer.size, 'B'
-        buffer_shape = list(self.gpu_shape)
-        buffer_shape.append(8)
-        remesh.gpu_buffer_values = np.zeros(tuple(buffer_shape), dtype=np.float32)
+        remesh.gpu_kernel = self.prg.remeshing
+
+        # Pad the domain parameters to match the float4/uint4 OpenCL kernel arguments
+        padding_float = np.zeros(4 - self.grid.dimension, dtype=np.float32)
+        padding_uint = np.zeros(4 - self.grid.dimension, dtype=np.uint)
+        self.grid.min = np.concatenate((self.grid.min, padding_float)).astype(np.float32)
+        self.grid.max = np.concatenate((self.grid.max, padding_float)).astype(np.float32)
+        self.grid.elementSize = np.concatenate((self.grid.elementSize, padding_float)).astype(np.float32)
+        self.grid.elementNumber = np.concatenate((self.grid.elementNumber, padding_uint)).astype(np.uint32)
 
         self.queue.finish()
         p = self.gpu_used_mem * 100 / (self.device.global_mem_size * 1.)
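
OpenCL float4/uint4 arguments always carry four components, so the 2-D or 3-D domain descriptors are zero-padded to length 4 once at setup instead of being rebuilt with np.concatenate at every kernel call. Roughly (a sketch, assuming the descriptors are 1-D arrays of length grid.dimension):

    import numpy as np

    def pad_to_vec4(arr, dtype):
        # Zero-pad a dim-length descriptor to the 4 components of an
        # OpenCL vector argument.
        out = np.zeros(4, dtype=dtype)
        out[:len(arr)] = arr
        return out

    assert pad_to_vec4([2., 2.], np.float32).tolist() == [2., 2., 0., 0.]
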
diff --git a/parmepy/new_ParMePy/Utils/Printer.py b/parmepy/new_ParMePy/Utils/Printer.py
index c90b14bf3..b30981eb8 100644
--- a/parmepy/new_ParMePy/Utils/Printer.py
+++ b/parmepy/new_ParMePy/Utils/Printer.py
@@ -30,24 +30,23 @@ class Printer():
             self.printStep = self._printStep
         else:
             self.printStep = self._passStep
+        self.skip = 1
         self.grid = self.problem.discreteProblem.advecOp.velocity.domain
         self.gvelo = self.problem.discreteProblem.advecOp.velocity.values
         self.ppos = self.problem.discreteProblem.solver.ppos.values
         self.pscal = self.problem.discreteProblem.solver.pscal.values
         self.gscal = self.problem.discreteProblem.advecOp.scalar.values
-
-        #self.pvelo = self.problem.discreteProblem.solver.pvelo
-        # self.pbloc = self.problem.discreteProblem.solver.pbloc
-        # self.ptag = self.problem.discreteProblem.solver.ptag
         self.printStep()
 
     def _passStep(self):
         pass
 
     def _printStep(self):
-        if (self.ite % self.freq) == 0:
+        self.skip -= 1
+        if self.skip == 0:
+            self.skip = self.freq
             f = open(self.outputPrefix + "results_{0:05d}.dat".format(self.ite), 'w')
-            dim = len(self.grid.elementNumber)
+            dim = self.grid.dimension
             f.write("# iteration : {0}, time = {1}".format(self.ite, self.problem.discreteProblem.t))
             f.write("# gid({0})  gpos({1}) gvelo({2}) gscal(1) ppos({3}) pvelo({4}) pscal(1) pbloc(1) ptag(1)\n".format(dim, dim, dim, dim, dim))
             if isinstance(self.problem.discreteProblem.solver, GPUParticularSolver):
@@ -98,7 +97,7 @@ class Printer():
                             f.write("\n")
                 f.write("\n")
             f.close()
-        self.ite += 1
+            self.ite += 1
 
 
 if __name__ == "__main__":
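
The modulo test on self.ite is replaced by a countdown: skip starts at 1 so the first call writes a file, then every freq-th call does, and ite now counts written files rather than solver iterations. A toy model of the new cadence (hypothetical helper, assuming freq >= 1):

    def calls_that_write(freq, n_calls):
        # Countdown model: write on the first call, then every freq-th.
        skip, written = 1, []
        for call in range(n_calls):
            skip -= 1
            if skip == 0:
                skip = freq
                written.append(call)
        return written

    assert calls_that_write(2, 6) == [0, 2, 4]
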
diff --git a/parmepy/new_ParMePy/Utils/gpu_src.cl b/parmepy/new_ParMePy/Utils/gpu_src.cl
index ec0e6a207..b671bff9b 100644
--- a/parmepy/new_ParMePy/Utils/gpu_src.cl
+++ b/parmepy/new_ParMePy/Utils/gpu_src.cl
@@ -1,84 +1,105 @@
-size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec);
-#if ORDER == 'F'
-size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec)
-{
-    //F order
-#if DIM == 2
-      return ind[0] + ind[1] * get_global_size(0) + ind[2] * get_global_size(0) * get_global_size(1) + d * get_global_size(0) * get_global_size(1) * get_global_size(2);
-#else
-      return ind[0] + ind[1] * get_global_size(0) + d * get_global_size(0) * get_global_size(1);
-#endif
-}
-#else
-size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec)
-{
-  //C order
-#if DIM == 3
-    return dim_vec * (ind[2] + ind[1] * get_global_size(2) + ind[0] * get_global_size(2) * get_global_size(1)) + d;
-#else
-    return dim_vec * (ind[1] + ind[0] * get_global_size(1)) + d;
-#endif
-}
-#endif
-
-
 // Integration Kernel (RK2 method)
 __kernel void integrate(__global const float* gvelo,
-			__global const float* gscal,
+			__global float* gscal,
 			__global float* ppos,
 			__global float* pscal,
-			 float t,
-			 float dt,
-			 size_t dir,
-			 float4 min_v,
-			 float4 max_v
+			float t,
+			float dt,
+			size_t dir,
+			size_t line_nb_pts,
+			float4 min_v,
+			float4 max_v,
+			uint4 nb,
+			float4 size
 			)
 {
-  __private size_t i, stride_vec, stride_part_dir, ind, ind_scal, ind_interp_dir, ind_shift, ind_shift_p;
-  __private size_t ind_d[DIM], elementNumber[DIM];
-  __private float v_loc_dir, ppos_loc_dir, x0;
-  __private float ppos_loc_d[DIM], elementSize[DIM];
-  stride_vec = 1;
-  stride_part_dir = DIM;
-  // Compute global index and buffer index
-  // Compute element number and size
-  // Compute grid position from index
-  for(i=0;i<DIM;i++)
+  __private size_t i;  // DIM loop index
+  __private size_t p;  // Loop index over the particles of the line
+  __private size_t stride_vec;  // Buffer stride between two components of the same particle
+  __private size_t stride_along_dir;  // Buffer stride between two particles of the same line
+  __private size_t ind_line;  // Buffer index of the first point of the line
+  __private size_t i_ind;  // Line index of the left interpolation point
+  __private size_t i_ind_p;  // Line index of the right interpolation point
+  __private int ind_c;  // Index of the nearest left grid point
+  __private float x0;  // Normalized distance to the nearest left grid point
+  __private float size_line;  // Space step along the line
+  __private float p_velo;  // Velocity interpolated at the particle
+  __private float ppos_c;  // Temporary particle position
+  __private float line_c[DIM];  // Final particle position
+  // Compute strides and indices; these depend on the array ordering (C order here)
+  stride_vec = 1; // Vector components are stored contiguously in the buffers
+  if(DIM == 3)
     {
-      ind_d[i] = get_global_id(i);
-      elementNumber[i] = get_global_size(i);
-      elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]);
-      ppos_loc_d[i] = ind_d[i]*elementSize[i] + min_v[i];
+      if (dir == 0)
+	{
+	  stride_along_dir = DIM * nb[2] * nb[1]; // step of one point in direction 0
+	  ind_line = DIM * (get_global_id(1) + get_global_id(0)*nb[2]); // index of point (0,y,z), component 0 = DIM*(z + y*nb[2])
+	  line_c[1] = min_v[1] + get_global_id(0)*size[1];
+	  line_c[2] = min_v[2] + get_global_id(1)*size[2];
+	  size_line = size[0];
+	}
+      else if (dir == 1)
+	{
+	  stride_along_dir = DIM * nb[2]; // step of one point in direction 1
+	  ind_line = DIM * (get_global_id(1) + get_global_id(0) * nb[2] * nb[1]); // index of point (x,0,z), component 0 = DIM*(z + x*nb[2]*nb[1])
+	  line_c[0] = min_v[0] + get_global_id(0)*size[0];
+	  line_c[2] = min_v[2] + get_global_id(1)*size[2];
+	  size_line = size[1];
+	}
+      else
+	{
+	  stride_along_dir = DIM; // step of one point in direction 2
+	  ind_line = DIM * (get_global_id(1) * nb[2] + get_global_id(0) * nb[2] * nb[1]); // index of point (x,y,0), component 0 = DIM*(y*nb[2] + x*nb[2]*nb[1])
+	  line_c[0] = min_v[0] + get_global_id(0)*size[0];
+	  line_c[1] = min_v[1] + get_global_id(1)*size[1];
+	  size_line = size[2];
+	}
     }
-  // Compute buffer index from global index (vector|scalar buffer)
-  ind = space_index_to_buffer_index(ind_d, 0, DIM);
-  ind_scal = space_index_to_buffer_index(ind_d, 0, 1);
-  // Buffer read velocity (dir component)
-  v_loc_dir = gvelo[ind + dir * stride_vec];
-  // RK2 intermediar point (dir component)
-  ppos_loc_dir = ppos_loc_d[dir] + 0.5f * dt * v_loc_dir;
-  // Compute left index of intermediar point (dir component)
-  ind_interp_dir = (int)((ppos_loc_dir - min_v[dir]) / elementSize[dir]);
-  // Compute distance to left grid point
-  x0 = (ppos_loc_dir - ind_interp_dir * elementSize[dir] - min_v[dir]) / elementSize[dir];
-  // Compute effective shift grid index to read Velocity
-  ind_shift = ((ind_interp_dir + elementNumber[dir]) % elementNumber[dir]) - ind_d[dir];
-  ind_shift_p = ((ind_interp_dir + 1 + elementNumber[dir]) % elementNumber[dir]) - ind_d[dir];
-  // Compute interpolatex velocity
-  v_loc_dir = (1.0f - x0) * gvelo[ind + ind_shift * stride_part_dir] + x0 * gvelo[ind + ind_shift_p * stride_part_dir];
-  // Compute final position
-  ppos_loc_dir = ppos_loc_d[dir] + dt * v_loc_dir;
-  // Applying boundary conditions on new position
-  if (ppos_loc_dir >= max_v[dir])
-    ppos_loc_d[dir] = ppos_loc_dir - max_v[dir] + min_v[dir];
-  else if (ppos_loc_dir < min_v[dir])
-    ppos_loc_d[dir] = ppos_loc_dir + max_v[dir] - min_v[dir];
   else
-    ppos_loc_d[dir] = ppos_loc_dir;
-  // Writing particles position and scalar to results buffer
-  for(i=0; i<DIM; i++)
-    ppos[ind + i*stride_vec] = ppos_loc_d[i];
-  pscal[ind_scal] = gscal[ind_scal];
+    {
+      if (dir == 0)
+	{
+	  stride_along_dir = DIM * nb[1]; // step of one point in direction 0
+	  ind_line = DIM * (get_global_id(0)); // index of point (0,y), component 0
+	  line_c[1] = min_v[1] + get_global_id(0)*size[1];
+	  size_line = size[0];
+	}
+      else
+	{
+	  stride_along_dir = DIM; // step of one point in direction 1
+	  ind_line = DIM * (get_global_id(0) * nb[1]); // index of point (x,0), component 0
+	  line_c[0] = min_v[0] + get_global_id(0)*size[0];
+	  size_line = size[1];
+	}
+    }
+  for(p=0;p<line_nb_pts;p++)
+    {
+      // Initialize the particle scalar from the grid scalar
+      pscal[(ind_line + p*stride_along_dir)/DIM] = gscal[(ind_line + p*stride_along_dir)/DIM];
+      // Compute the RK2 intermediate point
+      ppos_c = min_v[dir] + p*size_line + 0.5f*dt*gvelo[ind_line + p*stride_along_dir + dir*stride_vec];
+      // Index of the nearest left grid point
+      ind_c = convert_int_rtn((ppos_c - min_v[dir])/size_line);
+      // Normalized distance to the nearest left grid point
+      x0 = (ppos_c - ind_c * size_line - min_v[dir]) / size_line;
+      // Periodic wrap of the interpolation indices along the line
+      i_ind = (ind_c + line_nb_pts) % line_nb_pts;
+      i_ind_p = (ind_c + 1 + line_nb_pts) % line_nb_pts;
+      // Interpolate velocity
+      p_velo = (1.0f - x0) * gvelo[ind_line + i_ind * stride_along_dir + dir*stride_vec] + x0 * gvelo[ind_line + i_ind_p * stride_along_dir + dir*stride_vec];
+      // Compute final position
+      ppos_c = min_v[dir] + p*size_line + dt*p_velo;
+      // Apply periodic boundary conditions
+      if (ppos_c >= max_v[dir])
+	line_c[dir] = ppos_c - max_v[dir] + min_v[dir];
+      else if (ppos_c < min_v[dir])
+	line_c[dir] = ppos_c + max_v[dir] - min_v[dir];
+      else
+	line_c[dir] = ppos_c;
+      // Write result in buffer
+      for(i=0; i<DIM; i++)
+	ppos[ind_line + p*stride_along_dir + i*stride_vec] = line_c[i];
+    }
 }
 
 
@@ -86,88 +107,88 @@ __kernel void integrate(__global const float* gvelo,
 __kernel void remeshing(__global const float* part,
 			__global const float* pscal,
 			__global float* gscal,
-			__global float *tmp,
-			 float t,
-			 float dt,
-			 int dir,
-			 float4 min_v,
-			 float4 max_v
+			float t,
+			float dt,
+			int dir,
+			size_t line_nb_pts,
+			float4 min_v,
+			float4 max_v,
+			uint4 nb,
+			float4 size
 			)
 {
-  __private size_t i;
-  __private size_t s_ind[DIM];
-  __private size_t elementNumber[DIM];
-  __private float elementSize[DIM];
-  __private float s_part[DIM],s_pscal;
-  __private size_t s_ind_im2[DIM],s_ind_im[DIM],s_ind_i0[DIM],s_ind_ip[DIM],s_ind_ip2[DIM],s_ind_ip3[DIM];
-  __private float x0, am2, am, a0, ap, ap2, ap3;
-  // Compute global index and global size
-  for(i=0; i<DIM; i++)
+  __private size_t i;  // DIM loop index
+  __private size_t p;  // Loop index over the particles of the line
+  __private size_t stride_vec;  // Buffer stride between two components of the same particle
+  __private size_t stride_along_dir;  // Buffer stride between two particles of the same line
+  __private size_t ind_line;  // Buffer index of the first point of the line
+  __private size_t r_ind;  // Wrapped line index of the current remeshing target point
+  __private int ind_c;  // Index of the nearest left grid point
+  __private float x0;  // Normalized distance to the nearest left grid point
+  __private float size_line;  // Space step along the line
+  __private float ppos_c;  // Particle position along the line
+  __private float pscal_c;  // Particle scalar
+  __private float weights[6];  // M6' remeshing weights
+  // Compute strides and indices; these depend on the array ordering (C order here)
+  stride_vec = 1; // Vector components are stored contiguously in the buffers
+  if(DIM == 3){
+  if (dir == 0)
     {
-      s_ind[i] = get_global_id(i);
-      elementNumber[i] = get_global_size(i);
+      stride_along_dir = DIM * nb[2] * nb[1]; // step of one point in direction 0
+      ind_line = DIM * (get_global_id(1) + get_global_id(0)*nb[2]); // index of point (0,y,z), component 0 = DIM*(z + y*nb[2])
+      size_line = size[0];
     }
-  // Compute element size
-  for(i=0; i<DIM; i++)
-    elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]);
-  // Read buffers
-  for(i=0; i<DIM; i++)
-      s_part[i] = part[space_index_to_buffer_index(s_ind, i, DIM)];
-  s_pscal = pscal[space_index_to_buffer_index(s_ind, 0, 1)];
-  // Compute left nearest index and copy to other index
-  for(i=0; i<DIM; i++)
+  else if (dir == 1)
     {
-      s_ind_i0[i] = (int)((s_part[i] - min_v[i])/elementSize[i]+0.000001f);
-      s_ind_ip[i] = s_ind_i0[i];
-      s_ind_ip2[i] = s_ind_i0[i];
-      s_ind_ip3[i] = s_ind_i0[i];
-      s_ind_im[i] = s_ind_i0[i];
-      s_ind_im2[i] = s_ind_i0[i];
+      stride_along_dir = DIM * nb[2]; // step of one point in direction 1
+      ind_line = DIM * (get_global_id(1) + get_global_id(0) * nb[2] * nb[1]); // index of point (x,0,z), component 0 = DIM*(z + x*nb[2]*nb[1])
+      size_line = size[1];
     }
-  s_ind_im2[dir] = (s_ind_im2[dir] - 2 + elementNumber[dir]) % elementNumber[dir];
-  s_ind_im[dir] = (s_ind_im[dir] - 1 + elementNumber[dir]) % elementNumber[dir];
-  s_ind_i0[dir] = (s_ind_i0[dir] + elementNumber[dir]) % elementNumber[dir];
-  s_ind_ip[dir] = (s_ind_ip[dir] + 1 + elementNumber[dir]) % elementNumber[dir];
-  s_ind_ip2[dir] = (s_ind_ip2[dir] + 2 + elementNumber[dir]) % elementNumber[dir];
-  s_ind_ip3[dir] = (s_ind_ip3[dir] + 3 + elementNumber[dir]) % elementNumber[dir];
-  // Compute distance to left nearest point
-  x0 = (s_part[dir] - s_ind_i0[dir] * elementSize[dir] - min_v[dir]) / elementSize[dir];
-  // Compute remeshing weights
-  am2 = (-(x0) * (5.0f * (x0 + 2.0f) - 8.0f) * (x0 - 1.0f) * (x0 - 1.0f) * (x0 - 1.0f)) / 24.0f;
-  am = x0 * (x0 - 1.0f) * (25.0f * (x0 + 1.0f)*(x0 + 1.0f)*(x0 + 1.0f) - 114.0f * (x0 + 1.0f) * (x0 + 1.0f) + 153.0f * (x0 + 1.0f) - 48.0f) / 24.0f;
-  a0 = -(x0 - 1.0f) * (25.0f * x0*x0*x0*x0 - 38.0f * x0*x0*x0 - 3.0f * x0*x0 + 12.0f * x0 + 12.0f) / 12.0f;
-  ap = x0 * (25.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 38.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 3.0f * (1.0f - x0)*(1.0f - x0) + 12.0f * (1.0f - x0) + 12.0f) / 12.0f;
-  ap2 = (1.0f - x0) * (-x0) * (25.0f * (2.0f - x0)*(2.0f - x0)*(2.0f - x0) - 114.0f * (2.0f - x0)*(2.0f - x0) + 153.0f * (2.0f - x0) - 48.0f) / 24.0f;
-  ap3 = -(1.0f - x0) * ((5.0f * (3.0f - x0) - 8.0f) * (-x0)*(-x0)*(-x0)) / 24.0f;
-  // Write temp result
-  tmp[space_index_to_buffer_index(s_ind_im2, 0, 8)] += am2 * s_pscal;
-  tmp[space_index_to_buffer_index(s_ind_im, 1, 8)] += am * s_pscal;
-  tmp[space_index_to_buffer_index(s_ind_i0, 2, 8)] += a0 * s_pscal;
-  tmp[space_index_to_buffer_index(s_ind_ip, 3, 8)] += ap * s_pscal;
-  tmp[space_index_to_buffer_index(s_ind_ip2, 4, 8)] += ap2 * s_pscal;
-  tmp[space_index_to_buffer_index(s_ind_ip3, 5, 8)] += ap3 * s_pscal;
-}
+  else
+    {
+      stride_along_dir = DIM; // step of one point in direction 2
+      ind_line = DIM * (get_global_id(1) * nb[2] + get_global_id(0) * nb[2] * nb[1]); // index of point (x,y,0), component 0 = DIM*(y*nb[2] + x*nb[2]*nb[1])
+      size_line = size[2];
+    }}
+  else{
+    if (dir == 0)
+    {
+      stride_along_dir = DIM * nb[1]; // step of one point in direction 0
+      ind_line = DIM * (get_global_id(0)); // index of point (0,y), component 0
+      size_line = size[0];
+    }
+    else
+      {
+	stride_along_dir = DIM; // step of one point in direction 1
+	ind_line = DIM * (get_global_id(0) * nb[1]); // index of point (x,0), component 0
+	size_line = size[1];
+      }
+  }
 
-__kernel void remeshing_reduce(__global float* gscal,
-			       __global float *tmp,
-			        float t,
-			        float dt,
-			        int dir,
-			        float4 min,
-			        float4 max)
-{
-  __private size_t i;
-  __private size_t s_ind[DIM];
-  __private float sum = 0.0f;
-  // Compute global index and global size
-  for(i=0; i<DIM; i++)
-      s_ind[i] = get_global_id(i);
-  // Reduce grid point weights
-  for(i=0;i<6;i++)
-    sum += tmp[space_index_to_buffer_index(s_ind, i, 8)];
-  // Write grid point new scalar
-  gscal[space_index_to_buffer_index(s_ind, 0, 1)] = sum;
-  // reset buffer
-  for(i=0;i<6;i++)
-    tmp[space_index_to_buffer_index(s_ind, i, 8)] = 0.0f;
+  // Reset the grid scalar to zero along the line before accumulating
+  for(p=0;p<line_nb_pts;p++)
+      gscal[(ind_line + p*stride_along_dir)/DIM] = 0.0f;
+  for(p=0;p<line_nb_pts;p++)
+    {
+      // Read the particle position along the line
+      ppos_c = part[ind_line + p*stride_along_dir + dir*stride_vec];
+      // Read the particle scalar
+      pscal_c = pscal[(ind_line + p*stride_along_dir)/DIM];
+      // Index of the nearest left grid point
+      ind_c = convert_int_rtn((ppos_c - min_v[dir])/size_line);
+      // Normalized distance to the nearest left grid point
+      x0 = (ppos_c - ind_c * size_line - min_v[dir]) / size_line;
+      // Compute the six M6' remeshing weights
+      weights[0] = (-(x0) * (5.0f * (x0 + 2.0f) - 8.0f) * (x0 - 1.0f) * (x0 - 1.0f) * (x0 - 1.0f)) / 24.0f;
+      weights[1] = x0 * (x0 - 1.0f) * (25.0f * (x0 + 1.0f)*(x0 + 1.0f)*(x0 + 1.0f) - 114.0f * (x0 + 1.0f) * (x0 + 1.0f) + 153.0f * (x0 + 1.0f) - 48.0f) / 24.0f;
+      weights[2] = -(x0 - 1.0f) * (25.0f * x0*x0*x0*x0 - 38.0f * x0*x0*x0 - 3.0f * x0*x0 + 12.0f * x0 + 12.0f) / 12.0f;
+      weights[3] = x0 * (25.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 38.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 3.0f * (1.0f - x0)*(1.0f - x0) + 12.0f * (1.0f - x0) + 12.0f) / 12.0f;
+      weights[4] = (1.0f - x0) * (-x0) * (25.0f * (2.0f - x0)*(2.0f - x0)*(2.0f - x0) - 114.0f * (2.0f - x0)*(2.0f - x0) + 153.0f * (2.0f - x0) - 48.0f) / 24.0f;
+      weights[5] = -(1.0f - x0) * ((5.0f * (3.0f - x0) - 8.0f) * (-x0)*(-x0)*(-x0)) / 24.0f;
+      // Scatter the weighted scalar onto the six surrounding grid points (periodic wrap)
+      for(i=0;i<6;i++){
+	r_ind = (ind_c-2+i+line_nb_pts)%line_nb_pts;
+	gscal[(ind_line + r_ind*stride_along_dir)/DIM] += weights[i] * pscal_c;
+      }
+    }
 }
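
The six weights above are the M6' remeshing kernel; for any normalized offset x0 in [0, 1] they sum to 1, which is what makes the remeshing conserve the total scalar. A direct numpy transcription with a spot check of that partition of unity (a verification sketch, not project code):

    import numpy as np

    def m6prime_weights(x0):
        # Direct transcription of the six weights in the remeshing kernel.
        return np.array([
            (-x0 * (5. * (x0 + 2.) - 8.) * (x0 - 1.) ** 3) / 24.,
            x0 * (x0 - 1.) * (25. * (x0 + 1.) ** 3 - 114. * (x0 + 1.) ** 2
                              + 153. * (x0 + 1.) - 48.) / 24.,
            -(x0 - 1.) * (25. * x0 ** 4 - 38. * x0 ** 3 - 3. * x0 ** 2
                          + 12. * x0 + 12.) / 12.,
            x0 * (25. * (1. - x0) ** 4 - 38. * (1. - x0) ** 3
                  - 3. * (1. - x0) ** 2 + 12. * (1. - x0) + 12.) / 12.,
            (1. - x0) * (-x0) * (25. * (2. - x0) ** 3 - 114. * (2. - x0) ** 2
                                 + 153. * (2. - x0) - 48.) / 24.,
            -(1. - x0) * ((5. * (3. - x0) - 8.) * (-x0) ** 3) / 24.,
        ])

    for x0 in np.linspace(0., 1., 11):
        assert abs(m6prime_weights(x0).sum() - 1.) < 1e-12
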
-- 
GitLab