From def467499b30793c413d53403e27b2ca1184c738 Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr>
Date: Mon, 4 Mar 2013 07:45:46 +0000
Subject: [PATCH] Add m8 remeshing

---
 HySoP/hysop/particular_solvers/gpu.py         |  97 +++++++---
 ...and_remesh_m8prime_2D_opt4_builtin_noBC.cl | 107 +++++++++++
 ...and_remesh_m8prime_2D_opt8_builtin_noBC.cl | 177 ++++++++++++++++++
 ...remesh_m8prime_3D_opt4_builtin_private4.cl | 115 ++++++++++++
 ...h_m8prime_3D_opt4_builtin_private4_noBC.cl | 115 ++++++++++++
 .../gpu_src/remeshing_m8prime_2D_opt4_noBC.cl | 102 ++++++++++
 .../remeshing_m8prime_3D_opt4_private4.cl     |  95 ++++++++++
 .../gpu_src/weights_m8_prime.cl               |  31 +++
 .../gpu_src/weights_m8prime_builtin.cl        |  33 ++++
 .../gpu_src/weights_m8prime_opt4_builtin.cl   |  33 ++++
 10 files changed, 874 insertions(+), 31 deletions(-)
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl
 create mode 100644 HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl

diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py
index f3c55e6af..f0c9f39c3 100644
--- a/HySoP/hysop/particular_solvers/gpu.py
+++ b/HySoP/hysop/particular_solvers/gpu.py
@@ -20,7 +20,8 @@ class GPUParticleSolver(ParticleSolver):
     """
     GPU Particular solver description.
 
-    Link with differents numericals methods used. Prepare GPU side (memory, kernels, ...)
+    Link with differents numericals methods used.
+    Prepare GPU side (memory, kernels, ...)
     """
     def __init__(self, problem, t_end, dt,
                  ODESolver=None,
@@ -68,7 +69,8 @@ class GPUParticleSolver(ParticleSolver):
         #Get device.
         try:
             ## OpenCL device
-            self.device = self.platform.get_devices(eval("cl.device_type." + device_type.upper()))[device_id]
+            self.device = self.platform.get_devices(
+                eval("cl.device_type." + device_type.upper()))[device_id]
         except cl.RuntimeError as e:
             print "RuntimeError:", e
             self.device = cl.create_some_context().devices[0]
@@ -76,17 +78,22 @@ class GPUParticleSolver(ParticleSolver):
             print "AttributeError:", e
             self.device = cl.create_some_context().devices[0]
         print "  Device"
-        print "  - Name                :", self.device.name
-        print "  - Type                :", cl.device_type.to_string(self.device.type)
-        print "  - C Version           :", self.device.opencl_c_version
-        print "  - Global mem size     :", self.device.global_mem_size / (1024 ** 3), "GB"
+        print "  - Name                :",
+        print self.device.name
+        print "  - Type                :",
+        print cl.device_type.to_string(self.device.type)
+        print "  - C Version           :",
+        print self.device.opencl_c_version
+        print "  - Global mem size     :",
+        print self.device.global_mem_size / (1024 ** 3), "GB"
         print "===\n"
         #Creates GPU Context
         ## OpenCL context
         self.ctx = cl.Context([self.device])
         #Create CommandQueue on the GPU Context
         ## OpenCL command queue
-        self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+        self.queue = cl.CommandQueue(
+            self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     def initialize(self):
         """
@@ -129,27 +136,31 @@ class GPUParticleSolver(ParticleSolver):
                       'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
                       'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
                 try:
-                    if eval('(self.device.single_fp_config & cl.device_fp_config.' +
+                    if eval('(self.device.single_fp_config &' +
+                            ' cl.device_fp_config.' +
                             v + ') == cl.device_fp_config.' + v):
                         print v
                 except AttributeError as ae:
                     if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
                         self.correctlyroundeddividesqrt = False
-                    print v, 'is not supported in OpenCL C 1.2.\n   Exception catched : ', ae
+                    print v, 'is not supported in OpenCL C 1.2.\n',
+                    print '   Exception catched : ', ae
         else:
             print "for Double Precision: "
             if self.device.double_fp_config > 0:
-                for v in ['DENORM', 'INF_NAN',
+                for v in ['DENORM', 'INF_NAN', 'FMA',
                           'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
-                          'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
+                          'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
                     try:
-                        if eval('(self.device.double_fp_config & cl.device_fp_config.' +
+                        if eval('(self.device.double_fp_config &' +
+                                ' cl.device_fp_config.' +
                                 v + ') == cl.device_fp_config.' + v):
                             print v
                     except AttributeError as ae:
                         if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
                             self.correctlyroundeddividesqrt = False
-                        print  v, 'is supported in OpenCL C 1.2.\n   Exception catched : ', ae
+                        print  v, 'is supported in OpenCL C 1.2.\n',
+                        print '   Exception catched : ', ae
             else:
                 raise ValueError("Double Precision is not supported by device")
         ## Building options and kernel settings
@@ -167,20 +178,36 @@ class GPUParticleSolver(ParticleSolver):
         if self.advection.include_remesh:
             if len(resolution) == 3:
                 if self.gpu_precision is np.float32:
-                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl'
+                    if self.RemeshingMethod is 'm6prime':
+                        src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl'
+                    else:
+                        src_remesh_weights = 'weights_m8prime_opt4_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl'
                 else:
-                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl'
+                    if self.RemeshingMethod is 'm6prime':
+                        src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl'
+                    else:
+                        src_remesh_weights = 'weights_m8prime_opt4_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl'
                 advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
                 advec_and_remesh_lwi = (int(workItemNumber), 1, 1)
             else:
                 if self.gpu_precision is np.float32:
-                    src_remesh_weights = 'weights_m6prime_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl'
+                    if self.RemeshingMethod is 'm6prime':
+                        src_remesh_weights = 'weights_m6prime_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl'
+                    else:
+                        src_remesh_weights = 'weights_m8prime_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl'
                 else:
-                    src_remesh_weights = 'weights_m6prime_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl'
+                    if self.RemeshingMethod is 'm6prime':
+                        src_remesh_weights = 'weights_m6prime_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl'
+                    else:
+                        src_remesh_weights = 'weights_m8prime_builtin.cl'
+                        src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl'
                 advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1]))
                 advec_and_remesh_lwi = (int(workItemNumber), 1)
             f = open(os.path.join(GPU_SRC, src_remesh_weights))
@@ -212,13 +239,21 @@ class GPUParticleSolver(ParticleSolver):
             f.close()
             ## remeshing
             if len(resolution) == 3:
-                src_remesh = 'remeshing_3D_opt4_private4.cl'
-                src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                if self.RemeshingMethod is 'm6prime':
+                    src_remesh = 'remeshing_3D_opt4_private4.cl'
+                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                else:
+                    src_remesh = 'remeshing_m8prime_3D_opt4_private4.cl'
+                    src_remesh_weights = 'weights_m8prime_opt4_builtin.cl'
                 remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
                 remesh_lwi = (int(workItemNumber), 1, 1)
             else:
-                src_remesh = 'remeshing_2D_opt4_noBC.cl'
-                src_remesh_weights = 'weights_m6prime_builtin.cl'
+                if self.RemeshingMethod is 'm6prime':
+                    src_remesh = 'remeshing_2D_opt4_noBC.cl'
+                    src_remesh_weights = 'weights_m6prime_builtin.cl'
+                else:
+                    src_remesh = 'remeshing_m8prime_2D_opt4_noBC.cl'
+                    src_remesh_weights = 'weights_m8prime_builtin.cl'
                 remesh_gwi = (int(workItemNumber), int(resolution[1]))
                 remesh_lwi = (int(workItemNumber), 1)
             f = open(os.path.join(GPU_SRC, src_remesh_weights))
@@ -417,14 +452,14 @@ class GPUParticleSolver(ParticleSolver):
                     sop.discreteOperator.gpu_precision = self.gpu_precision
                     if sop.discreteOperator.name == 'advection':
                         sop.setMethod(KernelLauncher(self.prg.advection,
-                                                 self.queue,
-                                                 advec_gwi,
-                                                 advec_lwi))
+                                                     self.queue,
+                                                     advec_gwi,
+                                                     advec_lwi))
                     elif sop.discreteOperator.name == 'remeshing':
                         sop.setMethod(KernelLauncher(self.prg.remeshing,
-                                                 self.queue,
-                                                 remesh_gwi,
-                                                 remesh_lwi))
+                                                     self.queue,
+                                                     remesh_gwi,
+                                                     remesh_lwi))
                     elif sop.discreteOperator.name == 'advection_and_remeshing':
                         sop.setMethod(KernelLauncher(self.prg.advection_and_remeshing,
                                                  self.queue,
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl
new file mode 100644
index 000000000..0a52ced9c
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl
@@ -0,0 +1,107 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,  float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      v = vload4((i+gidY*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl
new file mode 100644
index 000000000..043ba841f
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl
@@ -0,0 +1,177 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int8 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float8 p, s, y, gs, v, vp, coord;
+  uint8 index8;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*8; i<WIDTH; i+=(WGN*8))
+    {
+      v = vload8((i+gidY*WIDTH)/8, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.s0;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.s1;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.s2;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.s3;
+      gvelo_loc[noBC_id(i+4,nb_part)] = v.s4;
+      gvelo_loc[noBC_id(i+5,nb_part)] = v.s5;
+      gvelo_loc[noBC_id(i+6,nb_part)] = v.s6;
+      gvelo_loc[noBC_id(i+7,nb_part)] = v.s7;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+      gscal_loc[i+4] = 0.0;
+      gscal_loc[i+5] = 0.0;
+      gscal_loc[i+6] = 0.0;
+      gscal_loc[i+7] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8)
+    {
+      v = (float8)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)],
+                   gvelo_loc[noBC_id(i+4,nb_part)],
+                   gvelo_loc[noBC_id(i+5,nb_part)],
+                   gvelo_loc[noBC_id(i+6,nb_part)],
+                   gvelo_loc[noBC_id(i+7,nb_part)]);
+      coord = (float8)(i*dx,
+		       (i+1)*dx,
+		       (i+2)*dx,
+		       (i+3)*dx,
+		       (i+4)*dx,
+		       (i+5)*dx,
+		       (i+6)*dx,
+		       (i+7)*dx);
+      p = fma((float8)(0.5*dt),v,coord);
+      ind = convert_int8_rtn(p * invdx);
+      y = (fma(- convert_float8(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s1,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s2,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s3,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s4,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s5,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s6,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s7,nb_part)]);
+      vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s1,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s2,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s3,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s4,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s5,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s6,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]);
+      p = fma(mix(v,vp,y),dt,coord);
+
+      s = vload8((i + gidY*WIDTH)/8, pscal);
+      ind = convert_int8_rtn(p * invdx);
+      y = (p - convert_float8(ind) * dx) * invdx;
+      index8 = convert_uint8((ind - 3 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (eta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (eta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (eta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (eta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (eta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (eta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (eta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (eta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (zeta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (zeta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (zeta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (zeta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (zeta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (zeta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (zeta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (zeta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (theta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (theta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (theta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (theta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (theta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (theta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (theta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (theta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (iota(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (iota(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (iota(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (iota(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (iota(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (iota(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (iota(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (iota(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*8; i<WIDTH; i+=(WGN*8))
+    vstore8((float8)(gscal_loc[noBC_id(i,nb_part)],
+                     gscal_loc[noBC_id(i+1,nb_part)],
+                     gscal_loc[noBC_id(i+2,nb_part)],
+                     gscal_loc[noBC_id(i+3,nb_part)],
+                     gscal_loc[noBC_id(i+4,nb_part)],
+                     gscal_loc[noBC_id(i+5,nb_part)],
+                     gscal_loc[noBC_id(i+6,nb_part)],
+                     gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl
new file mode 100644
index 000000000..92b508820
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl
@@ -0,0 +1,115 @@
+
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[i] = v.x;
+      gvelo_loc[i+1] = v.y;
+      gvelo_loc[i+2] = v.z;
+      gvelo_loc[i+3] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = vload4(i/4, gvelo_loc);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y],
+                   gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]);
+      vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y],
+                    gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (beta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (theta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (iota(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl
new file mode 100644
index 000000000..1557aaffc
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl
@@ -0,0 +1,115 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i, nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1, nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2, nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3, nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (gamma(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (zeta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (theta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (iota(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl
new file mode 100644
index 000000000..69fddc7f9
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl
@@ -0,0 +1,102 @@
+/**
+ * Local memory as a 2D array to avoid bank conflicts.
+ *
+ * @param id Index 1D
+ * @param nb_part Particle number per work-item
+ *
+ * @return Index in a 2D space.
+ */
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+
+/**
+  * 2D Remeshing kernel.
+  *
+  * @param ppos Particle position
+  * @param pscal Particle scalar
+  * @param gscal Grid scalar
+  * @param min_position Domain lower point
+  * @param dx Space step
+  */
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); // i-3
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i-2
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i-1
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; //i+1
+      gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+2
+      gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+3
+      gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+4
+      gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl
new file mode 100644
index 000000000..6af20d005
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl
@@ -0,0 +1,95 @@
+
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[index4.x]+= gs.x;
+      gscal_loc[index4.y]+= gs.y;
+      gscal_loc[index4.z]+= gs.z;
+      gscal_loc[index4.w]+= gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (theta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (iota(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[i],
+                     gscal_loc[i+1],
+                     gscal_loc[i+2],
+                     gscal_loc[i+3]),
+            (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl
new file mode 100644
index 000000000..84c4032b0
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl
@@ -0,0 +1,31 @@
+/**
+ * M8prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ */
+inline float alpha(float y, float s){
+  return ((y*(y*(y*(y*(y*(y*(-10.0*y + 21.0) + 28.0) - 105.0) + 70.0) + 35.0) - 56.0) + 17.0) / 3360.0) * s;}
+
+inline float beta(float y, float s){
+  return ((y*(y*(y*(y*(y*(y*(70.0*y - 175.0) - 140.0) + 770.0) - 560.0) - 350.0) + 504.0) - 102.0) / 3360.0)*s;}
+
+inline float gamma(float y, float s){
+  return ((y*(y*(y*(y*(y*(y*(-210.0*y + 609.0) + 224.0) - 2135.0) + 910.0) + 2765.0) - 2520.0) + 255.0) / 3360.0)*s;}
+
+inline float delta(float y, float s){
+  return ((y*y*(y*y*(y*y*(70.0*y - 231.0) + 588.0) - 980.0) + 604.0) / 672.0)*s;}
+
+inline float eta(float y, float s){
+  return ((y*(y*(y*(y*(y*(y*(-70.0*y + 259.0) - 84.0) - 427.0) - 182.0) + 553.0) + 504.0) + 51.0) / 672.0)*s;}
+
+inline float zeta(float y, float s){
+  return ((y*(y*(y*(y*(y*(y*(210.0*y - 861.0) + 532.0) + 770.0) + 560.0) - 350.0) - 504.0) - 102.0) / 3360.0)*s;}
+
+inline float theta(float y, float s){
+  return ((y*(y*(y*(y*(y*(y*(-70.0*y + 315.0) - 280.0) - 105.0) - 70.0) + 35.0) + 56.0) + 17.0) / 3360.0)*s;}
+
+inline float iota(float y, float s){
+  return ((y*y*y*y*y*(y*(10.*y - 49.) + 56.))/3360.)*s;}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl
new file mode 100644
index 000000000..94d393d33
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl
@@ -0,0 +1,33 @@
+/**
+ * M8prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ *
+ * Use OpenCL fma builtin function.
+ */
+inline float alpha(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;}
+
+inline float beta(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;}
+
+inline float gamma(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;}
+
+inline float delta(float y, float s){
+  return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;}
+
+inline float eta(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;}
+
+inline float zeta(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;}
+
+inline float theta(float y, float s){
+  return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;}
+
+inline float iota(float y, float s){
+  return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl
new file mode 100644
index 000000000..90f28e492
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl
@@ -0,0 +1,33 @@
+/**
+ * M8prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ *
+ * Use OpenCL fma builtin function, computed by float4.
+ */
+inline float4 alpha(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;}
+
+inline float4 beta(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;}
+
+inline float4 gamma(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;}
+
+inline float4 delta(float4 y, float4 s){
+  return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;}
+
+inline float4 eta(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;}
+
+inline float4 zeta(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;}
+
+inline float4 theta(float4 y, float4 s){
+  return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;}
+
+inline float4 iota(float4 y, float4 s){
+  return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;}
-- 
GitLab