diff --git a/HySoP/hysop/operator/advec_and_remesh_d.py b/HySoP/hysop/operator/advec_and_remesh_d.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e927542794cf9c719ab805573db8a9b2964d78
--- /dev/null
+++ b/HySoP/hysop/operator/advec_and_remesh_d.py
@@ -0,0 +1,128 @@
+"""
+@package parmepy.operator.transport_d
+
+Discrete transport representation
+"""
+from ..constants import *
+from .discrete import DiscreteOperator
+import pyopencl as cl
+import time
+
+
+class Advec_and_Remesh_d(DiscreteOperator):
+    """
+    Particle advection and remesh operator representation.
+
+    """
+
+    def __init__(self, advec, idVelocityD=0, idScalarD=0, particle_scalar=None, method=None):
+        """
+        Create a Advection and Remeshing operator.
+        Work on a given scalar at a given velocity to produce scalar distribution at new positions.
+
+        @param advec : Transport operator
+        @param idVelocityD : Index of velocity discretisation to use.
+        @param idScalarD : Index of scalar discretisation to use.
+        @param result_scalar : result scalar.
+        @param method : the method to use.
+        """
+        DiscreteOperator.__init__(self)
+        ## Velocity.
+        self.velocity = advec.velocity.discreteField[idVelocityD]
+        ## Transported scalar.
+        self.scalar = advec.scalar.discreteField[idScalarD]
+        ## Result scalar
+        self.particle_scalar = particle_scalar.discreteField[0]
+        self.input = [self.velocity, self.scalar]
+        self.output = [self.particle_scalar]
+        self.method = method
+        ## Previous splitting direction
+        self.old_splitting_direction = None
+        ## Compute time detailed per directions
+        self.compute_time = [0., 0., 0.]
+        ## Compute time for copy detailed per directions
+        self.compute_time_copy = [0., 0., 0.]
+        ## Compute time for transposition detailed per directions
+        self.compute_time_swap = [0., 0., 0.]
+        self.name = "advection_and_remeshing"
+        self.call_number = [0, 0, 0]
+        self.gpu_precision = PARMES_REAL_GPU
+
+    def apply(self, t, dt, splittingDirection):
+        """
+        Apply advection and remeshing operator.
+
+        @param t : current time.
+        @param dt : time step.
+        @param splittingDirection : Direction of splitting.
+
+        Advection algorithm:
+        @li 1. Particle initialization : \n
+                 - by copy scalar from grid to particles if previous splitting direction equals current splitting direction.\n
+                 - by transposition of scalar from grid to particle.
+        @li 2. Particle advection and remesh :\n
+                 - compute particle position in splitting direction as a scalar. Performs a RK2 resolution of dx_p/dt = a_p.
+                 - remesh particle scalar
+        @li 3. Profile timings of OpenCL kernels.
+        """
+        c_time, c_time_init = 0., 0.
+        self.call_number[splittingDirection] += 1
+        if self.numMethod is not None and self.init_transpose is not None and self.init_copy is not None:
+            # Particle init
+            if (self.old_splitting_direction == splittingDirection) or self.old_splitting_direction is None:
+                evt_init = self.init_copy.launch(self.scalar.gpu_data,
+                                                 self.particle_scalar.gpu_data)
+            else:
+                if self.scalar.domain.dimension == 2:
+                    evt_init = self.init_transpose.launch(self.scalar.gpu_data,
+                                                          self.particle_scalar.gpu_data)
+                else:
+                    if min(self.old_splitting_direction, splittingDirection) == 0:
+                        evt_init = self.init_transpose.launch(0,
+                                                              self.scalar.gpu_data,
+                                                              self.particle_scalar.gpu_data)
+                    else:
+                        evt_init = self.init_transpose.launch(1,
+                                                              self.scalar.gpu_data,
+                                                              self.particle_scalar.gpu_data)
+            # Advection and Remeshing
+            evt = self.numMethod.launch(self.velocity.gpu_data[splittingDirection],
+                                        self.particle_scalar.gpu_data,
+                                        self.scalar.gpu_data,
+                                        self.gpu_precision(dt),
+                                        self.gpu_precision(self.scalar.domain.origin[splittingDirection]),
+                                        self.gpu_precision(self.scalar.domain.step[splittingDirection]))
+            for df in self.output:
+                df.contains_data = False
+            # Get timpings from OpenCL events
+            self.numMethod.finish()
+            c_time_init = (evt_init.profile.end - evt_init.profile.start) * 1e-9
+            c_time = (evt.profile.end - evt.profile.start) * 1e-9
+            self.compute_time[splittingDirection] += c_time
+            if (self.old_splitting_direction == splittingDirection) or self.old_splitting_direction is None:
+                self.compute_time_copy[splittingDirection] += c_time_init
+            else:
+                self.compute_time_swap[min(self.old_splitting_direction, splittingDirection)] += c_time_init
+            self.total_time += (c_time + c_time_init)
+            self.old_splitting_direction = splittingDirection
+        return (c_time + c_time_init)
+
+    def printComputeTime(self):
+        self.timings_info[0] = "\"Advection_and_Remesh total\" \"copy\" \"swap xy\" \"swap xz\""
+        self.timings_info[0] += " \"Advection_and_Remesh x\" \"Advection_and_Remesh y\" \"Advection_and_Remesh z\" "
+        self.timings_info[1] = str(self.total_time) + "  " + str(self.compute_time_copy[0]) + "  "
+        self.timings_info[1] += str(self.compute_time_swap[0]) + "  " + str(self.compute_time_swap[1]) + "  "
+        self.timings_info[1] += str(self.compute_time[0]) + "  " + str(self.compute_time[1]) + "  " + str(self.compute_time[2]) + "  "
+        print "Advection_and_Remesh total time : ", self.total_time, self.call_number
+        print "\t Advection_and_Remesh init copy :", self.compute_time_copy, self.init_copy.call_number
+        print "\t Advection_and_Remesh init swap :", self.compute_time_swap, self.init_transpose.call_number
+        print "\t Advection_and_Remesh :", self.compute_time, self.numMethod.call_number
+
+    def __str__(self):
+        s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self)
+        return s
+
+if __name__ == "__main__":
+    print __doc__
+    print "- Provided class : Transport_d"
+    print AdvectionDOp.__doc__
diff --git a/HySoP/hysop/operator/transport.py b/HySoP/hysop/operator/transport.py
index 4879d1799ccf49f4f2395705ab8abf287c25b075..b130522ccd68df9fc098920dcc8df3d4a663337d 100644
--- a/HySoP/hysop/operator/transport.py
+++ b/HySoP/hysop/operator/transport.py
@@ -5,6 +5,7 @@ Transport operator representation.
 """
 from .continuous import ContinuousOperator
 from .transport_d import Transport_d
+from .advec_and_remesh_d import Advec_and_Remesh_d
 
 
 class Transport(ContinuousOperator):
@@ -12,7 +13,7 @@ class Transport(ContinuousOperator):
     Transport operator representation.
     """
 
-    def __init__(self, velocity, scalar):
+    def __init__(self, velocity, scalar, include_remesh=False):
         """
         Create an Transport operator from given variables velocity and scalar.
 
@@ -26,6 +27,7 @@ class Transport(ContinuousOperator):
         self.scalar = scalar
         self.needSplitting = True
         self.addVariable([velocity, scalar])
+        self.include_remesh = include_remesh
 
     def discretize(self, idVelocityD=0, idScalarD=0, result_position=None, result_scalar=None, method=None):
         """
@@ -39,7 +41,10 @@ class Transport(ContinuousOperator):
         @param method : the method to use.
         """
         if self.discreteOperator is None:
-            self.discreteOperator = Transport_d(self, idVelocityD, idScalarD, result_position, result_scalar, method)
+            if self.include_remesh:
+                self.discreteOperator = Advec_and_Remesh_d(self, idVelocityD, idScalarD, result_scalar, method)
+            else:
+                self.discreteOperator = Transport_d(self, idVelocityD, idScalarD, result_position, result_scalar, method)
 
     def __str__(self):
         """ToString method"""
diff --git a/HySoP/hysop/particular_solvers/basic.py b/HySoP/hysop/particular_solvers/basic.py
index 8c5544b664219b5c0f0b20493ee19e58de801b78..32c87d746c822be0d37f1c84dd225b7dbd021da8 100644
--- a/HySoP/hysop/particular_solvers/basic.py
+++ b/HySoP/hysop/particular_solvers/basic.py
@@ -132,7 +132,7 @@ class ParticleSolver(Solver):
             #    op.discretize(result_position=self.p_position, result_scalar=self.p_scalar, method=self.ODESolver)
             else:
                 op.discretize()
-        if self.advection is not None:
+        if self.advection is not None and not self.advection.include_remesh:
             ## Create remeshing operator
             self.remeshing = Remeshing(partPositions=self.advection.discreteOperator.res_position,
                                     partScalar=self.advection.discreteOperator.res_scalar,
@@ -140,13 +140,16 @@ class ParticleSolver(Solver):
                                     method=self.RemeshingMethod)
         for op in self.problem.operators:
             if op.needSplitting:
-                if op is self.advection:
-                    index = self.problem.operators.index(op)
+                index = self.problem.operators.index(op)
+                if op is self.advection and not self.advection.include_remesh:
                     self.problem.operators[index] = Splitting([op, self.remeshing], dim=self.problem.topology.dim)
                 else:
                     self.problem.operators[index] = Splitting([op], dim=self.problem.topology.dim)
         self.isInitialized = True
 
+    def end(self):
+        """@TODO : Put here memory free instrutcions"""
+
     def __str__(self):
         """ToString method"""
         s = " Particular solver "
diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py
index cbd48a6bb8dfaa284c57da6694a438b256a6e736..f3c55e6afd46cec922d7dd9b18da4e961a7489d4 100644
--- a/HySoP/hysop/particular_solvers/gpu.py
+++ b/HySoP/hysop/particular_solvers/gpu.py
@@ -164,43 +164,71 @@ class GPUParticleSolver(ParticleSolver):
         ## Collect OpenCL source code
         print "\n Source code:"
         ## advection
-        if len(resolution) == 3:
-            if self.gpu_precision is np.float32:
-                src_advec = 'advection_3D_opt4_builtin.cl'
+        if self.advection.include_remesh:
+            if len(resolution) == 3:
+                if self.gpu_precision is np.float32:
+                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                    src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl'
+                else:
+                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                    src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl'
+                advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
+                advec_and_remesh_lwi = (int(workItemNumber), 1, 1)
             else:
-                src_advec = 'advection_3D_opt2_builtin.cl'
-            advec_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
-            advec_lwi = (int(workItemNumber), 1, 1)
+                if self.gpu_precision is np.float32:
+                    src_remesh_weights = 'weights_m6prime_builtin.cl'
+                    src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl'
+                else:
+                    src_remesh_weights = 'weights_m6prime_builtin.cl'
+                    src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl'
+                advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1]))
+                advec_and_remesh_lwi = (int(workItemNumber), 1)
+            f = open(os.path.join(GPU_SRC, src_remesh_weights))
+            self.gpu_src += "".join(f.readlines())
+            print  "   - remeshing weights:", f.name
+            f.close()
+            f = open(os.path.join(GPU_SRC, src_advec_and_remesh))
+            print  "   - advection and remesh:", f.name
+            self.gpu_src += "".join(f.readlines())
+            f.close()
         else:
-            if self.gpu_precision is np.float32:
-                src_advec = 'advection_2D_opt4_builtin.cl'
+            if len(resolution) == 3:
+                if self.gpu_precision is np.float32:
+                    src_advec = 'advection_3D_opt4_builtin.cl'
+                else:
+                    src_advec = 'advection_3D_opt2_builtin.cl'
+                advec_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
+                advec_lwi = (int(workItemNumber), 1, 1)
             else:
-                src_advec = 'advection_2D_builtin.cl'
-            advec_gwi = (int(workItemNumber), int(resolution[1]))
-            advec_lwi = (int(workItemNumber), 1)
-        f = open(os.path.join(GPU_SRC, src_advec))
-        print  "   - advection:", f.name
-        self.gpu_src += "".join(f.readlines())
-        f.close()
-        ## remeshing
-        if len(resolution) == 3:
-            src_remesh = 'remeshing_3D_opt4_private4.cl'
-            src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
-            remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
-            remesh_lwi = (int(workItemNumber), 1, 1)
-        else:
-            src_remesh = 'remeshing_2D_opt4_noBC.cl'
-            src_remesh_weights = 'weights_m6prime_builtin.cl'
-            remesh_gwi = (int(workItemNumber), int(resolution[1]))
-            remesh_lwi = (int(workItemNumber), 1)
-        f = open(os.path.join(GPU_SRC, src_remesh_weights))
-        self.gpu_src += "".join(f.readlines())
-        print  "   - remeshing weights:", f.name
-        f.close()
-        f = open(os.path.join(GPU_SRC, src_remesh))
-        print  "   - remeshing:", f.name
-        self.gpu_src += "".join(f.readlines())
-        f.close()
+                if self.gpu_precision is np.float32:
+                    src_advec = 'advection_2D_opt4_builtin.cl'
+                else:
+                    src_advec = 'advection_2D_builtin.cl'
+                advec_gwi = (int(workItemNumber), int(resolution[1]))
+                advec_lwi = (int(workItemNumber), 1)
+            f = open(os.path.join(GPU_SRC, src_advec))
+            print  "   - advection:", f.name
+            self.gpu_src += "".join(f.readlines())
+            f.close()
+            ## remeshing
+            if len(resolution) == 3:
+                src_remesh = 'remeshing_3D_opt4_private4.cl'
+                src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+                remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
+                remesh_lwi = (int(workItemNumber), 1, 1)
+            else:
+                src_remesh = 'remeshing_2D_opt4_noBC.cl'
+                src_remesh_weights = 'weights_m6prime_builtin.cl'
+                remesh_gwi = (int(workItemNumber), int(resolution[1]))
+                remesh_lwi = (int(workItemNumber), 1)
+            f = open(os.path.join(GPU_SRC, src_remesh_weights))
+            self.gpu_src += "".join(f.readlines())
+            print  "   - remeshing weights:", f.name
+            f.close()
+            f = open(os.path.join(GPU_SRC, src_remesh))
+            print  "   - remeshing:", f.name
+            self.gpu_src += "".join(f.readlines())
+            f.close()
         ## copy
         if len(resolution) == 3:
             if self.gpu_precision is np.float32:
@@ -210,20 +238,20 @@ class GPUParticleSolver(ParticleSolver):
                 copy_lwi = (4, 8, 1)
             else:
                 src_copy = 'copy_3D_locMem.cl'
-                build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=4"
-                copy_gwi = (int(resolution[0]), int(resolution[1] / 8), int(resolution[2]))
-                copy_lwi = (32, 4, 1)
+                build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=8"
+                copy_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2]))
+                copy_lwi = (32, 8, 1)
         else:
             if self.gpu_precision is np.float32:
                 src_copy = 'copy_2D_opt2.cl'
-                build_options += " -D TILE_DIM_COPY=128 -D BLOCK_ROWS_COPY=4"
-                copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 32))
-                copy_lwi = (64, 4)
+                build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8"
+                copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 2))
+                copy_lwi = (8, 8)
             else:
-                src_copy = 'copy_2D.cl'
+                src_copy = 'copy_2D_opt2.cl'
                 build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2"
-                copy_gwi = (int(resolution[0]), int(resolution[1] / 16))
-                copy_lwi = (32, 2)
+                copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 16))
+                copy_lwi = (16, 2)
         f = open(os.path.join(GPU_SRC, src_copy))
         print  "   - copy:", f.name
         self.gpu_src += "".join(f.readlines())
@@ -232,9 +260,9 @@ class GPUParticleSolver(ParticleSolver):
         if len(resolution) == 3:
             if self.gpu_precision is np.float32:
                 src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl'
-                build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
-                transpose_xy_gwi = (int(resolution[0] / 2), int(resolution[1] / 4), int(resolution[2]))
-                transpose_xy_lwi = (16, 8, 1)
+                build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8"
+                transpose_xy_gwi = (int(resolution[0] / 2), int(resolution[1] / 2), int(resolution[2]))
+                transpose_xy_lwi = (8, 8, 1)
                 src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl'
                 build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
                 transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4))
@@ -249,10 +277,16 @@ class GPUParticleSolver(ParticleSolver):
                 transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4))
                 transpose_xz_lwi = (8, 2, 2)
         else:
-            src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-            build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
-            transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 16)
-            transpose_xy_lwi = (8, 2)
+            if self.gpu_precision is np.float32:
+                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
+                build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
+                transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 4)
+                transpose_xy_lwi = (8, 8)
+            else:
+                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
+                build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
+                transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 16)
+                transpose_xy_lwi = (8, 2)
         f = open(os.path.join(GPU_SRC, src_transpose_xy))
         print  "   - transpose_xy:", f.name
         self.gpu_src += "".join(f.readlines())
@@ -391,6 +425,11 @@ class GPUParticleSolver(ParticleSolver):
                                                  self.queue,
                                                  remesh_gwi,
                                                  remesh_lwi))
+                    elif sop.discreteOperator.name == 'advection_and_remeshing':
+                        sop.setMethod(KernelLauncher(self.prg.advection_and_remeshing,
+                                                 self.queue,
+                                                 advec_and_remesh_gwi,
+                                                 advec_and_remesh_lwi))
                     else:
                         raise ValueError("Unknown operator : " + sop.discreteOperator.name)
             else:
@@ -420,6 +459,21 @@ class GPUParticleSolver(ParticleSolver):
         self.problem.io.set_get_data_method(self.get_data_from_device)
         print "===\n"
 
+    def end(self):
+        print "=== OpenCL Buffer deallocations ==="
+        for f in self.problem.variables:
+            print f.name, "deallocate "
+            temp_mem_used = 0
+            for df in f.discreteField:
+                if f.vector:
+                    for d in xrange(len(df.data)):
+                        if isinstance(df.gpu_data[d], cl.MemoryObject):
+                            df.gpu_data[d].release()
+                else:
+                    if isinstance(df.gpu_data, cl.MemoryObject):
+                        df.gpu_data.release()
+        print "===\n"
+
     def get_data_from_device(self, field):
         """Data collect method."""
         deviceToHost(self.queue, field)
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt4_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt4_builtin_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..2eaec614a26d971b549d417d48d85a2ce4f5dd80
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt4_builtin_noBC.cl
@@ -0,0 +1,95 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,  float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      v = vload4((i+gidY*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt8_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt8_builtin_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..289b9d4a6a0c4fc9bd2f84c09e96d294385485ed
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt8_builtin_noBC.cl
@@ -0,0 +1,157 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int8 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float8 p, s, y, gs, v, vp, coord;
+  uint8 index8;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*8; i<WIDTH; i+=(WGN*8))
+    {
+      v = vload8((i+gidY*WIDTH)/8, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.s0;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.s1;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.s2;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.s3;
+      gvelo_loc[noBC_id(i+4,nb_part)] = v.s4;
+      gvelo_loc[noBC_id(i+5,nb_part)] = v.s5;
+      gvelo_loc[noBC_id(i+6,nb_part)] = v.s6;
+      gvelo_loc[noBC_id(i+7,nb_part)] = v.s7;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+      gscal_loc[i+4] = 0.0;
+      gscal_loc[i+5] = 0.0;
+      gscal_loc[i+6] = 0.0;
+      gscal_loc[i+7] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8)
+    {
+      v = (float8)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)],
+                   gvelo_loc[noBC_id(i+4,nb_part)],
+                   gvelo_loc[noBC_id(i+5,nb_part)],
+                   gvelo_loc[noBC_id(i+6,nb_part)],
+                   gvelo_loc[noBC_id(i+7,nb_part)]);
+      coord = (float8)(i*dx,
+		       (i+1)*dx,
+		       (i+2)*dx,
+		       (i+3)*dx,
+		       (i+4)*dx,
+		       (i+5)*dx,
+		       (i+6)*dx,
+		       (i+7)*dx);
+      p = fma((float8)(0.5*dt),v,coord);
+      ind = convert_int8_rtn(p * invdx);
+      y = (fma(- convert_float8(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s1,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s2,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s3,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s4,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s5,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s6,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s7,nb_part)]);
+      vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s1,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s2,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s3,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s4,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s5,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s6,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]);
+      p = fma(mix(v,vp,y),dt,coord);
+
+      s = vload8((i + gidY*WIDTH)/8, pscal);
+      ind = convert_int8_rtn(p * invdx);
+      y = (p - convert_float8(ind) * dx) * invdx;
+      index8 = convert_uint8((ind - 2 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (eta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (eta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (eta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (eta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (eta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (eta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (eta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (eta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (zeta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (zeta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (zeta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (zeta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (zeta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (zeta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (zeta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (zeta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*8; i<WIDTH; i+=(WGN*8))
+    vstore8((float8)(gscal_loc[noBC_id(i,nb_part)],
+                     gscal_loc[noBC_id(i+1,nb_part)],
+                     gscal_loc[noBC_id(i+2,nb_part)],
+                     gscal_loc[noBC_id(i+3,nb_part)],
+                     gscal_loc[noBC_id(i+4,nb_part)],
+                     gscal_loc[noBC_id(i+5,nb_part)],
+                     gscal_loc[noBC_id(i+6,nb_part)],
+                     gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..46e9bc59b2ea793d24cd36c429a41bb0e448ff8e
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4.cl
@@ -0,0 +1,101 @@
+
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[i] = v.x;
+      gvelo_loc[i+1] = v.y;
+      gvelo_loc[i+2] = v.z;
+      gvelo_loc[i+3] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = vload4(i/4, gvelo_loc);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y],
+                   gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]);
+      vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y],
+                    gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (beta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..5d221f67393d24f1f7614f422ef0396ad22d8c20
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl
@@ -0,0 +1,101 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i, nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1, nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2, nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3, nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (gamma(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (zeta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/problem/problem.py b/HySoP/hysop/problem/problem.py
index c78606626bfc4a46c27a2f252e906a89c5d76825..fce59388f3b7326dee9ce3c4511c1e46ce3a6681 100644
--- a/HySoP/hysop/problem/problem.py
+++ b/HySoP/hysop/problem/problem.py
@@ -100,6 +100,7 @@ class Problem():
             self.timer.step()
             self.io.step()
         print "\n\n End solving\n"
+        self.solver.end()
         print "=== Timings ==="
         if ( self.topology.rank == 0 ) :
             print "IO total", self.io.compute_time
diff --git a/HySoP/hysop/tools/kernel_benchmark.py b/HySoP/hysop/tools/kernel_benchmark.py
index a87735bd4156be57a3a617387256d93790a4175d..83af9c94a757fa6b0a7b798451da1924746d7ba2 100644
--- a/HySoP/hysop/tools/kernel_benchmark.py
+++ b/HySoP/hysop/tools/kernel_benchmark.py
@@ -194,8 +194,11 @@ class BenchmarkSuite:
                         f.write(k + " ")
                         f.write(v + " ")
                         f.write(c + " ")
-                        Nx = s[0]
-                        f.write(str(eval(c)) + ' ')
+                        #print c
+                        cs = c.replace('Nx', str(s[0])).replace('x', '*').split('_')
+                        cse = cs[0]+'/'+cs[1][1] if len(cs) == 2 else cs[0]
+                        #print cse
+                        f.write(str(eval(cse)) + ' ')
                         try:
                             f.write(str(self.timings[k][v][c][s]) + "\n")
                         except:
diff --git a/HySoP/setup.py.in b/HySoP/setup.py.in
index 098bcc29d0c516836267366604bc2c05813f0e96..8aa8053048dd4f0429d0de156a8874cc01759d47 100644
--- a/HySoP/setup.py.in
+++ b/HySoP/setup.py.in
@@ -101,7 +101,7 @@ config = Configuration(name=name,
       data_files=[('./parmepy/particular_solvers/gpu_src',
                    ['@CMAKE_SOURCE_DIR@/parmepy/particular_solvers/gpu_src/' + cl_file
                     for cl_file in os.listdir('@CMAKE_SOURCE_DIR@/parmepy/particular_solvers/gpu_src/')
-                    if cl_file[-3:]=='.cl'])]
+                    if cl_file[0]!='.' and cl_file[-3:]=='.cl'])]
 )
 
 setup(**config.todict())