diff --git a/HySoP/hysop/constants.py b/HySoP/hysop/constants.py
index 4161890efb209d4872e71e04ae1ff2b0e06ccece..6ec26194ddabf8a6e357fc599b2d5a10a8b8d792 100644
--- a/HySoP/hysop/constants.py
+++ b/HySoP/hysop/constants.py
@@ -32,4 +32,4 @@ PERIODIC = 0
 ## Directions string
 S_DIR = ["_X", "_Y", "_Z"]
 ## GPU deflault sources
-GPU_SRC = os.path.join(solver_path[0], "gpu_src.cl")
+GPU_SRC = os.path.join(solver_path[0], "gpu_src")
diff --git a/HySoP/hysop/domain/topology.py b/HySoP/hysop/domain/topology.py
index 908110a093d31c5f91966f92b592a0e3fe8145e5..83c589af08d8ad696ab63d7bf2034548c7f5da7c 100644
--- a/HySoP/hysop/domain/topology.py
+++ b/HySoP/hysop/domain/topology.py
@@ -56,8 +56,13 @@ class CartesianTopology(object):
             self.west, self.east = self.topo.Shift(YDIR, 1)
         if(self.dim > 2):
             self.south, self.north = self.topo.Shift(ZDIR, 1)
-        self.tabSort = np.array([[self.up,0] , [self.down,1] , [self.east,2], [self.west,3], 
-            [self.north,4],[self.south,5]])
+        if self.dim == 3:
+            self.tabSort = np.array([[self.up, 0], [self.down, 1],
+                                     [self.east, 2], [self.west, 3],
+                                     [self.north, 4], [self.south, 5]])
+        else:
+            self.tabSort = np.array([[self.up, 0] , [self.down, 1],
+                                     [self.east, 2], [self.west, 3]])
         tabSort1 = self.tabSort[self.tabSort[:,0]<=self.rank]
         tabSort2 = self.tabSort[self.tabSort[:,0]>self.rank]
         if (np.asarray(tabSort1[:,0].shape)>0):
@@ -147,7 +152,7 @@ class LocalMesh(object):
         # Local resolution
         self.resolution = resolution.copy() + 2 * ghosts
         # index of the lower point (in each dir) in the global mesh
-        self.start = start # - ghosts
+        self.start = start  # - ghosts
         # index of the upper point (in each dir), global mesh
         self.end = self.start + self.resolution - 1
         # Mesh step size
@@ -157,12 +162,11 @@ class LocalMesh(object):
         self.origin = coord_start
         coord_end = (self.end +1) * self.size + dom_origin - ghosts* self.size
 #        print 'Print TOPO'
-#        print 'resolution', self.resolution , 'shape', (coord_end-coord_start)/self.size, 'coord_end',coord_end, 'coord_start',coord_start 
+#        print 'resolution', self.resolution , 'shape', (coord_end-coord_start)/self.size, 'coord_end',coord_end, 'coord_start',coord_start
         if self.start.size == 3:
             self.coords = np.ogrid[coord_start[0]:coord_end[0]:self.size[0],
                                    coord_start[1]:coord_end[1]:self.size[1],
                                    coord_start[2]:coord_end[2]:self.size[2]]
-            self.resolution = self.coords[0][:,0,0].shape[0],self.coords[1][0,:,0].shape[0],self.coords[2][0,0,:].shape[0]
         elif self.start.size == 2:
             self.coords = np.ogrid[coord_start[0]:coord_end[0]:self.size[0],
                                    coord_start[1]:coord_end[1]:self.size[1]]
diff --git a/HySoP/hysop/operator/remeshing.py b/HySoP/hysop/operator/remeshing.py
index d8afc7ee28a67d2874edec8ec9805389001a2cdc..53baa23ecd583e15c300b4acca4e75c96d8cc591 100644
--- a/HySoP/hysop/operator/remeshing.py
+++ b/HySoP/hysop/operator/remeshing.py
@@ -37,6 +37,7 @@ class Remeshing(DiscreteOperator):
         ## Compute time detailed per directions
         self.compute_time = [0., 0., 0.]
         self.name = "remeshing"
+        self.call_number = 0
 
     def apply(self, t, dt, splittingDirection):
         """
@@ -52,6 +53,7 @@ class Remeshing(DiscreteOperator):
         @li 2. Profile timings of OpenCL kernels.
         """
         c_time = 0.
+        self.call_number += 1
         if self.numMethod is not None:
             ## Launching kernel
             evt = self.numMethod.launch(self.ppos.gpu_data,
@@ -72,8 +74,8 @@ class Remeshing(DiscreteOperator):
         self.timings_info[0] = "\"Remeshing total\" \"Remeshing x\" \"Remeshing y\" \"Remeshing z\" "
         self.timings_info[1] = str(self.total_time) + " " + str(self.compute_time[0]) + " "
         self.timings_info[1] += str(self.compute_time[1]) + " " + str(self.compute_time[2]) + " "
-        print "Remeshing total time : ", self.total_time
-        print "\t Remeshing: ", self.compute_time
+        print "Remeshing total time : ", self.total_time, self.call_number
+        print "\t Remeshing: ", self.compute_time, self.numMethod.call_number
 
     def __str__(self):
         s = "Remeshing (DiscreteOperator). " + DiscreteOperator.__str__(self)
diff --git a/HySoP/hysop/operator/transport_d.py b/HySoP/hysop/operator/transport_d.py
index 658d94cd7ff4a3b0962e0c9fcf73dcbb04847de6..1b4e6d7e31ecd25c9ce4a77f134352f44c648a11 100644
--- a/HySoP/hysop/operator/transport_d.py
+++ b/HySoP/hysop/operator/transport_d.py
@@ -48,6 +48,7 @@ class Transport_d(DiscreteOperator):
         ## Compute time for transposition detailed per directions
         self.compute_time_swap = [0., 0., 0.]
         self.name = "advection"
+        self.call_number = 0
 
     def apply(self, t, dt, splittingDirection):
         """
@@ -66,6 +67,7 @@ class Transport_d(DiscreteOperator):
         @li 3. Profile timings of OpenCL kernels.
         """
         c_time, c_time_init = 0., 0.
+        self.call_number += 1
         if self.numMethod is not None and self.init_transpose is not None and self.init_copy is not None:
             # Particle init
             if (self.old_splitting_direction == splittingDirection) or self.old_splitting_direction is None:
@@ -110,10 +112,10 @@ class Transport_d(DiscreteOperator):
         self.timings_info[1] = str(self.total_time) + "  " + str(self.compute_time_copy[0]) + "  "
         self.timings_info[1] += str(self.compute_time_swap[0]) + "  " + str(self.compute_time_swap[1]) + "  "
         self.timings_info[1] += str(self.compute_time[0]) + "  " + str(self.compute_time[1]) + "  " + str(self.compute_time[2]) + "  "
-        print "Advection total time : ", self.total_time
-        print "\t Advection init copy :", self.compute_time_copy
-        print "\t Advection init swap :", self.compute_time_swap
-        print "\t Advection :", self.compute_time
+        print "Advection total time : ", self.total_time, self.call_number
+        print "\t Advection init copy :", self.compute_time_copy, self.init_copy.call_number
+        print "\t Advection init swap :", self.compute_time_swap, self.init_transpose.call_number
+        print "\t Advection :", self.compute_time, self.numMethod.call_number
 
     def __str__(self):
         s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self)
diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py
index d47d4ef46c70832b131ca47cec5449e941e6bf84..ff9c3272e84162416f73436c64b4cb5a5bc9233f 100644
--- a/HySoP/hysop/particular_solvers/gpu.py
+++ b/HySoP/hysop/particular_solvers/gpu.py
@@ -93,15 +93,12 @@ class GPUParticleSolver(ParticleSolver):
         Allocate device memory as OpenCL Buffers.\n
         Initialize memory on device. \n
         Link kernels ans parmepy.operators.
+        @todo : Take non square resolutions
         """
         ParticleSolver.initialize(self)
         # kernels compilation
         print "=== Kernel sources compiling ==="
-        print "Sources files (default): ", GPU_SRC
-        f = open(GPU_SRC, 'r')
-        ## OpenCL kernel sources
-        self.gpu_src = "".join(f.readlines())
-        f.close()
+        self.gpu_src = ""
         if self.user_gpu_src is not None:
             if isinstance(self.user_gpu_src, list):
                 for user_src in self.user_gpu_src:
@@ -114,36 +111,171 @@ class GPUParticleSolver(ParticleSolver):
                 f = open(self.user_gpu_src, 'r')
                 self.gpu_src += "".join(f.readlines())
                 f.close()
-        workItemNumber = int(min(64, min(self.problem.domains[0].discreteDomain.resolution)))
-        padding = 32
-        # Enabling optimisations regarding problem size
-        float4_enable = True
-        if min(self.problem.domains[0].discreteDomain.resolution) < 4 * workItemNumber:
-            float4_enable = False
-
-        build_options = "-cl-single-precision-constant -cl-opt-disable"
-        build_options += " -D WIDTH=" + str(np.max(self.problem.topology.mesh.resolution))
+        resolution = self.problem.topology.mesh.resolution
+        ## Optimal work item number
+        if len(resolution) == 3:
+            workItemNumber = 64
+        else:
+            workItemNumber = 256
+        print " Work-Item number: ", workItemNumber
+        ## Precision supported
+        print " Precision capability  ",
+        self.correctlyroundeddividesqrt = True
+        if PARMES_REAL_GPU is np.float32:
+            print "for Single Precision: "
+            for v in ['DENORM', 'INF_NAN',
+                      'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
+                      'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
+                try:
+                    if eval('(self.device.single_fp_config & cl.device_fp_config.' +
+                            v + ') == cl.device_fp_config.' + v):
+                        print v
+                except AttributeError as ae:
+                    if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
+                        self.correctlyroundeddividesqrt = False
+                    print v, 'is not supported in OpenCL C 1.2.\n   Exception catched : ', ae
+        else:
+            print "for Double Precision: "
+            if self.device.double_fp_config > 0:
+                for v in ['DENORM', 'INF_NAN',
+                          'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
+                          'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
+                    try:
+                        if eval('(self.device.double_fp_config & cl.device_fp_config.' +
+                                v + ') == cl.device_fp_config.' + v):
+                            print v
+                    except AttributeError as ae:
+                        if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
+                            self.correctlyroundeddividesqrt = False
+                        print  v, 'is supported in OpenCL C 1.2.\n   Exception catched : ', ae
+            else:
+                raise ValueError("Double Precision is not supported by device")
+        ## Building options and kernel settings
+        build_options = ''
+        if self.correctlyroundeddividesqrt:
+            build_options += " -cl-fp32-correctly-rounded-divide-sqrt "
+        if PARMES_REAL_GPU is np.float32:
+            build_options += " -cl-single-precision-constant"
+        build_options += " -D WIDTH=" + str(resolution[0])
         build_options += " -D WGN=" + str(workItemNumber)
-        build_options += " -D PADDING=" + str(padding)
-        if not float4_enable:
-            build_options += " -D BASIC=1"
+
+        ## Collect OpenCL source code
+        print "\n Source code:"
+        ## advection
+        if len(resolution) == 3:
+            if PARMES_REAL_GPU is np.float32:
+                src_advec = 'advection_3D_opt4_builtin.cl'
+            else:
+                src_advec = 'advection_3D_opt2_builtin.cl'
+            advec_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
+            advec_lwi = (int(workItemNumber), 1, 1)
+        else:
+            if PARMES_REAL_GPU is np.float32:
+                src_advec = 'advection_2D_opt4_builtin.cl'
+            else:
+                src_advec = 'advection_2D_buitin.cl'
+            advec_gwi = (int(workItemNumber), int(resolution[1]))
+            advec_lwi = (int(workItemNumber), 1)
+        f = open(os.path.join(GPU_SRC, src_advec))
+        print  "   - advection:", f.name
+        self.gpu_src += "".join(f.readlines())
+        f.close()
+        ## remeshing
+        if len(resolution) == 3:
+            src_remesh = 'remeshing_3D_opt4_private4.cl'
+            src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
+            remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2]))
+            remesh_lwi = (int(workItemNumber), 1, 1)
+        else:
+            src_remesh = 'remeshing_2D_opt4_noBC.cl'
+            src_remesh_weights = 'weights_m6prime_builtin.cl'
+            remesh_gwi = (int(workItemNumber), int(resolution[1]))
+            remesh_lwi = (int(workItemNumber), 1)
+        f = open(os.path.join(GPU_SRC, src_remesh_weights))
+        self.gpu_src += "".join(f.readlines())
+        print  "   - remeshing weights:", f.name
+        f.close()
+        f = open(os.path.join(GPU_SRC, src_remesh))
+        print  "   - remeshing:", f.name
+        self.gpu_src += "".join(f.readlines())
+        f.close()
+        ## copy
+        if len(resolution) == 3:
+            if PARMES_REAL_GPU is np.float32:
+                src_copy = 'copy_3D_opt4.cl'
+                build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8"
+                copy_gwi = (int(resolution[0] / 4), int(resolution[1] / 2), int(resolution[2]))
+                copy_lwi = (4, 8, 1)
+            else:
+                src_copy = 'copy_3D_locMem.cl'
+                build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=4"
+                copy_gwi = (int(resolution[0]), int(resolution[1] / 8), int(resolution[2]))
+                copy_lwi = (32, 4, 1)
+        else:
+            if PARMES_REAL_GPU is np.float32:
+                src_copy = 'copy_2D_opt2.cl'
+                build_options += " -D TILE_DIM_COPY=128 -D BLOCK_ROWS_COPY=4"
+                copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 32))
+                copy_lwi = (64, 4)
+            else:
+                src_copy = 'copy_2D.cl'
+                build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2"
+                copy_gwi = (int(resolution[0]), int(resolution[1] / 16))
+                copy_lwi = (32, 2)
+        f = open(os.path.join(GPU_SRC, src_copy))
+        print  "   - copy:", f.name
+        self.gpu_src += "".join(f.readlines())
+        f.close()
+        ## transpose
+        if len(resolution) == 3:
+            if PARMES_REAL_GPU is np.float32:
+                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl'
+                build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
+                transpose_xy_gwi = (int(resolution[0] / 2), int(resolution[1] / 4), int(resolution[2]))
+                transpose_xy_lwi = (16, 8, 1)
+                src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl'
+                build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
+                transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4))
+                transpose_xz_lwi = (16, 4, 4)
+            else:
+                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl'
+                build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4"
+                transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1] / 8), int(resolution[2]))
+                transpose_xy_lwi = (8, 4, 1)
+                src_transpose_xz = 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl'
+                build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2"
+                transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4))
+                transpose_xz_lwi = (8, 2, 2)
+        else:
+            src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
+            build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
+            transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 16)
+            transpose_xy_lwi = (8, 2)
+        f = open(os.path.join(GPU_SRC, src_transpose_xy))
+        print  "   - transpose_xy:", f.name
+        self.gpu_src += "".join(f.readlines())
+        f.close()
+        if len(resolution) == 3:
+            f = open(os.path.join(GPU_SRC, src_transpose_xz))
+            self.gpu_src += "".join(f.readlines())
+            f.close()
+
+        ## Build code
+        if PARMES_REAL_GPU is np.float32:
+            prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f'))
+        else:
+            prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double'))
+
         ## OpenCL program
-        self.prg = cl.Program(self.ctx, self.gpu_src).build(build_options)
-        print self.prg.get_build_info(self.device, cl.program_build_info.LOG)
+        self.prg = prg.build(build_options)
+        print "Compiler options : ",
         print self.prg.get_build_info(self.device, cl.program_build_info.OPTIONS)
+        print "Compiler status : ",
         print self.prg.get_build_info(self.device, cl.program_build_info.STATUS)
+        print "Compiler log : ",
+        print self.prg.get_build_info(self.device, cl.program_build_info.LOG)
         print "===\n"
-        # Pad the particle scalar (for transposition efficiency)
-        if len(self.p_scalar.discreteField[0].data.shape) == 2:
-            new_shape = (self.p_scalar.discreteField[0].data.shape[0] + padding,
-                         self.p_scalar.discreteField[0].data.shape[1] + padding)
-            self.p_scalar.discreteField[0].data = np.resize(self.p_scalar.discreteField[0].data, new_shape)
-        else:
-            new_shape = (self.p_scalar.discreteField[0].data.shape[0] + padding,
-                         self.p_scalar.discreteField[0].data.shape[1] + padding,
-                         self.p_scalar.discreteField[0].data.shape[2] + padding)
-            self.p_scalar.discreteField[0].data = np.resize(self.p_scalar.discreteField[0].data, new_shape)
-        # Convert fields to simple precision floats (PARMES_REAL_GPU) and ensure order
+        # Convert fields to recquired precision (PARMES_REAL_GPU) and ensure order
         for f in self.problem.variables:
             for df in f.discreteField:
                 if f.vector:
@@ -184,7 +316,7 @@ class GPUParticleSolver(ParticleSolver):
                     initKernel = cl.Kernel(self.prg, 'init' + f.name)
             if initKernel is not None:
                 for df in f.discreteField:
-                    print f.name, ": kernel init ... ",
+                    print f.name, ": kernel init",
                     global_wg = df.resolution.tolist()
                     if global_wg[0] / workItemNumber > 0:
                         global_wg[0] = workItemNumber
@@ -195,11 +327,13 @@ class GPUParticleSolver(ParticleSolver):
                     else:
                         local_wg = None
                     global_wg = tuple(global_wg)
+                    print global_wg, local_wg,
                     dim = df.topology.dim
                     coord_min = np.ones(4, dtype=PARMES_REAL_GPU)
                     mesh_size = np.ones(4, dtype=PARMES_REAL_GPU)
                     coord_min[0:dim] = df.topology.mesh.origin
                     mesh_size[0:dim] = df.topology.mesh.size
+                    print "...",
                     if f.vector:
                         if dim == 3:
                             evt = initKernel(self.queue,
@@ -244,57 +378,42 @@ class GPUParticleSolver(ParticleSolver):
             if op.discreteOperator.name == "splitting":
                 for sop in op.operators:
                     print sop.discreteOperator.name,
-                    sop.setMethod(KernelLauncher(eval("self.prg." + sop.discreteOperator.name),
+                    if sop.discreteOperator.name == 'advection':
+                        sop.setMethod(KernelLauncher(self.prg.advection,
                                                  self.queue,
-                                                 global_wg,
-                                                 local_wg))
+                                                 advec_gwi,
+                                                 advec_lwi))
+                    elif sop.discreteOperator.name == 'remeshing':
+                        sop.setMethod(KernelLauncher(self.prg.remeshing,
+                                                 self.queue,
+                                                 remesh_gwi,
+                                                 remesh_lwi))
+                    else:
+                        raise ValueError("Unknown operator : " + sop.discreteOperator.name)
             else:
-                print op.discreteOperator.name,
-                op.setMethod(KernelLauncher(eval("self.prg." + op.discreteOperator.name),
-                                             self.queue,
-                                             global_wg,
-                                             local_wg))
+                raise ValueError("Unknown operator : " + op.discreteOperator.name)
         print self.advection.discreteOperator.name,
-        self.advection.discreteOperator.init_copy = KernelLauncher(self.prg.advec_init_copy,
+        self.advection.discreteOperator.init_copy = KernelLauncher(self.prg.copy,
                                                                    self.queue,
-                                                                   global_wg,
-                                                                   local_wg)
-        resolution = self.advection.scalar.discreteField[0].domain.resolution
-        if self.problem.topology.dim == 2:
-            # A work-group perform a (32,2) part of input and (0,16) times in a internal loop
-            # The problem size becomes (nb, nb/16) to let workGroup deal with internal loop
-            # workGroup size (32,2) is rearranged in (64,1)
-            # Problem size (nb, nb/16) is is rearranged in (nb*2,nb/(16*2))
+                                                                   copy_gwi,
+                                                                   copy_lwi)
+        if len(resolution) == 3:
             print self.advection.discreteOperator.name,
-            self.advection.discreteOperator.init_transpose = KernelLauncher(self.prg.advec_init_transpose_2D,
-                                                                            self.queue,
-                                                                            (2 * int(resolution[0]),
-                                                                             int(resolution[1]) / 32),
-                                                                            (64, 1),)
+            k = KernelListLauncher([self.prg.transpose_xy,
+                                    self.prg.transpose_xz],
+                                   self.queue,
+                                   [transpose_xy_gwi,
+                                    transpose_xz_gwi],
+                                   [transpose_xy_lwi,
+                                    transpose_xz_lwi])
+            self.advection.discreteOperator.init_transpose = k
         else:
-            # For swaping axes 0,1 in 3D,
-            # A work-group perform a (32,2,1) part of input and (0,16,0) times in a internal loop
-            # The problem size becomes (nb, nb/16, nb) to let workGroup deal with internal loop
-            # workGroup size (32,2,1) is rearranged in (64,1,1)
-            # Problem size (nb, nb/16, nb) is is rearranged in (nb*2,nb/(16*2),nb)
-            # For swaping axes 0,2 in 3D,
-            # A work-group perform a (32,1,2) part of input and (0,0,16) times in a internal loop
-            # The problem size becomes (nb, nb, nb/16) to let workGroup deal with internal loop
-            # workGroup size (32,1,2) is rearranged in (64,1,1)
-            # Problem size (nb, nb, nb) is is rearranged in (nb*2,nb,nb/(16*2))
             print self.advection.discreteOperator.name,
-            self.advection.discreteOperator.init_transpose = KernelListLauncher([self.prg.advec_init_transpose_3D_01,
-                                                                                 self.prg.advec_init_transpose_3D_02],
-                                                                                self.queue,
-                                                                                [(2 * int(resolution[0]),
-                                                                                  int(resolution[1]) / 32,
-                                                                                  int(resolution[2])),
-                                                                                 (2 * int(resolution[0]),
-                                                                                  int(resolution[1]),
-                                                                                  int(resolution[2]) / 32)
-                                                                                 ],
-                                                                                [(64, 1, 1),
-                                                                                 (64, 1, 1)])
+            k = KernelLauncher(self.prg.transpose_xy,
+                               self.queue,
+                               transpose_xy_gwi,
+                               transpose_xy_lwi)
+            self.advection.discreteOperator.init_transpose = k
         self.problem.io.set_get_data_method(self.get_data_from_device)
         print "===\n"
 
@@ -336,6 +455,7 @@ class KernelListLauncher:
         self.global_size = gsize
         ## OpenCL local size index.
         self.local_size = lsize
+        self.call_number = [0 for k in self.kernel]
 
     def launch(self, d, *args):
         """
@@ -359,7 +479,8 @@ class KernelListLauncher:
 
         Opencl global and local sizes are given in args.
         """
-        #print self.kernel[d].function_name, args[0], args[1]
+        print self.kernel[d].function_name, args[0], args[1]
+        self.call_number[d] += 1
         evt = self.kernel[d](self.queue, *args)
         return evt
 
diff --git a/HySoP/hysop/particular_solvers/gpu_src.cl b/HySoP/hysop/particular_solvers/gpu_src.cl
index e1a8e9c8d94136dc86d4a347fa13b6d8ce217bda..af81deb01f79c1eef55f4f61ce169d70e07cb9c0 100644
--- a/HySoP/hysop/particular_solvers/gpu_src.cl
+++ b/HySoP/hysop/particular_solvers/gpu_src.cl
@@ -1,237 +1,243 @@
-#define GROUP_DIMX      (32)
-#define LOG_GROUP_DIMX  (5)
-#define GROUP_DIMY      (2)
-
-#define TYPE float
-inline TYPE alpha(TYPE y, TYPE s){
+inline float alpha(float y, float s){
   return (y * (y * (y * (y * (13.0f - 5.0f * y) - 9.0f) - 1.0f) + 2.0f) / 24.0f) * s;}
-inline TYPE beta(TYPE y, TYPE s){
+inline float beta(float y, float s){
   return (y * (y * (y * (y * (25.0f * y - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f)*s;}
-inline TYPE gamma(TYPE y, TYPE s){
+inline float gamma(float y, float s){
   return (y * y * (y * (y * (126.0f - 50.0f * y) - 70.0f) - 30.0f) / 24.0f + 1.0f)*s;}
-inline TYPE delta(TYPE y, TYPE s){
+inline float delta(float y, float s){
   return (y * (y * (y * (y * (50.0f * y - 124.0f) + 66.0f) + 16.0f) + 16.0f) / 24.0f)*s;}
-inline TYPE eta(TYPE y, TYPE s){
+inline float eta(float y, float s){
   return (y * (y * (y * (y * (61.0f - 25.0f * y) - 33.0f) - 1.0f) - 2.0f) / 24.0f)*s;}
-inline TYPE zeta(TYPE y, TYPE s){
+inline float zeta(float y, float s){
   return (y * y * y * (y * (5.0f * y - 12.0f) + 7.0f) / 24.0f)*s;}
 
-
-
-__kernel void advec_init_copy(__global const float* gscal,
-			__global float* pscal)
+#if DIM==3
+__kernel void advection_3D_basic(__global const float* gvelo,
+				 __global float* ppos,
+				 float dt, float min_position, float dx)
 {
   uint gidX = get_global_id(0);
   uint gidY = get_global_id(1);
   uint gidZ = get_global_id(2);
+  int ind, i_ind, i_ind_p;
+  float invdx = 1.0f/dx;
   uint i;
-  for(i=gidX; i<WIDTH; i+=WGN)
+  float v, y, p;
+  for(i=gidX; i<WIDTH; i+=NB_WI_ADVECTION)
     {
-      pscal[i+gidY*(WIDTH+PADDING)+gidZ*(WIDTH+PADDING)*(WIDTH+PADDING)] = gscal[i+gidY*WIDTH+gidZ*WIDTH*WIDTH];
+      v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH];
+      p = i*dx + 0.5f*dt*v;
+      ind = convert_int_rtn(p * invdx);
+      y = (p - convert_float(ind) * dx) * invdx;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((ind + 1) % WIDTH);
+      p=(gvelo[i_ind+gidY*WIDTH+gidZ*WIDTH*WIDTH]*(1.0f - y) + gvelo[i_ind_p+gidY*WIDTH+gidZ*WIDTH*WIDTH]*y)*dt + i*dx + min_position;
+      ppos[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = p;
     }
 }
-
-/*
-  Contraintes :
-      - RÃ©sultat avec padding de 32 (blanck conflicts) dans toutes les directions
-      - Domaine de taille multiple de 32
- */
-__kernel void advec_init_transpose_2D(__global const float* input,
-			__global float* output)
+__kernel void advection_3D_float4(__global const float* gvelo,
+				__global float* ppos,
+				float dt, float min_position, float dx)
 {
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  int4 ind, i_ind, i_ind_p;
+  float invdx = 1.0f/dx;
+  uint i;
+  float4 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
 
-  __local float tile[GROUP_DIMX*(GROUP_DIMX+1)];
-  int block_x = get_group_id(0);
-  int block_y = get_group_id(1);
-
-  int local_x = get_local_id(0) & (GROUP_DIMX - 1);
-  int local_y = get_local_id(0) >> LOG_GROUP_DIMX;
+  for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4)
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      velocity_cache[i] = v.x;
+      velocity_cache[i+1] = v.y;
+      velocity_cache[i+2] = v.z;
+      velocity_cache[i+3] = v.w;
+    }
 
-  int local_input  = mad24(local_y, GROUP_DIMX + 1, local_x);
-  int local_output = mad24(local_x, GROUP_DIMX + 1, local_y);
+  barrier(CLK_LOCAL_MEM_FENCE);
 
-  int in_x = mad24(block_x, GROUP_DIMX, local_x);
-  int in_y = mad24(block_y, GROUP_DIMX, local_y);
-  int input_index = mad24(in_y, WIDTH, in_x);
+  for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4)
+    {
+      v = vload4(i/4, velocity_cache);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = coord + 0.5f*dt*v;
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y], velocity_cache[i_ind.z], velocity_cache[i_ind.w]);
+      vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y], velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]);
+      p = (v*(1.0f - y) + vp*y)*dt + coord + (float4)(min_position);
+      vstore4(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, ppos);
+    }
+}
+__kernel void remeshing_3D_float4(__global const float* ppos,
+		     __global const float* pscal,
+		     __global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0f/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/NB_WI_REMESHING;
+  float4 p, s, y;
+  uint4 index4;
 
-  int out_x = mad24(block_y, GROUP_DIMX, local_x);
-  int out_y = mad24(block_x, GROUP_DIMX, local_y);
+  __local float gscal_loc[WIDTH];
 
-  int output_index = mad24(out_y, WIDTH + PADDING, out_x);
+  for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4))
+{
+    gscal_loc[i] = 0.0f;
+    gscal_loc[i+1] = 0.0f;
+    gscal_loc[i+2] = 0.0f;
+    gscal_loc[i+3] = 0.0f;
+}
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gscal_loc[index4.x] += (alpha(y.x,s.x));
+      gscal_loc[index4.y] += (alpha(y.y,s.y));
+      gscal_loc[index4.z] += (alpha(y.z,s.z));
+      gscal_loc[index4.w] += (alpha(y.w,s.w));
+barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[index4.x] += (beta(y.x,s.x));
+      gscal_loc[index4.y] += (beta(y.y,s.y));
+      gscal_loc[index4.z] += (beta(y.z,s.z));
+      gscal_loc[index4.w] += (beta(y.w,s.w));
+barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[index4.x] += (gamma(y.x, s.x));
+      gscal_loc[index4.y] += (gamma(y.y, s.y));
+      gscal_loc[index4.z] += (gamma(y.z, s.z));
+      gscal_loc[index4.w] += (gamma(y.w, s.w));
+barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[index4.x] += (delta(y.x, s.x));
+      gscal_loc[index4.y] += (delta(y.y, s.y));
+      gscal_loc[index4.z] += (delta(y.z, s.z));
+      gscal_loc[index4.w] += (delta(y.w, s.w));
+barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[index4.x] += (eta(y.x, s.x));
+      gscal_loc[index4.y] += (eta(y.y, s.y));
+      gscal_loc[index4.z] += (eta(y.z, s.z));
+      gscal_loc[index4.w] += (eta(y.w, s.w));
+barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[index4.x] += (zeta(y.x, s.x));
+      gscal_loc[index4.y] += (zeta(y.y, s.y));
+      gscal_loc[index4.z] += (zeta(y.z, s.z));
+      gscal_loc[index4.w] += (zeta(y.w, s.w));
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal);
+}
 
-  int global_input_stride  = WIDTH * GROUP_DIMY;
-  int global_output_stride = (WIDTH + PADDING) * GROUP_DIMY;
+__kernel void copyLocMem_3D(__global const float* in,
+			    __global float* out)
+{
+ uint xIndex = get_group_id(0) * TILE_DIM_COPY_3D + get_local_id(0);
+ uint yIndex = get_group_id(1) * TILE_DIM_COPY_3D + get_local_id(1);
+ uint zIndex = get_global_id(2);
+ uint index = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
 
-  int local_input_stride  = GROUP_DIMY * (GROUP_DIMX + 1);
-  int local_output_stride = GROUP_DIMY;
+  __local float tile[TILE_DIM_COPY_3D][TILE_DIM_COPY_3D];
 
-  for (uint i=0; i<16; i++){
-    tile[local_input] = input[input_index];
-    local_input += local_input_stride;
-    input_index += global_input_stride;
+ for(uint i=0; i<TILE_DIM_COPY_3D; i+=BLOCK_ROWS_COPY_3D)
+  {
+    tile[get_local_id(1)+i][get_local_id(0)] = in[index + i*WIDTH];
   }
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  for (uint i=0; i<16; i++){
-    output[output_index] = tile[local_output];
-    local_output += local_output_stride;
-    output_index += global_output_stride;
+barrier(CLK_LOCAL_MEM_FENCE);
+ for(uint i=0; i<TILE_DIM_COPY_3D; i+=BLOCK_ROWS_COPY_3D)
+  {
+    out[index + i*WIDTH] = tile[get_local_id(1)+i][get_local_id(0)];
   }
 }
-__kernel void advec_init_transpose_3D_01(__global const float* input,
-					 __global float* output)
-{
-  /* Nouvelle version avec des index : (256*2, 256/32, 256) (64, 1, 1)
-     Transposition xyz -> yxz dans un buffer local 32,32,1*/
 
-  int block_x = get_group_id(0);
-  int block_y = get_group_id(1);
-  int block_z = get_group_id(2);
-
-  int local_x = get_local_id(0) % (GROUP_DIMX);
-  int local_y = get_local_id(0) / (GROUP_DIMX);
+__kernel void transpose_3D_xy_coalesced_locPad_Diag(__global const float* in,
+						    __global float* out)
+{
+ uint group_id_x = get_group_id(0);
+ uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
 
-  __local float tile[GROUP_DIMX*(GROUP_DIMX+1)];
+ uint xIndex = group_id_x * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(0);
+ uint yIndex = group_id_y * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(1);
+ uint zIndex = get_global_id(2);
+ uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
 
-  int input_index = local_x + block_x*GROUP_DIMX + (local_y + block_y*GROUP_DIMX)*WIDTH + block_z*WIDTH*WIDTH;
-  int local_input = local_x + local_y*(GROUP_DIMX+1);
-  int local_input_stride = GROUP_DIMY * (GROUP_DIMX+1);
-  int global_input_stride = WIDTH * GROUP_DIMY;
+ xIndex = group_id_y * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(0);
+ yIndex = group_id_x * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(1);
+ uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
 
-  int output_index = local_x + block_y*GROUP_DIMX + (local_y + block_x*GROUP_DIMX)*(WIDTH + PADDING) + block_z*(WIDTH + PADDING)*(WIDTH + PADDING);
-  int local_output = local_y + local_x*(GROUP_DIMX+1);
-  int local_output_stride = GROUP_DIMY;
-  int global_output_stride = (WIDTH + PADDING) * GROUP_DIMY;
+  __local float tile[TILE_DIM_TRANSPOSE_3D_XY][TILE_DIM_TRANSPOSE_3D_XY+1];
 
-  for (uint i=0; i<16; i++){
-    tile[local_input] = input[input_index];
-    local_input += local_input_stride;
-    input_index += global_input_stride;
+ for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XY; i+=BLOCK_ROWS_TRANSPOSE_3D_XY)
+  {
+    tile[get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH];
   }
-
   barrier(CLK_LOCAL_MEM_FENCE);
-
-  for (uint i=0; i<16; i++){
-    output[output_index] = tile[local_output];
-    local_output += local_output_stride;
-    output_index += global_output_stride;
+  for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XY; i+=BLOCK_ROWS_TRANSPOSE_3D_XY)
+  {
+    out[index_out + i*WIDTH] = tile[get_local_id(0)][get_local_id(1) + i];
   }
-
-  /* Fonctionne avec des index:  (64, 256, 256) (64, 1, 1)
-  uint gidX = get_global_id(0);
-  uint gidY = get_global_id(1);
-  uint gidZ = get_global_id(2);
-    for(uint i=gidX; i<WIDTH; i+=WGN)
-    {
-      output[i+gidY*(WIDTH+PADDING)+gidZ*(WIDTH+PADDING)*(WIDTH+PADDING)] = input[gidY+i*WIDTH+gidZ*WIDTH*WIDTH];
-      }
-*/
 }
-
-__kernel void advec_init_transpose_3D_02(__global const float* input,
-					 __global float* output)
+__kernel void transpose_3D_xz_coalesced(__global const float* in,
+					__global float* out)
 {
-  /* Nouvelle version avec des index : (256*2, 256, 256/32) (64, 1, 1)
-     Transposition yxz -> zxy (ie. xyz->zyx) dans un buffer local 32,32,1*/
+ uint xIndex = get_group_id(0) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(0);
+ uint yIndex = get_group_id(1) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(1);
+ uint zIndex = get_group_id(2) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(2);
+ uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
 
-  int block_x = get_group_id(0);
-  int block_y = get_group_id(1);
-  int block_z = get_group_id(2);
+ xIndex = get_group_id(2) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(0);
+ zIndex = get_group_id(0) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(2);
+ uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
 
-  int local_x = get_local_id(0) % (GROUP_DIMX);
-  int local_z = get_local_id(0) / (GROUP_DIMX);
+  __local float tile[TILE_DIM_TRANSPOSE_3D_XZ][TILE_DIM_TRANSPOSE_3D_XZ][TILE_DIM_TRANSPOSE_3D_XZ];
 
-  __local float tile[GROUP_DIMX*(GROUP_DIMX+1)];
-
-  int input_index = local_x + block_x*GROUP_DIMX + block_y*WIDTH + (local_z+block_z*GROUP_DIMX)*WIDTH*WIDTH;
-  int local_input = local_x + local_z*(GROUP_DIMX+1);
-  int local_input_stride = GROUP_DIMY*(GROUP_DIMX+1);
-   int global_input_stride = GROUP_DIMY*WIDTH*WIDTH;
-
-  int output_index = local_x + block_z*GROUP_DIMX + block_y*(WIDTH + PADDING) + (local_z+block_x*GROUP_DIMX)*(WIDTH + PADDING)*(WIDTH + PADDING);
-  int local_output = local_z + local_x*(GROUP_DIMX+1);
-  int local_output_stride = GROUP_DIMY;
-  int global_output_stride = GROUP_DIMY*(WIDTH + PADDING)*(WIDTH + PADDING);
-
-  for (uint i=0; i<16; i++){
-    tile[local_input] = input[input_index];
-    local_input += local_input_stride;
-    input_index += global_input_stride;
+for(uint j=0; j<TILE_DIM_TRANSPOSE_3D_XZ; j+=BLOCK_ROWS_TRANSPOSE_3D_XZ)
+{
+ for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XZ; i+=BLOCK_ROWS_TRANSPOSE_3D_XZ)
+  {
+    tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH + j*WIDTH*WIDTH];
   }
-
+}
   barrier(CLK_LOCAL_MEM_FENCE);
-
-  for (uint i=0; i<16; i++){
-    output[output_index] = tile[local_output];
-    local_output += local_output_stride;
-    output_index += global_output_stride;
+for(uint j=0; j<TILE_DIM_TRANSPOSE_3D_XZ; j+=BLOCK_ROWS_TRANSPOSE_3D_XZ)
+{
+  for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XZ; i+=BLOCK_ROWS_TRANSPOSE_3D_XZ)
+  {
+    out[index_out + i*WIDTH + j*WIDTH*WIDTH] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j];
   }
-
-  /* Fonctionne avec des index:  (64, 256, 256) (64, 1, 1)
-  uint gidX = get_global_id(0);
-  uint gidY = get_global_id(1);
-  uint gidZ = get_global_id(2);
-  uint i;
-  for(i=gidX; i<WIDTH; i+=WGN)
-    {
-      output[i+gidY*(WIDTH+PADDING)+gidZ*(WIDTH+PADDING)*(WIDTH+PADDING)] = input[gidZ+gidY*WIDTH+i*WIDTH*WIDTH];
-    }
-  */
 }
-
-
-#ifdef BASIC
-/* Advection Basique */
-__kernel void advection(__global const float* gvelo,
-			__global float* ppos,
-			float dt, float min_position, float dx)
-{
-  uint gidX = get_global_id(0);
-  uint gidY = get_global_id(1);
-  uint gidZ = get_global_id(2);
-  int ind, i_ind, i_ind_p;
-  float invdx = 1.0f/dx;
-  uint i;
-  float v, y, p;
-  for(i=gidX; i<WIDTH; i+=WGN)
-    {
-      v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH];
-      p = i*dx + 0.5f*dt*v;
-      ind = convert_int_rtn(p * invdx);
-      y = (p - convert_float(ind) * dx) * invdx;
-      i_ind = ((ind + WIDTH) % WIDTH);
-      i_ind_p = ((ind + 1) % WIDTH);
-      p=(gvelo[i_ind+gidY*WIDTH+gidZ*WIDTH*WIDTH]*(1.0f - y) + gvelo[i_ind_p+gidY*WIDTH+gidZ*WIDTH*WIDTH]*y)*dt + i*dx + min_position;
-      ppos[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = p;
-    }
-    }
+}
 #else
-/* Advection OpimisÃ©e */
- /*Optimisations :
-   - Memoire locale utilisÃ©e comme cache pour la vitesse
-   - IO en mÃ©moire globale par float4
-   - Calculs par float4
-Contraintes :
-   - Domaine de longueur WIDTH (dans la direction de splitting, ie. dans toutes les directions)
-   - WIDTH > 4*WGN
- */
-__kernel void advection(__global const float* gvelo,
-			__global float* ppos,
-			float dt, float min_position, float dx)
+__kernel void advection_2D_float4(__global const float* gvelo,
+				  __global float* ppos,
+				  float dt, float min_position, float dx)
 {
   uint gidX = get_global_id(0);
   uint gidY = get_global_id(1);
-  uint gidZ = get_global_id(2);
   int4 ind, i_ind, i_ind_p;
   float invdx = 1.0f/dx;
   uint i;
   float4 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+  for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4)
     {
-      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      v = vload4((i+gidY*WIDTH)/4, gvelo);
       velocity_cache[i] = v.x;
       velocity_cache[i+1] = v.y;
       velocity_cache[i+2] = v.z;
@@ -240,7 +246,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+  for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4)
     {
       v = vload4(i/4, velocity_cache);
       coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
@@ -252,124 +258,200 @@ __kernel void advection(__global const float* gvelo,
       v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y], velocity_cache[i_ind.z], velocity_cache[i_ind.w]);
       vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y], velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]);
       p = (v*(1.0f - y) + vp*y)*dt + coord + (float4)(min_position);
-      vstore4(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, ppos);
+      vstore4(p, (i+gidY*WIDTH)/4, ppos);
     }
-    }
-#endif
-
+}
 
-#ifdef BASIC
-/* Remaillage basique */
-__kernel void remeshing(__global const float* ppos,
-		     __global const float* pscal,
-		     __global float* gscal, float min_position, float dx)
+__kernel void advection_2D_basic(__global const float* gvelo,
+				 __global float* ppos,
+				 float dt, float min_position, float dx)
 {
   uint gidX = get_global_id(0);
   uint gidY = get_global_id(1);
-  uint gidZ = get_global_id(2);
+  int ind, i_ind, i_ind_p;
   float invdx = 1.0f/dx;
-  int ind;
-  uint i, nb_part = WIDTH/WGN;
-  float p, s, y;
-  uint index;
-
-  __local float gscal_loc[WIDTH];
-
-  for(i=gidX; i<WIDTH; i+=WGN)
-    gscal_loc[i] = 0.0f;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=1)
+  uint i;
+  float v, y, p;
+  for(i=gidX; i<WIDTH; i+=NB_WI_ADVECTION)
     {
-      p = ppos[(i + gidY*WIDTH + gidZ*WIDTH*WIDTH)] - min_position;
+      v = gvelo[i+gidY*WIDTH];
+      p = i*dx + 0.5f*dt*v;
       ind = convert_int_rtn(p * invdx);
       y = (p - convert_float(ind) * dx) * invdx;
-      s = pscal[(i + gidY*(WIDTH+PADDING) + gidZ*(WIDTH+PADDING)*(WIDTH+PADDING))];
-      index = convert_uint((ind - 2 + WIDTH) % WIDTH);
-      gscal_loc[index] += (alpha(y, s));
-      index = (index + 1) % WIDTH;
-      gscal_loc[index] += (beta(y,s));
-      index = (index + 1) % WIDTH;
-      gscal_loc[index] += (gamma(y,s));
-      index = (index + 1) % WIDTH;
-      gscal_loc[index] += (delta(y,s));
-      index = (index + 1) % WIDTH;
-      gscal_loc[index] += (eta(y,s));
-      index = (index + 1) % WIDTH;
-      gscal_loc[index] += (zeta(y,s));
-    }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX; i<WIDTH; i+=WGN)
-    gscal[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = gscal_loc[i];
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((ind + 1) % WIDTH);
+      p=(gvelo[i_ind+gidY*WIDTH]*(1.0f - y) + gvelo[i_ind_p+gidY*WIDTH]*y)*dt + i*dx + min_position;
+      ppos[i+gidY*WIDTH] = p;
     }
-#else
-/* Remaillage OptimisÃ©*/
- /*Optimisations :
-   - Memoire locale utilisÃ©e comme cache pour le scalaire et la synchronisation des Ã©critures
-   - IO en mÃ©moire globale par float4
-   - Calculs par float4
-Contraintes :
-   - Domaine de longueur WIDTH (dans la direction de splitting, ie. dans toutes les directions)
-   - WIDTH > 4*WGN
- */
-__kernel void remeshing(__global const float* ppos,
-		     __global const float* pscal,
-		     __global float* gscal, float min_position, float dx)
+}
+
+
+__kernel void remeshing_2D_float4(__global const float* ppos,
+				  __global const float* pscal,
+				  __global float* gscal, float min_position, float dx)
 {
   uint gidX = get_global_id(0);
   uint gidY = get_global_id(1);
-  uint gidZ = get_global_id(2);
   float invdx = 1.0f/dx;
   int4 ind;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/NB_WI_REMESHING;
   float4 p, s, y;
+  float gsx, gsy, gsz, gsw;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
 
-  for(i=gidX; i<WIDTH; i+=WGN)
+  for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4))
+{
     gscal_loc[i] = 0.0f;
+    gscal_loc[i+1] = 0.0f;
+    gscal_loc[i+2] = 0.0f;
+    gscal_loc[i+3] = 0.0f;
+}
   barrier(CLK_LOCAL_MEM_FENCE);
   for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
     {
-      p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position);
-      s = vload4((i + gidY*(WIDTH+PADDING) + gidZ*(WIDTH+PADDING)*(WIDTH+PADDING))/4, pscal);
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
       ind = convert_int4_rtn(p * invdx);
       y = (p - convert_float4(ind) * dx) * invdx;
       index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
-      gscal_loc[index4.x] += (alpha(y.x, s.x));
-      gscal_loc[index4.y] += (alpha(y.y, s.y));
-      gscal_loc[index4.z] += (alpha(y.z, s.z));
-      gscal_loc[index4.w] += (alpha(y.w, s.w));
+      gsx = gscal_loc[index4.x];
+      gsy = gscal_loc[index4.y];
+      gsz = gscal_loc[index4.z];
+      gsw = gscal_loc[index4.w];
+      gsx = gsx + (alpha(y.x, s.x));
+      gsy = gsy + (alpha(y.y, s.y));
+      gsz = gsz + (alpha(y.z, s.z));
+      gsw = gsw + (alpha(y.w, s.w));
+      gscal_loc[index4.x] = gsx;
+      gscal_loc[index4.y] = gsy;
+      gscal_loc[index4.z] = gsz;
+      gscal_loc[index4.w] = gsw;
+barrier(CLK_LOCAL_MEM_FENCE);
       index4 = (index4 + 1) % WIDTH;
-      gscal_loc[index4.x] += (beta(y.x,s.x));
-      gscal_loc[index4.y] += (beta(y.y,s.y));
-      gscal_loc[index4.z] += (beta(y.z,s.z));
-      gscal_loc[index4.w] += (beta(y.w,s.w));
+      gsx = gscal_loc[index4.x];
+      gsy = gscal_loc[index4.y];
+      gsz = gscal_loc[index4.z];
+      gsw = gscal_loc[index4.w];
+      gsx = gsx + (beta(y.x, s.x));
+      gsy = gsy + (beta(y.y, s.y));
+      gsz = gsz + (beta(y.z, s.z));
+      gsw = gsw + (beta(y.w, s.w));
+      gscal_loc[index4.x] = gsx;
+      gscal_loc[index4.y] = gsy;
+      gscal_loc[index4.z] = gsz;
+      gscal_loc[index4.w] = gsw;
+barrier(CLK_LOCAL_MEM_FENCE);
       index4 = (index4 + 1) % WIDTH;
-      gscal_loc[index4.x] += (gamma(y.x, s.x));
-      gscal_loc[index4.y] += (gamma(y.y, s.y));
-      gscal_loc[index4.z] += (gamma(y.z, s.z));
-      gscal_loc[index4.w] += (gamma(y.w, s.w));
+      gsx = gscal_loc[index4.x];
+      gsy = gscal_loc[index4.y];
+      gsz = gscal_loc[index4.z];
+      gsw = gscal_loc[index4.w];
+      gsx = gsx + (gamma(y.x, s.x));
+      gsy = gsy + (gamma(y.y, s.y));
+      gsz = gsz + (gamma(y.z, s.z));
+      gsw = gsw + (gamma(y.w, s.w));
+      gscal_loc[index4.x] = gsx;
+      gscal_loc[index4.y] = gsy;
+      gscal_loc[index4.z] = gsz;
+      gscal_loc[index4.w] = gsw;
+barrier(CLK_LOCAL_MEM_FENCE);
       index4 = (index4 + 1) % WIDTH;
-      gscal_loc[index4.x] += (delta(y.x, s.x));
-      gscal_loc[index4.y] += (delta(y.y, s.y));
-      gscal_loc[index4.z] += (delta(y.z, s.z));
-      gscal_loc[index4.w] += (delta(y.w, s.w));
+      gsx = gscal_loc[index4.x];
+      gsy = gscal_loc[index4.y];
+      gsz = gscal_loc[index4.z];
+      gsw = gscal_loc[index4.w];
+      gsx = gsx + (delta(y.x, s.x));
+      gsy = gsy + (delta(y.y, s.y));
+      gsz = gsz + (delta(y.z, s.z));
+      gsw = gsw + (delta(y.w, s.w));
+      gscal_loc[index4.x] = gsx;
+      gscal_loc[index4.y] = gsy;
+      gscal_loc[index4.z] = gsz;
+      gscal_loc[index4.w] = gsw;
+barrier(CLK_LOCAL_MEM_FENCE);
       index4 = (index4 + 1) % WIDTH;
-      gscal_loc[index4.x] += (eta(y.x, s.x));
-      gscal_loc[index4.y] += (eta(y.y, s.y));
-      gscal_loc[index4.z] += (eta(y.z, s.z));
-      gscal_loc[index4.w] += (eta(y.w, s.w));
+      gsx = gscal_loc[index4.x];
+      gsy = gscal_loc[index4.y];
+      gsz = gscal_loc[index4.z];
+      gsw = gscal_loc[index4.w];
+      gsx = gsx + (eta(y.x, s.x));
+      gsy = gsy + (eta(y.y, s.y));
+      gsz = gsz + (eta(y.z, s.z));
+      gsw = gsw + (eta(y.w, s.w));
+      gscal_loc[index4.x] = gsx;
+      gscal_loc[index4.y] = gsy;
+      gscal_loc[index4.z] = gsz;
+      gscal_loc[index4.w] = gsw;
+barrier(CLK_LOCAL_MEM_FENCE);
       index4 = (index4 + 1) % WIDTH;
-      gscal_loc[index4.x] += (zeta(y.x, s.x));
-      gscal_loc[index4.y] += (zeta(y.y, s.y));
-      gscal_loc[index4.z] += (zeta(y.z, s.z));
-      gscal_loc[index4.w] += (zeta(y.w, s.w));
+      gsx = gscal_loc[index4.x];
+      gsy = gscal_loc[index4.y];
+      gsz = gscal_loc[index4.z];
+      gsw = gscal_loc[index4.w];
+      gsx = gsx + (zeta(y.x, s.x));
+      gsy = gsy + (zeta(y.y, s.y));
+      gsz = gsz + (zeta(y.z, s.z));
+      gsw = gsw + (zeta(y.w, s.w));
+      gscal_loc[index4.x] = gsx;
+      gscal_loc[index4.y] = gsy;
+      gscal_loc[index4.z] = gsz;
+      gscal_loc[index4.w] = gsw;
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
-    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal);
+  for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH)/4, gscal);
+}
+
+__kernel void copyLocMem_2D(__global const float* in,
+			    __global float* out)
+{
+ uint xIndex = get_group_id(0) * TILE_DIM_COPY_2D + get_local_id(0);
+ uint yIndex = get_group_id(1) * TILE_DIM_COPY_2D + get_local_id(1);
+ uint index = xIndex + yIndex * WIDTH;
+
+  __local float tile[TILE_DIM_COPY_2D][TILE_DIM_COPY_2D];
+
+ for(uint i=0; i<TILE_DIM_COPY_2D; i+=BLOCK_ROWS_COPY_2D)
+  {
+    tile[get_local_id(1)+i][get_local_id(0)] = in[index + i*WIDTH];
+  }
+barrier(CLK_LOCAL_MEM_FENCE);
+ for(uint i=0; i<TILE_DIM_COPY_2D; i+=BLOCK_ROWS_COPY_2D)
+  {
+    out[index + i*WIDTH] = tile[get_local_id(1)+i][get_local_id(0)];
+  }
+}
+
+__kernel void transpose_2D_coalesced_locPad_Diag(__global const float* in,
+						 __global float* out)
+{
+ uint group_id_x = get_group_id(0);
+ uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+
+ uint xIndex = group_id_x * TILE_DIM_TRANSPOSE_2D + get_local_id(0);
+ uint yIndex = group_id_y * TILE_DIM_TRANSPOSE_2D + get_local_id(1);
+ uint index_in = xIndex + yIndex * WIDTH;
+
+ xIndex = group_id_y * TILE_DIM_TRANSPOSE_2D + get_local_id(0);
+ yIndex = group_id_x * TILE_DIM_TRANSPOSE_2D + get_local_id(1);
+ uint index_out = xIndex + yIndex * WIDTH;
+
+  __local float tile[TILE_DIM_TRANSPOSE_2D][TILE_DIM_TRANSPOSE_2D+1];
+
+ for(uint i=0; i<TILE_DIM_TRANSPOSE_2D; i+=BLOCK_ROWS_TRANSPOSE_2D)
+  {
+    tile[get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint i=0; i<TILE_DIM_TRANSPOSE_2D; i+=BLOCK_ROWS_TRANSPOSE_2D)
+  {
+    out[index_out + i*WIDTH] = tile[get_local_id(0)][get_local_id(1) + i];
+  }
 }
+
+
 #endif
 
 
@@ -379,6 +461,10 @@ __kernel void remeshing(__global const float* ppos,
 
 
 
+
+
+
+
 /*
 
 // volume computing
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..008be61a924d04004e424cf66c0dadd509ac59c2
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_builtin.cl
@@ -0,0 +1,34 @@
+/**
+ * 2D advection kernel.
+ *
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  int ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float v, y, p;
+  for(i=gidX; i<WIDTH; i+=WGN)
+    {
+      v = gvelo[i+gidY*WIDTH];
+      p = fma((float)(0.5*dt),v,i*dx);
+      ind = convert_int_rtn(p * invdx);
+      y = (fma(- convert_float(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((ind + 1) % WIDTH);
+      p=fma(mix(gvelo[i_ind+gidY*WIDTH],gvelo[i_ind_p+gidY*WIDTH],y),dt,i*dx + min_position);
+      ppos[i+gidY*WIDTH] = p;
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt2_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt2_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..d99fe53429fe274cca78682038c06150eda981b9
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt2_builtin.cl
@@ -0,0 +1,49 @@
+/**
+ * 2D advection kernel.
+ *
+ * Memory reads and writes are performed using 2 element OpenCL vectors (float2 or double2).
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  int2 ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float2 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
+
+  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+    {
+      v = vload2((i+gidY*WIDTH)/2, gvelo);
+      velocity_cache[i] = v.x;
+      velocity_cache[i+1] = v.y;
+    }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+    {
+      v = vload2(i/2, velocity_cache);
+      coord = (float2)(i*dx, (i+1)*dx);
+      p = fma((float2)(0.5*dt),v,coord);
+      ind = convert_int2_rtn(p * invdx);
+      y = (fma(- convert_float2(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float2)(velocity_cache[i_ind.x], velocity_cache[i_ind.y]);
+      vp = (float2)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y]);
+      p=fma(mix(v,vp,y),dt,coord + (float2)(min_position));
+      vstore2(p, (i+gidY*WIDTH)/2, ppos);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt4_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..c9d9637ad6728e8b5346532d706618100e647324
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt4_builtin.cl
@@ -0,0 +1,53 @@
+/**
+ * 2D advection kernel.
+ *
+ * Memory reads and writes are performed using 4 element OpenCL vectors (float4 or double4).
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  int4 ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float4 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+    {
+      v = vload4((i+gidY*WIDTH)/4, gvelo);
+      velocity_cache[i] = v.x;
+      velocity_cache[i+1] = v.y;
+      velocity_cache[i+2] = v.z;
+      velocity_cache[i+3] = v.w;
+    }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+    {
+      v = vload4(i/4, velocity_cache);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y],
+                   velocity_cache[i_ind.z], velocity_cache[i_ind.w]);
+      vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y],
+                    velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]);
+      p=fma(mix(v,vp,y),dt,coord + (float4)(min_position));
+      vstore4(p, (i+gidY*WIDTH)/4, ppos);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt8_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt8_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..cab7c1dd42d87d26fb21c864a80211d430d2a460
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt8_builtin.cl
@@ -0,0 +1,76 @@
+/**
+ * 2D advection kernel.
+ *
+ * Memory reads and writes are performed using 8 element OpenCL vectors (float8 or double8).
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  int8 ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float8 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
+
+  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+    {
+      v = vload8((i+gidY*WIDTH)/8, gvelo);
+      velocity_cache[i] = v.s0;
+      velocity_cache[i+1] = v.s1;
+      velocity_cache[i+2] = v.s2;
+      velocity_cache[i+3] = v.s3;
+      velocity_cache[i+4] = v.s4;
+      velocity_cache[i+5] = v.s5;
+      velocity_cache[i+6] = v.s6;
+      velocity_cache[i+7] = v.s7;
+    }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+    {
+      v = vload8(i/8, velocity_cache);
+      coord = (float8)(i*dx,
+		       (i+1)*dx,
+		       (i+2)*dx,
+		       (i+3)*dx,
+		       (i+4)*dx,
+		       (i+5)*dx,
+		       (i+6)*dx,
+		       (i+7)*dx);
+      p = fma((float8)(0.5*dt),v,coord);
+      ind = convert_int8_rtn(p * invdx);
+      y = (fma(- convert_float8(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float8)(velocity_cache[i_ind.s0],
+                   velocity_cache[i_ind.s1],
+                   velocity_cache[i_ind.s2],
+                   velocity_cache[i_ind.s3],
+                   velocity_cache[i_ind.s4],
+                   velocity_cache[i_ind.s5],
+                   velocity_cache[i_ind.s6],
+                   velocity_cache[i_ind.s7]);
+      vp = (float8)(velocity_cache[i_ind_p.s0],
+                    velocity_cache[i_ind_p.s1],
+                    velocity_cache[i_ind_p.s2],
+                    velocity_cache[i_ind_p.s3],
+                    velocity_cache[i_ind_p.s4],
+                    velocity_cache[i_ind_p.s5],
+                    velocity_cache[i_ind_p.s6],
+                    velocity_cache[i_ind_p.s7]);
+      p = fma(mix(v,vp,y),dt,coord + (float8)(min_position));
+      vstore8(p, (i+gidY*WIDTH)/8, ppos);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..c54dd72679192fcca254c170d8669d3cbe80742f
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_builtin.cl
@@ -0,0 +1,37 @@
+/**
+ * 3D advection kernel.
+ *
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  int ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float v, y, p;
+  for(i=gidX; i<WIDTH; i+=WGN)
+    {
+      v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH];
+      p = fma((float)(0.5*dt),v,i*dx);
+      ind = convert_int_rtn(p * invdx);
+      y = (fma(- convert_float(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((ind + 1) % WIDTH);
+      p=fma(mix(gvelo[i_ind+gidY*WIDTH+gidZ*WIDTH*WIDTH],
+                gvelo[i_ind_p+gidY*WIDTH+gidZ*WIDTH*WIDTH],y),
+            dt,i*dx + min_position);
+      ppos[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = p;
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt2_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt2_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..3891bb10dcdc5442cb14e4857f385eb4f4c7f4c8
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt2_builtin.cl
@@ -0,0 +1,50 @@
+/**
+ * 3D advection kernel.
+ *
+ * Memory reads and writes are performed using 2 element OpenCL vectors (float2 or double2).
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  int2 ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float2 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
+
+  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+    {
+      v = vload2((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/2, gvelo);
+      velocity_cache[i] = v.x;
+      velocity_cache[i+1] = v.y;
+    }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+    {
+      v = vload2(i/2, velocity_cache);
+      coord = (float2)(i*dx, (i+1)*dx);
+      p = fma((float2)(0.5*dt),v,coord);
+      ind = convert_int2_rtn(p * invdx);
+      y = (fma(- convert_float2(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float2)(velocity_cache[i_ind.x], velocity_cache[i_ind.y]);
+      vp = (float2)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y]);
+      p=fma(mix(v,vp,y),dt,coord + (float2)(min_position));
+      vstore2(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/2, ppos);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt4_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..49ae8e1d4bb4ab796e0b659eaeadc7e8177bb0eb
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt4_builtin.cl
@@ -0,0 +1,54 @@
+/**
+ * 3D advection kernel.
+ *
+ * Memory reads and writes are performed using 4 element OpenCL vectors (float4 or double4).
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  int4 ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float4 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      velocity_cache[i] = v.x;
+      velocity_cache[i+1] = v.y;
+      velocity_cache[i+2] = v.z;
+      velocity_cache[i+3] = v.w;
+    }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+    {
+      v = vload4(i/4, velocity_cache);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y],
+                   velocity_cache[i_ind.z], velocity_cache[i_ind.w]);
+      vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y],
+                    velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]);
+      p=fma(mix(v,vp,y),dt,coord + (float4)(min_position));
+      vstore4(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, ppos);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt8_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt8_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..db3fb09a765970c6a15b7c3f43a99fc3e92c46bb
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt8_builtin.cl
@@ -0,0 +1,77 @@
+/**
+ * 3D advection kernel.
+ *
+ * Memory reads and writes are performed using 8 element OpenCL vectors (float8 or double8).
+ * Computations are done using builtin functions on vector types.
+ * A local array is used un local memory as a cache for velocity.
+ *
+ * @param gvelo Velocity.
+ * @param ppos Particle position.
+ * @param dt Time step.
+ * @param min_position Domain lower point.
+ * @param dx Space step.
+ */
+__kernel void advection(__global const float* gvelo,
+			__global float* ppos,
+			float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  int8 ind, i_ind, i_ind_p;
+  float invdx = 1.0/dx;
+  uint i;
+  float8 v, vp, y, p, coord;
+  __local float velocity_cache[WIDTH];
+
+  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+    {
+      v = vload8((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/8, gvelo);
+      velocity_cache[i] = v.s0;
+      velocity_cache[i+1] = v.s1;
+      velocity_cache[i+2] = v.s2;
+      velocity_cache[i+3] = v.s3;
+      velocity_cache[i+4] = v.s4;
+      velocity_cache[i+5] = v.s5;
+      velocity_cache[i+6] = v.s6;
+      velocity_cache[i+7] = v.s7;
+    }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+    {
+      v = vload8(i/8, velocity_cache);
+      coord = (float8)(i*dx,
+		       (i+1)*dx,
+		       (i+2)*dx,
+		       (i+3)*dx,
+		       (i+4)*dx,
+		       (i+5)*dx,
+		       (i+6)*dx,
+		       (i+7)*dx);
+      p = fma((float8)(0.5*dt),v,coord);
+      ind = convert_int8_rtn(p * invdx);
+      y = (fma(- convert_float8(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float8)(velocity_cache[i_ind.s0],
+                   velocity_cache[i_ind.s1],
+                   velocity_cache[i_ind.s2],
+                   velocity_cache[i_ind.s3],
+                   velocity_cache[i_ind.s4],
+                   velocity_cache[i_ind.s5],
+                   velocity_cache[i_ind.s6],
+                   velocity_cache[i_ind.s7]);
+      vp = (float8)(velocity_cache[i_ind_p.s0],
+                    velocity_cache[i_ind_p.s1],
+                    velocity_cache[i_ind_p.s2],
+                    velocity_cache[i_ind_p.s3],
+                    velocity_cache[i_ind_p.s4],
+                    velocity_cache[i_ind_p.s5],
+                    velocity_cache[i_ind_p.s6],
+                    velocity_cache[i_ind_p.s7]);
+      p = fma(mix(v,vp,y),dt,coord + (float8)(min_position));
+      vstore8(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/8, ppos);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_2D.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_2D.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1af25c6d58beb24ead4e25dfbd21cb78adc64613
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/copy_2D.cl
@@ -0,0 +1,14 @@
+
+__kernel void copy(__global const float* in,
+		   __global float* out)
+{
+  uint xIndex = get_group_id(0) * TILE_DIM_COPY + get_local_id(0);
+  uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1);
+  uint zIndex = get_global_id(2);
+  uint index = xIndex + yIndex * WIDTH + zIndex*WIDTH*WIDTH;
+
+  for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY)
+    {
+      out[index + i*WIDTH] = in[index + i*WIDTH];
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_2D_opt2.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_2D_opt2.cl
new file mode 100644
index 0000000000000000000000000000000000000000..4cff9b8e204cb4f43fb3525ec19d6ac66d401fc9
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/copy_2D_opt2.cl
@@ -0,0 +1,17 @@
+
+__kernel void copy(__global const float* in,
+		   __global float* out)
+{
+  uint xIndex = (get_group_id(0) * TILE_DIM_COPY + get_local_id(0)*2);
+  uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1);
+  uint index = xIndex + yIndex * WIDTH;
+  float x,y,z,w;
+
+  for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY)
+    {
+      x = in[index + i*WIDTH];
+      y = in[index + 1 + i*WIDTH];
+      out[index + i*WIDTH] = x;
+      out[index + 1 + i*WIDTH] = y;
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_3D_locMem.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_locMem.cl
new file mode 100644
index 0000000000000000000000000000000000000000..b13c4b735ca5000ba8697cdfd43aa0cf024a789c
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_locMem.cl
@@ -0,0 +1,21 @@
+
+__kernel void copy(__global const float* in,
+		   __global float* out)
+{
+  uint xIndex = get_group_id(0) * TILE_DIM_COPY + get_local_id(0);
+  uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1);
+  uint zIndex = get_global_id(2);
+  uint index = xIndex + yIndex * WIDTH + zIndex*WIDTH*WIDTH;
+
+  __local float tile[TILE_DIM_COPY][TILE_DIM_COPY];
+
+  for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY)
+    {
+      tile[get_local_id(1)+i][get_local_id(0)] = in[index + i*WIDTH];
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY)
+    {
+      out[index + i*WIDTH] = tile[get_local_id(1)+i][get_local_id(0)];
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_3D_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_opt4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..e5bec1c04df53e5b901db18b6c959bbc4443fb55
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_opt4.cl
@@ -0,0 +1,22 @@
+
+__kernel void copy(__global const float* in,
+		   __global float* out)
+{
+  uint xIndex = (get_group_id(0) * TILE_DIM_COPY + get_local_id(0)*4);
+  uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1);
+  uint zIndex = get_global_id(2);
+  uint index = xIndex + yIndex * WIDTH + zIndex*WIDTH*WIDTH;
+  float x,y,z,w;
+
+  for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY)
+    {
+      x = in[index + i*WIDTH];
+      y = in[index + 1 + i*WIDTH];
+      z = in[index + 2 + i*WIDTH];
+      w = in[index + 3 + i*WIDTH];
+      out[index + i*WIDTH] = x;
+      out[index + 1 + i*WIDTH] = y;
+      out[index + 2 + i*WIDTH] = z;
+      out[index + 3 + i*WIDTH] = w;
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..895250723bdd1eaa04357817122fd5205eb10e1c
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_noBC.cl
@@ -0,0 +1,90 @@
+/**
+ * Local memory as a 2D array to avoid bank conflicts.
+ *
+ * @param id Index 1D
+ * @param nb_part Particle number per work-item
+ *
+ * @return Index in a 2D space.
+ */
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+
+/**
+  * 2D Remeshing kernel.
+  *
+  * @param ppos Particle position
+  * @param pscal Particle scalar
+  * @param gscal Grid scalar
+  * @param min_position Domain lower point
+  * @param dx Space step
+  */
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..bd831d5abf3d791b984c67d3e4dee9d3b447f004
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4.cl
@@ -0,0 +1,76 @@
+
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[index4.x]+= gs.x;
+      gscal_loc[index4.y]+= gs.y;
+      gscal_loc[index4.z]+= gs.z;
+      gscal_loc[index4.w]+= gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..014edd31671f10731a58ba826b876bb5bf612fa1
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4_noBC.cl
@@ -0,0 +1,79 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGN+(id/nb_part);
+}
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (gamma(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (delta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (zeta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_3D_opt4_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_3D_opt4_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..eb67791af74bdfb46e79c38c87fcc216d5fc6032
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_3D_opt4_private4.cl
@@ -0,0 +1,81 @@
+
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGN;
+  float4 p, s, y, gs;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[index4.x]+= gs.x;
+      gscal_loc[index4.y]+= gs.y;
+      gscal_loc[index4.z]+= gs.z;
+      gscal_loc[index4.w]+= gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+    vstore4((float4)(gscal_loc[i],
+                     gscal_loc[i+1],
+                     gscal_loc[i+2],
+                     gscal_loc[i+3]),
+            (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_2D_xy_coalesced_locPad_Diag_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_2D_xy_coalesced_locPad_Diag_opt4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..b6853e37ba37b97e35fbf8a7d224acfcbda7c5a2
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_2D_xy_coalesced_locPad_Diag_opt4.cl
@@ -0,0 +1,36 @@
+
+__kernel void transpose_xy(__global const float* in,
+			   __global float* out)
+{
+  float4 temp;
+  uint group_id_x = get_group_id(0);
+  uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+
+  uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*4;
+  uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1);
+  uint index_in = xIndex + yIndex * WIDTH;
+
+  xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*4;
+  yIndex = group_id_x * TILE_DIM_XY + get_local_id(1);
+  uint index_out = xIndex + yIndex * WIDTH;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+1];
+
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = vload4((index_in + i*WIDTH)/4, in);
+      tile[get_local_id(1) + i][get_local_id(0)*4] = temp.x;
+      tile[get_local_id(1) + i][get_local_id(0)*4+1] = temp.y;
+      tile[get_local_id(1) + i][get_local_id(0)*4+2] = temp.z;
+      tile[get_local_id(1) + i][get_local_id(0)*4+3] = temp.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = (float4)(tile[get_local_id(0)*4][get_local_id(1) + i],
+		      tile[get_local_id(0)*4+1][get_local_id(1) + i],
+		      tile[get_local_id(0)*4+2][get_local_id(1) + i],
+		      tile[get_local_id(0)*4+3][get_local_id(1) + i]);
+      vstore4(temp, (index_out + i*WIDTH)/4, out);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f94717ae5827d2b0bdd50d9d27da32336e935aef
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl
@@ -0,0 +1,33 @@
+
+__kernel void transpose_xy(__global const float* in,
+			   __global float* out)
+{
+  float2 temp;
+  uint group_id_x = get_group_id(0);
+  uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+
+  uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*2;
+  uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1);
+  uint zIndex = get_global_id(2);
+  uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*2;
+  yIndex = group_id_x * TILE_DIM_XY + get_local_id(1);
+  uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+1];
+
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = vload2((index_in + i*WIDTH)/2, in);
+      tile[get_local_id(1) + i][get_local_id(0)*2] = temp.x;
+      tile[get_local_id(1) + i][get_local_id(0)*2+1] = temp.y;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = (float2)(tile[get_local_id(0)*2][get_local_id(1) + i],
+		      tile[get_local_id(0)*2+1][get_local_id(1) + i]);
+      vstore2(temp, (index_out + i*WIDTH)/2, out);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..e8afa56b9744f41d70327adde3541c83d0f17806
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl
@@ -0,0 +1,37 @@
+
+__kernel void transpose_xy(__global const float* in,
+			   __global float* out)
+{
+  float4 temp;
+  uint group_id_x = get_group_id(0);
+  uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+
+  uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*4;
+  uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1);
+  uint zIndex = get_global_id(2);
+  uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*4;
+  yIndex = group_id_x * TILE_DIM_XY + get_local_id(1);
+  uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+1];
+
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = vload4((index_in + i*WIDTH)/4, in);
+      tile[get_local_id(1) + i][get_local_id(0)*4] = temp.x;
+      tile[get_local_id(1) + i][get_local_id(0)*4+1] = temp.y;
+      tile[get_local_id(1) + i][get_local_id(0)*4+2] = temp.z;
+      tile[get_local_id(1) + i][get_local_id(0)*4+3] = temp.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = (float4)(tile[get_local_id(0)*4][get_local_id(1) + i],
+		      tile[get_local_id(0)*4+1][get_local_id(1) + i],
+		      tile[get_local_id(0)*4+2][get_local_id(1) + i],
+		      tile[get_local_id(0)*4+3][get_local_id(1) + i]);
+      vstore4(temp, (index_out + i*WIDTH)/4, out);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_opt2.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_opt2.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f94717ae5827d2b0bdd50d9d27da32336e935aef
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_opt2.cl
@@ -0,0 +1,33 @@
+
+__kernel void transpose_xy(__global const float* in,
+			   __global float* out)
+{
+  float2 temp;
+  uint group_id_x = get_group_id(0);
+  uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+
+  uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*2;
+  uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1);
+  uint zIndex = get_global_id(2);
+  uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*2;
+  yIndex = group_id_x * TILE_DIM_XY + get_local_id(1);
+  uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+1];
+
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = vload2((index_in + i*WIDTH)/2, in);
+      tile[get_local_id(1) + i][get_local_id(0)*2] = temp.x;
+      tile[get_local_id(1) + i][get_local_id(0)*2+1] = temp.y;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      temp = (float2)(tile[get_local_id(0)*2][get_local_id(1) + i],
+		      tile[get_local_id(0)*2+1][get_local_id(1) + i]);
+      vstore2(temp, (index_out + i*WIDTH)/2, out);
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl
new file mode 100644
index 0000000000000000000000000000000000000000..67e8891bd6910bb0d2463ec367f10a11d8aa5ba8
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl
@@ -0,0 +1,34 @@
+
+__kernel void transpose_xz(__global const float* in,
+			   __global float* out)
+{
+  uint group_id_x = get_group_id(0);
+  uint group_id_z = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
+
+  uint xIndex = group_id_x * TILE_DIM_XZ + get_local_id(0);
+  uint yIndex = get_group_id(1) * TILE_DIM_XZ + get_local_id(1);
+  uint zIndex = group_id_z * TILE_DIM_XZ + get_local_id(2);
+  uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  xIndex = group_id_z * TILE_DIM_XZ + get_local_id(0);
+  zIndex = group_id_x * TILE_DIM_XZ + get_local_id(2);
+  uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ];
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ)
+    {
+      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
+	{
+	  tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH + j*WIDTH*WIDTH];
+	}
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ)
+    {
+      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
+	{
+	  out[index_out + i*WIDTH + j*WIDTH*WIDTH] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j];
+	}
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl
new file mode 100644
index 0000000000000000000000000000000000000000..a112ea1993e8885aa57c845d09120b466a57d21f
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl
@@ -0,0 +1,34 @@
+
+__kernel void transpose_xz(__global const float* in,
+			   __global float* out)
+{
+  uint group_id_x = get_group_id(0);
+  uint group_id_z = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
+
+  uint xIndex = group_id_x * TILE_DIM_XZ + get_local_id(0);
+  uint yIndex = get_group_id(1) * TILE_DIM_XZ + get_local_id(1);
+  uint zIndex = group_id_z * TILE_DIM_XZ + get_local_id(2);
+  uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  xIndex = group_id_z * TILE_DIM_XZ + get_local_id(0);
+  zIndex = group_id_x * TILE_DIM_XZ + get_local_id(2);
+  uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+1];
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ)
+    {
+      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
+	{
+	  tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH + j*WIDTH*WIDTH];
+	}
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ)
+    {
+      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
+	{
+	  out[index_out + i*WIDTH + j*WIDTH*WIDTH] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j];
+	}
+    }
+}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime.cl
new file mode 100644
index 0000000000000000000000000000000000000000..cd21a32e74bff85d463b68da55d44bd8f83749ee
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime.cl
@@ -0,0 +1,20 @@
+/**
+ * M6prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ */
+inline float alpha(float y, float s){
+  return (y * (y * (y * (y * (13.0 - 5.0 * y) - 9.0) - 1.0) + 2.0) / 24.0) * s;}
+inline float beta(float y, float s){
+  return (y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0) / 24.0)*s;}
+inline float gamma(float y, float s){
+  return (y * y * (y * (y * (126.0 - 50.0 * y) - 70.0) - 30.0) / 24.0 + 1.0)*s;}
+inline float delta(float y, float s){
+  return (y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0) / 24.0)*s;}
+inline float eta(float y, float s){
+  return (y * (y * (y * (y * (61.0 - 25.0 * y) - 33.0) - 1.0) - 2.0) / 24.0)*s;}
+inline float zeta(float y, float s){
+  return (y * y * y * (y * (5.0 * y - 12.0) + 7.0) / 24.0)*s;}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f63a419523ab10048ed59ca85cbb25e3e1326da1
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_builtin.cl
@@ -0,0 +1,13 @@
+
+inline float alpha(float y, float s){
+  return (y * fma(y , fma(y , fma(y , fma(-5.0, y, 13.0),- 9.0),- 1.0), 2.0) / 24.0) * s;}
+inline float beta(float y, float s){
+  return (y * fma(y , fma(y , fma(y , fma(25.0 , y ,- 64.0), 39.0) , 16.0), - 16.0) / 24.0)*s;}
+inline float gamma(float y, float s){
+  return (y * y * fma(y , fma(y , fma( - 50.0 , y, 126.0) ,- 70.0) ,- 30.0) / 24.0 + 1.0)*s;}
+inline float delta(float y, float s){
+  return (y * fma(y , fma(y, fma(y, fma(50.0, y, - 124.0), 66.0) , 16.0) , 16.0) / 24.0)*s;}
+inline float eta(float y, float s){
+  return (y * fma(y , fma(y , fma(y , fma(- 25.0 , y, 61.0), - 33.0), - 1.0), - 2.0) / 24.0)*s;}
+inline float zeta(float y, float s){
+  return (y * y * y * fma(y , fma(5.0 , y ,- 12.0) , 7.0) / 24.0)*s;}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..865327f319211b18015e1ed556b7456c54b6cb7c
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4.cl
@@ -0,0 +1,13 @@
+
+inline float4 alpha(float4 y, float4 s){
+  return (y * (y * (y * (y * (13.0 - 5.0 * y) - 9.0) - 1.0) + 2.0) / 24.0) * s;}
+inline float4 beta(float4 y, float4 s){
+  return (y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0) / 24.0)*s;}
+inline float4 gamma(float4 y, float4 s){
+  return (y * y * (y * (y * (126.0 - 50.0 * y) - 70.0) - 30.0) / 24.0 + 1.0)*s;}
+inline float4 delta(float4 y, float4 s){
+  return (y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0) / 24.0)*s;}
+inline float4 eta(float4 y, float4 s){
+  return (y * (y * (y * (y * (61.0 - 25.0 * y) - 33.0) - 1.0) - 2.0) / 24.0)*s;}
+inline float4 zeta(float4 y, float4 s){
+  return (y * y * y * (y * (5.0 * y - 12.0) + 7.0) / 24.0)*s;}
diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..3a4c2eaeb63ee4317914062ddfb07a0fd38bb6a3
--- /dev/null
+++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4_builtin.cl
@@ -0,0 +1,13 @@
+
+inline float4 alpha(float4 y, float4 s){
+  return (y * fma(y , fma(y , fma(y , fma(-5.0, y, 13.0),- 9.0),- 1.0), 2.0) / 24.0) * s;}
+inline float4 beta(float4 y, float4 s){
+  return (y * fma(y , fma(y , fma(y , fma(25.0 , y ,- 64.0), 39.0) , 16.0), - 16.0) / 24.0)*s;}
+inline float4 gamma(float4 y, float4 s){
+  return (y * y * fma(y , fma(y , fma( - 50.0 , y, 126.0) ,- 70.0) ,- 30.0) / 24.0 + 1.0)*s;}
+inline float4 delta(float4 y, float4 s){
+  return (y * fma(y , fma(y, fma(y, fma(50.0, y, - 124.0), 66.0) , 16.0) , 16.0) / 24.0)*s;}
+inline float4 eta(float4 y, float4 s){
+  return (y * fma(y , fma(y , fma(y , fma(- 25.0 , y, 61.0), - 33.0), - 1.0), - 2.0) / 24.0)*s;}
+inline float4 zeta(float4 y, float4 s){
+  return (y * y * y * fma(y , fma(5.0 , y ,- 12.0) , 7.0) / 24.0)*s;}
diff --git a/HySoP/hysop/tools/printer.py b/HySoP/hysop/tools/printer.py
index 1d973844fb752c4d12ae09dbea26b7453f6d6429..d29e85438207437fd5b678d5b4ff4847d29d9059 100644
--- a/HySoP/hysop/tools/printer.py
+++ b/HySoP/hysop/tools/printer.py
@@ -93,13 +93,13 @@ class Printer():
                 evtk.imageToVTK(filename, pointData=self._build_vtk_dict())
                 print "not yet implemented"
             except IOError:
-            ## Standard output
+                ## Standard output
                 f = open(filename, 'w')
                 shape = self.fields[0].domain.discreteDomain.resolution
                 step = self.fields[0].domain.discreteDomain.step
                 if len(shape) == 2:
-                    for i in xrange(shape[0]):
-                        for j in xrange(shape[1]):
+                    for i in xrange(shape[0]-1):
+                        for j in xrange(shape[1]-1):
                             f.write("{0:8.12} {1:8.12} ".format(i * step[0], j * step[1]))
                             for field in self.fields:
                                 if field.vector:
@@ -109,9 +109,9 @@ class Printer():
                                     f.write("{0:8.12} ".format(field.discreteField[0][i, j]))
                             f.write("\n")
                 else:
-                    for i in xrange(shape[0]):
-                        for j in xrange(shape[1]):
-                            for k in xrange(shape[2]):
+                    for i in xrange(shape[0]-1):
+                        for j in xrange(shape[1]-1):
+                            for k in xrange(shape[2]-1):
                                 f.write("{0:8.12} {1:8.12} {2:8.12} ".format(i * step[0], j * step[1], k * step[2]))
                                 for field in self.fields:
                                     if field.vector: