diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl
index 008be61a924d04004e424cf66c0dadd509ac59c2..0ebb79ffd416783d76971cbf755e255c38463949 100644
--- a/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl
@@ -20,7 +20,7 @@ __kernel void advection(__global const float* gvelo,
   float invdx = 1.0/dx;
   uint i;
   float v, y, p;
-  for(i=gidX; i<WIDTH; i+=WGN)
+  for(i=gidX; i<WIDTH; i+=WGNADVEC)
     {
       v = gvelo[i+gidY*WIDTH];
       p = fma((float)(0.5*dt),v,i*dx);
diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl
index d99fe53429fe274cca78682038c06150eda981b9..6910c47ad24b958222e61004dfb91ae5c78393d0 100644
--- a/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl
@@ -23,7 +23,7 @@ __kernel void advection(__global const float* gvelo,
   float2 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+  for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2)
     {
       v = vload2((i+gidY*WIDTH)/2, gvelo);
       velocity_cache[i] = v.x;
@@ -32,7 +32,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+  for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2)
     {
       v = vload2(i/2, velocity_cache);
       coord = (float2)(i*dx, (i+1)*dx);
diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl
index c9d9637ad6728e8b5346532d706618100e647324..302cc4a0dee8bb152a438966d03f6e00dec22a3f 100644
--- a/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl
@@ -23,7 +23,7 @@ __kernel void advection(__global const float* gvelo,
   float4 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+  for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4)
     {
       v = vload4((i+gidY*WIDTH)/4, gvelo);
       velocity_cache[i] = v.x;
@@ -34,7 +34,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+  for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4)
     {
       v = vload4(i/4, velocity_cache);
       coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl
index cab7c1dd42d87d26fb21c864a80211d430d2a460..3d87393b5fc6b3bf329183c110276b327e3b742d 100644
--- a/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl
@@ -23,7 +23,7 @@ __kernel void advection(__global const float* gvelo,
   float8 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+  for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8)
     {
       v = vload8((i+gidY*WIDTH)/8, gvelo);
       velocity_cache[i] = v.s0;
@@ -38,7 +38,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+  for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8)
     {
       v = vload8(i/8, velocity_cache);
       coord = (float8)(i*dx,
diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl
index c54dd72679192fcca254c170d8669d3cbe80742f..1e0e6883501a2164d96053e59d8dcd80083aa33a 100644
--- a/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl
@@ -21,7 +21,7 @@ __kernel void advection(__global const float* gvelo,
   float invdx = 1.0/dx;
   uint i;
   float v, y, p;
-  for(i=gidX; i<WIDTH; i+=WGN)
+  for(i=gidX; i<WIDTH; i+=WGNADVEC)
     {
       v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH];
       p = fma((float)(0.5*dt),v,i*dx);
diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl
index 3891bb10dcdc5442cb14e4857f385eb4f4c7f4c8..777b9d3150184c9fab2e65217a57acbc46a971fe 100644
--- a/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl
@@ -24,7 +24,7 @@ __kernel void advection(__global const float* gvelo,
   float2 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+  for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2)
     {
       v = vload2((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/2, gvelo);
       velocity_cache[i] = v.x;
@@ -33,7 +33,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*2; i<WIDTH; i+=WGN*2)
+  for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2)
     {
       v = vload2(i/2, velocity_cache);
       coord = (float2)(i*dx, (i+1)*dx);
diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl
index 49ae8e1d4bb4ab796e0b659eaeadc7e8177bb0eb..54fff87d16772990c20e1d8fc48563b6362b50f2 100644
--- a/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl
@@ -24,7 +24,7 @@ __kernel void advection(__global const float* gvelo,
   float4 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+  for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4)
     {
       v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
       velocity_cache[i] = v.x;
@@ -35,7 +35,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*4; i<WIDTH; i+=WGN*4)
+  for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4)
     {
       v = vload4(i/4, velocity_cache);
       coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl
index db3fb09a765970c6a15b7c3f43a99fc3e92c46bb..f019290fb6378c63ae549771bde3f67ac8975407 100644
--- a/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl
@@ -24,7 +24,7 @@ __kernel void advection(__global const float* gvelo,
   float8 v, vp, y, p, coord;
   __local float velocity_cache[WIDTH];
 
-  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+  for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8)
     {
       v = vload8((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/8, gvelo);
       velocity_cache[i] = v.s0;
@@ -39,7 +39,7 @@ __kernel void advection(__global const float* gvelo,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*8; i<WIDTH; i+=WGN*8)
+  for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8)
     {
       v = vload8(i/8, velocity_cache);
       coord = (float8)(i*dx,
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl
index 2eaec614a26d971b549d417d48d85a2ce4f5dd80..b782866e914d2bab073e9bc8f1d17ec2e4c8d334 100644
--- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl
@@ -1,6 +1,6 @@
 
 inline uint noBC_id(int id, int nb_part){
-  return (id%nb_part)*WGN+(id/nb_part);
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
 }
 __kernel void advection_and_remeshing(__global const float* gvelo,
 				      __global const float* pscal,
@@ -11,14 +11,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
   uint gidY = get_global_id(1);
   float invdx = 1.0/dx;
   int4 ind, i_ind, i_ind_p;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
   float4 p, s, y, gs, v, vp, coord;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
   __local float gvelo_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
     {
       v = vload4((i+gidY*WIDTH)/4, gvelo);
       gvelo_loc[noBC_id(i,nb_part)] = v.x;
@@ -90,6 +90,6 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       barrier(CLK_LOCAL_MEM_FENCE);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
     vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
 }
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl
index 289b9d4a6a0c4fc9bd2f84c09e96d294385485ed..0f0d12414b885bbeb75afbdd55a400f69bbe0a8b 100644
--- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl
@@ -1,6 +1,6 @@
 
 inline uint noBC_id(int id, int nb_part){
-  return (id%nb_part)*WGN+(id/nb_part);
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
 }
 __kernel void advection_and_remeshing(__global const float* gvelo,
 				      __global const float* pscal,
@@ -11,17 +11,17 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
   uint gidY = get_global_id(1);
   float invdx = 1.0/dx;
   int8 ind, i_ind, i_ind_p;
-  uint i, nb_part = WIDTH/WGN;
-  float8 p, s, y, gs, v, vp, coord;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float8 p, s, y, v, vp, coord;
   uint8 index8;
 
   __local float gscal_loc[WIDTH];
   __local float gvelo_loc[WIDTH];
 
-  for(i=gidX*8; i<WIDTH; i+=(WGN*8))
+  for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8))
     {
       v = vload8((i+gidY*WIDTH)/8, gvelo);
-      gvelo_loc[noBC_id(i,nb_part)] = v.s0;
+      gvelo_loc[noBC_id(i+0,nb_part)] = v.s0;
       gvelo_loc[noBC_id(i+1,nb_part)] = v.s1;
       gvelo_loc[noBC_id(i+2,nb_part)] = v.s2;
       gvelo_loc[noBC_id(i+3,nb_part)] = v.s3;
@@ -29,14 +29,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       gvelo_loc[noBC_id(i+5,nb_part)] = v.s5;
       gvelo_loc[noBC_id(i+6,nb_part)] = v.s6;
       gvelo_loc[noBC_id(i+7,nb_part)] = v.s7;
-      gscal_loc[i] = 0.0;
-      gscal_loc[i+1] = 0.0;
-      gscal_loc[i+2] = 0.0;
-      gscal_loc[i+3] = 0.0;
-      gscal_loc[i+4] = 0.0;
-      gscal_loc[i+5] = 0.0;
-      gscal_loc[i+6] = 0.0;
-      gscal_loc[i+7] = 0.0;
+      gscal_loc[noBC_id(i+0,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+1,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+2,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+3,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+4,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+5,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+6,nb_part)] = 0.0;
+      gscal_loc[noBC_id(i+7,nb_part)] = 0.0;
     }
   barrier(CLK_LOCAL_MEM_FENCE);
   for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8)
@@ -145,7 +145,7 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       barrier(CLK_LOCAL_MEM_FENCE);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*8; i<WIDTH; i+=(WGN*8))
+  for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8))
     vstore8((float8)(gscal_loc[noBC_id(i,nb_part)],
                      gscal_loc[noBC_id(i+1,nb_part)],
                      gscal_loc[noBC_id(i+2,nb_part)],
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl
index 46e9bc59b2ea793d24cd36c429a41bb0e448ff8e..5a380089032e434c0779a1ea1496df8c52cc2582 100644
--- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl
@@ -9,14 +9,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
   uint gidZ = get_global_id(2);
   float invdx = 1.0/dx;
   int4 ind, i_ind, i_ind_p;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
   float4 p, s, y, gs, v, vp, coord;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
   __local float gvelo_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
     {
       v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
       gvelo_loc[i] = v.x;
@@ -96,6 +96,6 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       gscal_loc[index4.w] += gs.w;
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
     vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
 }
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl
index 5d221f67393d24f1f7614f422ef0396ad22d8c20..c66338a804599ac37d49ea03b0808ea2dc824168 100644
--- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl
@@ -1,6 +1,6 @@
 
 inline uint noBC_id(int id, int nb_part){
-  return (id%nb_part)*WGN+(id/nb_part);
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
 }
 __kernel void advection_and_remeshing(__global const float* gvelo,
 				      __global const float* pscal,
@@ -12,14 +12,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
   uint gidZ = get_global_id(2);
   float invdx = 1.0/dx;
   int4 ind, i_ind, i_ind_p;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
   float4 p, s, y, gs, v, vp, coord;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
   __local float gvelo_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
     {
       v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
       gvelo_loc[noBC_id(i, nb_part)] = v.x;
@@ -96,6 +96,6 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
     vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
 }
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt4_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt4_builtin_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..da486b5394795be6d42dae6cf35ab7daac782c58
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt4_builtin_noBC.cl
@@ -0,0 +1,83 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,  float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    {
+      v = vload4((i+gidY*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt8_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt8_builtin_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..371d03ef7a559c804076bda156e2648a806a2a82
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt8_builtin_noBC.cl
@@ -0,0 +1,137 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int8 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float8 p, s, y, gs, v, vp, coord;
+  uint8 index8;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8))
+    {
+      v = vload8((i+gidY*WIDTH)/8, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.s0;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.s1;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.s2;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.s3;
+      gvelo_loc[noBC_id(i+4,nb_part)] = v.s4;
+      gvelo_loc[noBC_id(i+5,nb_part)] = v.s5;
+      gvelo_loc[noBC_id(i+6,nb_part)] = v.s6;
+      gvelo_loc[noBC_id(i+7,nb_part)] = v.s7;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+      gscal_loc[i+4] = 0.0;
+      gscal_loc[i+5] = 0.0;
+      gscal_loc[i+6] = 0.0;
+      gscal_loc[i+7] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8)
+    {
+      v = (float8)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)],
+                   gvelo_loc[noBC_id(i+4,nb_part)],
+                   gvelo_loc[noBC_id(i+5,nb_part)],
+                   gvelo_loc[noBC_id(i+6,nb_part)],
+                   gvelo_loc[noBC_id(i+7,nb_part)]);
+      coord = (float8)(i*dx,
+		       (i+1)*dx,
+		       (i+2)*dx,
+		       (i+3)*dx,
+		       (i+4)*dx,
+		       (i+5)*dx,
+		       (i+6)*dx,
+		       (i+7)*dx);
+      p = fma((float8)(0.5*dt),v,coord);
+      ind = convert_int8_rtn(p * invdx);
+      y = (fma(- convert_float8(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s1,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s2,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s3,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s4,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s5,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s6,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s7,nb_part)]);
+      vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s1,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s2,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s3,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s4,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s5,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s6,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]);
+      p = fma(mix(v,vp,y),dt,coord);
+
+      s = vload8((i + gidY*WIDTH)/8, pscal);
+      ind = convert_int8_rtn(p * invdx);
+      y = (p - convert_float8(ind) * dx) * invdx;
+      index8 = convert_uint8((ind - 1 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8))
+    vstore8((float8)(gscal_loc[noBC_id(i,nb_part)],
+                     gscal_loc[noBC_id(i+1,nb_part)],
+                     gscal_loc[noBC_id(i+2,nb_part)],
+                     gscal_loc[noBC_id(i+3,nb_part)],
+                     gscal_loc[noBC_id(i+4,nb_part)],
+                     gscal_loc[noBC_id(i+5,nb_part)],
+                     gscal_loc[noBC_id(i+6,nb_part)],
+                     gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..10600e47a45b654f02e61a56e1ecfea040fcf16d
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4.cl
@@ -0,0 +1,86 @@
+
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[i] = v.x;
+      gvelo_loc[i+1] = v.y;
+      gvelo_loc[i+2] = v.z;
+      gvelo_loc[i+3] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = vload4(i/4, gvelo_loc);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y],
+                   gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]);
+      vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y],
+                    gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH);
+      barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (beta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..b6e5fd1c0c7afb4bb513f66b8541260edb5c8174
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4_noBC.cl
@@ -0,0 +1,115 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i, nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1, nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2, nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3, nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (gamma(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (zeta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (theta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (iota(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..0e506b6f454872d8ad13bfb0213fe1095ea0d44b
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl
@@ -0,0 +1,107 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,  float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    {
+      v = vload4((i+gidY*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..8c5efa160d98d70d55772668214fd288618f8a1b
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl
@@ -0,0 +1,177 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int8 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float8 p, s, y, gs, v, vp, coord;
+  uint8 index8;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8))
+    {
+      v = vload8((i+gidY*WIDTH)/8, gvelo);
+      gvelo_loc[noBC_id(i,nb_part)] = v.s0;
+      gvelo_loc[noBC_id(i+1,nb_part)] = v.s1;
+      gvelo_loc[noBC_id(i+2,nb_part)] = v.s2;
+      gvelo_loc[noBC_id(i+3,nb_part)] = v.s3;
+      gvelo_loc[noBC_id(i+4,nb_part)] = v.s4;
+      gvelo_loc[noBC_id(i+5,nb_part)] = v.s5;
+      gvelo_loc[noBC_id(i+6,nb_part)] = v.s6;
+      gvelo_loc[noBC_id(i+7,nb_part)] = v.s7;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+      gscal_loc[i+4] = 0.0;
+      gscal_loc[i+5] = 0.0;
+      gscal_loc[i+6] = 0.0;
+      gscal_loc[i+7] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8)
+    {
+      v = (float8)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)],
+                   gvelo_loc[noBC_id(i+4,nb_part)],
+                   gvelo_loc[noBC_id(i+5,nb_part)],
+                   gvelo_loc[noBC_id(i+6,nb_part)],
+                   gvelo_loc[noBC_id(i+7,nb_part)]);
+      coord = (float8)(i*dx,
+		       (i+1)*dx,
+		       (i+2)*dx,
+		       (i+3)*dx,
+		       (i+4)*dx,
+		       (i+5)*dx,
+		       (i+6)*dx,
+		       (i+7)*dx);
+      p = fma((float8)(0.5*dt),v,coord);
+      ind = convert_int8_rtn(p * invdx);
+      y = (fma(- convert_float8(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s1,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s2,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s3,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s4,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s5,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s6,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.s7,nb_part)]);
+      vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s1,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s2,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s3,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s4,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s5,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s6,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]);
+      p = fma(mix(v,vp,y),dt,coord);
+
+      s = vload8((i + gidY*WIDTH)/8, pscal);
+      ind = convert_int8_rtn(p * invdx);
+      y = (p - convert_float8(ind) * dx) * invdx;
+      index8 = convert_uint8((ind - 3 + WIDTH) % WIDTH);
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (eta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (eta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (eta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (eta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (eta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (eta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (eta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (eta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (zeta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (zeta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (zeta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (zeta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (zeta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (zeta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (zeta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (zeta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (theta(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (theta(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (theta(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (theta(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (theta(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (theta(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (theta(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (theta(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index8 = (index8 + 1) % WIDTH;
+      gscal_loc[noBC_id(index8.s0,nb_part)] += (iota(y.s0,s.s0));
+      gscal_loc[noBC_id(index8.s1,nb_part)] += (iota(y.s1,s.s1));
+      gscal_loc[noBC_id(index8.s2,nb_part)] += (iota(y.s2,s.s2));
+      gscal_loc[noBC_id(index8.s3,nb_part)] += (iota(y.s3,s.s3));
+      gscal_loc[noBC_id(index8.s4,nb_part)] += (iota(y.s4,s.s4));
+      gscal_loc[noBC_id(index8.s5,nb_part)] += (iota(y.s5,s.s5));
+      gscal_loc[noBC_id(index8.s6,nb_part)] += (iota(y.s6,s.s6));
+      gscal_loc[noBC_id(index8.s7,nb_part)] += (iota(y.s7,s.s7));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8))
+    vstore8((float8)(gscal_loc[noBC_id(i,nb_part)],
+                     gscal_loc[noBC_id(i+1,nb_part)],
+                     gscal_loc[noBC_id(i+2,nb_part)],
+                     gscal_loc[noBC_id(i+3,nb_part)],
+                     gscal_loc[noBC_id(i+4,nb_part)],
+                     gscal_loc[noBC_id(i+5,nb_part)],
+                     gscal_loc[noBC_id(i+6,nb_part)],
+                     gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..45731a83edb2e3b018ead2419b63379bac7aa900
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl
@@ -0,0 +1,115 @@
+
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[i] = v.x;
+      gvelo_loc[i+1] = v.y;
+      gvelo_loc[i+2] = v.z;
+      gvelo_loc[i+3] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = vload4(i/4, gvelo_loc);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y],
+                   gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]);
+      vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y],
+                    gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (beta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs = (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (theta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      index4 = (index4 + 1) % WIDTH;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      gs =  (iota(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..dd942553dd0f23743da88ca7da5c40347bdb210d
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl
@@ -0,0 +1,88 @@
+
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part);
+}
+__kernel void advection_and_remeshing(__global const float* gvelo,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind, i_ind, i_ind_p;
+  uint i, nb_part = WIDTH/WGNADVECANDREMESH;
+  float4 p, s, y, gs, v, vp, coord;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+  __local float gvelo_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    {
+      v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo);
+      gvelo_loc[noBC_id(i, nb_part)] = v.x;
+      gvelo_loc[noBC_id(i+1, nb_part)] = v.y;
+      gvelo_loc[noBC_id(i+2, nb_part)] = v.z;
+      gvelo_loc[noBC_id(i+3, nb_part)] = v.w;
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      v = (float4)(gvelo_loc[noBC_id(i,nb_part)],
+                   gvelo_loc[noBC_id(i+1,nb_part)],
+                   gvelo_loc[noBC_id(i+2,nb_part)],
+                   gvelo_loc[noBC_id(i+3,nb_part)]);
+      coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx);
+      p = fma((float4)(0.5*dt),v,coord);
+      ind = convert_int4_rtn(p * invdx);
+      y = (fma(- convert_float4(ind),dx,p))* invdx ;
+      i_ind = ((ind + WIDTH) % WIDTH);
+      i_ind_p = ((i_ind + 1) % WIDTH);
+      v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)],
+                   gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]);
+      vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)],
+                    gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]);
+      p=fma(mix(v,vp,y),dt,coord);
+
+      s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs = (gamma(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[noBC_id(index4.x, nb_part)] += gs.x;
+      gscal_loc[noBC_id(index4.y, nb_part)] += gs.y;
+      gscal_loc[noBC_id(index4.z, nb_part)] += gs.z;
+      gscal_loc[noBC_id(index4.w, nb_part)] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4))
+    vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl
index 895250723bdd1eaa04357817122fd5205eb10e1c..7347bb0c5ef9e4fa13ecda9272d6bf5e0e7699fc 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl
@@ -7,7 +7,7 @@
  * @return Index in a 2D space.
  */
 inline uint noBC_id(int id, int nb_part){
-  return (id%nb_part)*WGN+(id/nb_part);
+  return (id%nb_part)*WGNREMESH+(id/nb_part);
 }
 
 /**
@@ -27,13 +27,13 @@ __kernel void remeshing(__global const float* ppos,
   uint gidY = get_global_id(1);
   float invdx = 1.0/dx;
   int4 ind;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNREMESH;
   float4 p, s, y;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     {
       gscal_loc[i] = 0.0;
       gscal_loc[i+1] = 0.0;
@@ -85,6 +85,6 @@ __kernel void remeshing(__global const float* ppos,
       barrier(CLK_LOCAL_MEM_FENCE);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
 }
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl
index bd831d5abf3d791b984c67d3e4dee9d3b447f004..6e63f6408e790973f3a0e1155f64f02a4c5a349f 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl
@@ -7,13 +7,13 @@ __kernel void remeshing(__global const float* ppos,
   uint gidY = get_global_id(1);
   float invdx = 1.0/dx;
   int4 ind;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNREMESH;
   float4 p, s, y, gs;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     {
       gscal_loc[i] = 0.0;
       gscal_loc[i+1] = 0.0;
@@ -71,6 +71,6 @@ __kernel void remeshing(__global const float* ppos,
       barrier(CLK_LOCAL_MEM_FENCE);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH)/4, gscal);
 }
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl
index 014edd31671f10731a58ba826b876bb5bf612fa1..9e9447359fdd45e8d5e411b35276c372f7816fc6 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl
@@ -1,6 +1,6 @@
 
 inline uint noBC_id(int id, int nb_part){
-  return (id%nb_part)*WGN+(id/nb_part);
+  return (id%nb_part)*WGNREMESH+(id/nb_part);
 }
 __kernel void remeshing(__global const float* ppos,
 			__global const float* pscal,
@@ -10,13 +10,13 @@ __kernel void remeshing(__global const float* ppos,
   uint gidY = get_global_id(1);
   float invdx = 1.0/dx;
   int4 ind;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNREMESH;
   float4 p, s, y, gs;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     {
       gscal_loc[i] = 0.0;
       gscal_loc[i+1] = 0.0;
@@ -74,6 +74,6 @@ __kernel void remeshing(__global const float* ppos,
       barrier(CLK_LOCAL_MEM_FENCE);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH)/4, gscal);
 }
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl
index eb67791af74bdfb46e79c38c87fcc216d5fc6032..922a8fe28e19c354330364dc0e6b5fc05f6315cd 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl
@@ -8,13 +8,13 @@ __kernel void remeshing(__global const float* ppos,
   uint gidZ = get_global_id(2);
   float invdx = 1.0/dx;
   int4 ind;
-  uint i, nb_part = WIDTH/WGN;
+  uint i, nb_part = WIDTH/WGNREMESH;
   float4 p, s, y, gs;
   uint4 index4;
 
   __local float gscal_loc[WIDTH];
 
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     {
       gscal_loc[i] = 0.0;
       gscal_loc[i+1] = 0.0;
@@ -72,7 +72,7 @@ __kernel void remeshing(__global const float* ppos,
       barrier(CLK_LOCAL_MEM_FENCE);
     }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for(i=gidX*4; i<WIDTH; i+=(WGN*4))
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
     vstore4((float4)(gscal_loc[i],
                      gscal_loc[i+1],
                      gscal_loc[i+2],
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m4prime_2D_opt4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_2D_opt4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..ed9edaeb09bec2acf955d41e7fe2f9f0efe0367f
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_2D_opt4_noBC.cl
@@ -0,0 +1,78 @@
+/**
+ * Local memory as a 2D array to avoid bank conflicts.
+ *
+ * @param id Index 1D
+ * @param nb_part Particle number per work-item
+ *
+ * @return Index in a 2D space.
+ */
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNREMESH+(id/nb_part);
+}
+
+/**
+  * 2D Remeshing kernel.
+  *
+  * @param ppos Particle position
+  * @param pscal Particle scalar
+  * @param gscal Grid scalar
+  * @param min_position Domain lower point
+  * @param dx Space step
+  */
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGNREMESH;
+  float4 p, s, y;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH); // i-1
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+1
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+2
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m4prime_3D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_3D_opt4_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..c30200bb883bfcb34181236bda74d87a44ab524a
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_3D_opt4_private4.cl
@@ -0,0 +1,67 @@
+
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGNREMESH;
+  float4 p, s, y, gs;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[index4.x]+= gs.x;
+      gscal_loc[index4.y]+= gs.y;
+      gscal_loc[index4.z]+= gs.z;
+      gscal_loc[index4.w]+= gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    vstore4((float4)(gscal_loc[i],
+                     gscal_loc[i+1],
+                     gscal_loc[i+2],
+                     gscal_loc[i+3]),
+            (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m8prime_2D_opt4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_2D_opt4_noBC.cl
new file mode 100644
index 0000000000000000000000000000000000000000..65ef443d956024b80b433dbc6560c910828f27a6
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_2D_opt4_noBC.cl
@@ -0,0 +1,102 @@
+/**
+ * Local memory as a 2D array to avoid bank conflicts.
+ *
+ * @param id Index 1D
+ * @param nb_part Particle number per work-item
+ *
+ * @return Index in a 2D space.
+ */
+inline uint noBC_id(int id, int nb_part){
+  return (id%nb_part)*WGNREMESH+(id/nb_part);
+}
+
+/**
+  * 2D Remeshing kernel.
+  *
+  * @param ppos Particle position
+  * @param pscal Particle scalar
+  * @param gscal Grid scalar
+  * @param min_position Domain lower point
+  * @param dx Space step
+  */
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGNREMESH;
+  float4 p, s, y;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); // i-3
+      gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i-2
+      gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i-1
+      gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i
+      gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; //i+1
+      gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+2
+      gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+3
+      gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH; // i+4
+      gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x));
+      gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y));
+      gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z));
+      gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w));
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m8prime_3D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_3D_opt4_private4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..b983724dadbb652ce735579feacc61456f3bdf1a
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_3D_opt4_private4.cl
@@ -0,0 +1,95 @@
+
+__kernel void remeshing(__global const float* ppos,
+			__global const float* pscal,
+			__global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);
+  uint gidY = get_global_id(1);
+  uint gidZ = get_global_id(2);
+  float invdx = 1.0/dx;
+  int4 ind;
+  uint i, nb_part = WIDTH/WGNREMESH;
+  float4 p, s, y, gs;
+  uint4 index4;
+
+  __local float gscal_loc[WIDTH];
+
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    {
+      gscal_loc[i] = 0.0;
+      gscal_loc[i+1] = 0.0;
+      gscal_loc[i+2] = 0.0;
+      gscal_loc[i+3] = 0.0;
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4)
+    {
+      p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position);
+      s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal);
+      ind = convert_int4_rtn(p * invdx);
+      y = (p - convert_float4(ind) * dx) * invdx;
+      index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH);
+      gs =  (alpha(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (beta(y, s));
+      gscal_loc[index4.x]+= gs.x;
+      gscal_loc[index4.y]+= gs.y;
+      gscal_loc[index4.z]+= gs.z;
+      gscal_loc[index4.w]+= gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (gamma(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (delta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (eta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (zeta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (theta(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      index4 = (index4 + 1) % WIDTH;
+      gs =  (iota(y, s));
+      gscal_loc[index4.x] += gs.x;
+      gscal_loc[index4.y] += gs.y;
+      gscal_loc[index4.z] += gs.z;
+      gscal_loc[index4.w] += gs.w;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4))
+    vstore4((float4)(gscal_loc[i],
+                     gscal_loc[i+1],
+                     gscal_loc[i+2],
+                     gscal_loc[i+3]),
+            (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal);
+}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime.cl
new file mode 100644
index 0000000000000000000000000000000000000000..7bac233787cd3e32f336b5569a86140d1d72a11c
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime.cl
@@ -0,0 +1,16 @@
+/**
+ * M4prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ */
+inline float alpha(float y, float s){
+  return (y * (y * (-0.5 * y + 1.0) - 0.5)) * s;}
+inline float beta(float y, float s){
+  return (y * y * (1.5 * y - 2.5) + 1.0) * s;}
+inline float gamma(float y, float s){
+  return (y * (y * (-1.5 * y + 2.0) + 0.5)) * s;}
+inline float delta(float y, float s){
+  return (y * y * (0.5 * y - 0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..9d1e63caf52ccacecb6567c8cd8be7025998748b
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime_builtin.cl
@@ -0,0 +1,9 @@
+
+inline float alpha(float y, float s){
+    return ((y * fma(y * fma(y, -0.5, 1.0), - 0.5))) * s;}
+inline float beta(float y, float s){
+  return (y * y * fma(1.5, y, - 2.5), 1.0) * s;}
+inline float gamma(float y, float s){
+  return (y * fma(y * fma(-1.5, y, 2.0), 0.5)) * s;}
+inline float delta(float y, float s){
+  return (y * y * fma(0.5, y, - 0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..6ead89d937370b08319d14a6e17a942e2390cfe5
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4.cl
@@ -0,0 +1,9 @@
+
+inline float4 alpha(float4 y, float4 s){
+  return (y * (y * (-0.5 * y + 1.0) - 0.5)) * s;}
+inline float4 beta(float4 y, float4 s){
+  return (y * y * (1.5 * y - 2.5) + 1.0) * s;}
+inline float4 gamma(float4 y, float4 s){
+  return (y * (y * (-1.5 * y + 2.0) + 0.5)) * s;}
+inline float4 delta(float4 y, float4 s){
+  return (y * y * (0.5 * y - 0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..43d6a2806bfc21238d9f2e1aec1f809342905322
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4_builtin.cl
@@ -0,0 +1,9 @@
+
+inline float4 alpha(float4 y, float4 s){
+    return ((y * fma(y * fma(y, -0.5, 1.0), - 0.5))) * s;}
+inline float4 beta(float4 y, float4 s){
+  return (y * y * fma(1.5, y, - 2.5), 1.0) * s;}
+inline float4 gamma(float4 y, float4 s){
+  return (y * fma(y * fma(-1.5, y, 2.0), 0.5)) * s;}
+inline float4 delta(float4 y, float4 s){
+  return (y * y * fma(0.5, y, - 0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m8prime_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m8prime_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..94d393d333c91e7404f2c7e4ce8a58b2683a8189
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m8prime_builtin.cl
@@ -0,0 +1,33 @@
+/**
+ * M8prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ *
+ * Use OpenCL fma builtin function.
+ */
+inline float alpha(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;}
+
+inline float beta(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;}
+
+inline float gamma(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;}
+
+inline float delta(float y, float s){
+  return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;}
+
+inline float eta(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;}
+
+inline float zeta(float y, float s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;}
+
+inline float theta(float y, float s){
+  return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;}
+
+inline float iota(float y, float s){
+  return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m8prime_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m8prime_opt4_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..90f28e492a222e5cadf18f25227db59e3c8a51a4
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m8prime_opt4_builtin.cl
@@ -0,0 +1,33 @@
+/**
+ * M8prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ *
+ * Use OpenCL fma builtin function, computed by float4.
+ */
+inline float4 alpha(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;}
+
+inline float4 beta(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;}
+
+inline float4 gamma(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;}
+
+inline float4 delta(float4 y, float4 s){
+  return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;}
+
+inline float4 eta(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;}
+
+inline float4 zeta(float4 y, float4 s){
+  return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;}
+
+inline float4 theta(float4 y, float4 s){
+  return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;}
+
+inline float4 iota(float4 y, float4 s){
+  return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;}
diff --git a/HySoP/hysop/gpu/tools.py b/HySoP/hysop/gpu/tools.py
index d3a12e0f5fb8ab4a3894083dfec9d63f1bc7fbf4..5b5d8c115be72129b0829820337ba60b3ac3b270 100644
--- a/HySoP/hysop/gpu/tools.py
+++ b/HySoP/hysop/gpu/tools.py
@@ -100,7 +100,7 @@ def _check_double_precision_capability(device, prec):
                       'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
                       'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
                 try:
-                    if eval('(sdevice.double_fp_config &' +
+                    if eval('(device.double_fp_config &' +
                             ' cl.device_fp_config.' +
                             v + ') == cl.device_fp_config.' + v):
                         print v
diff --git a/HySoP/hysop/numerics/gpu_kernel.py b/HySoP/hysop/numerics/gpu_kernel.py
index 144e520ec5fefd2830c889317020c3a77d6cb7b5..16459e4f3363894cc524912ae620ce2e60b1ef53 100644
--- a/HySoP/hysop/numerics/gpu_kernel.py
+++ b/HySoP/hysop/numerics/gpu_kernel.py
@@ -38,9 +38,11 @@ class KernelListLauncher:
         @param args : kernel arguments.
         @return OpenCL Event
 
-        OpenCL global size and local sizes are not given in args. Class member are used.
+        OpenCL global size and local sizes are not given in
+        args. Class member are used.
         """
-        return KernelListLauncher.launch_sizes_in_args(self, d, self.global_size[d], self.local_size[d], *args)
+        return KernelListLauncher.launch_sizes_in_args(
+            self, d, self.global_size[d], self.local_size[d], *args)
 
     def launch_sizes_in_args(self, d, *args):
         """
@@ -65,14 +67,16 @@ class KernelListLauncher:
         if d is not None:
             return self.kernel[d].get_info(cl.kernel_info.FUNCTION_NAME)
         else:
-            return [k.get_info(cl.kernel_info.FUNCTION_NAME) for k in self.kernel]
+            return [k.get_info(cl.kernel_info.FUNCTION_NAME)
+                    for k in self.kernel]
 
 
 class KernelLauncher(KernelListLauncher):
     """
     OpenCL kernel launcher.
 
-    Manage launching of one OpenCL kernel as a KernelListLauncher with a list of one kernel.
+    Manage launching of one OpenCL kernel as a KernelListLauncher
+    with a list of one kernel.
     """
     def __init__(self, kernel, queue, gsize, lsize):
         """
@@ -109,7 +113,8 @@ class KernelLauncher(KernelListLauncher):
         @param args : kernel arguments.
         @return OpenCL Event
 
-        OpenCL global size and local sizes are not given in args. Class member are used.
+        OpenCL global size and local sizes are not given in args.
+        Class member are used.
         """
         return KernelListLauncher.launch(self, 0, *args)
 
diff --git a/HySoP/hysop/operator/advection.py b/HySoP/hysop/operator/advection.py
index 6304e9421a28f9dd87146b820c0aa9fbe9520794..c695657c73bc51aab90be11cf08c3a1c939701e7 100644
--- a/HySoP/hysop/operator/advection.py
+++ b/HySoP/hysop/operator/advection.py
@@ -42,16 +42,16 @@ class Advection(Operator):
         if self.discreteOperator is None:
             if self.method.find('gpu_2k') >= 0:
                 from parmepy.operator.gpu_particle_advection_2k \
-                    import GPUParticleAdvection
+                    import GPUParticleAdvection2k
                 self.discreteOperator = \
-                    GPUParticleAdvection(self, method=self.method,
-                                         **self.config)
+                    GPUParticleAdvection2k(self, method=self.method,
+                                           **self.config)
             elif self.method.find('gpu_1k') >= 0:
                 from parmepy.operator.gpu_particle_advection_1k \
-                    import GPUParticleAdvection
+                    import GPUParticleAdvection1k
                 self.discreteOperator = \
-                    GPUParticleAdvection(self, method=self.method,
-                                         **self.config)
+                    GPUParticleAdvection1k(self, method=self.method,
+                                           **self.config)
             elif self.method.find('scales') >= 0:
                 from parmepy.operator.scales_advection \
                     import ScalesAdvection
diff --git a/HySoP/hysop/operator/continuous.py b/HySoP/hysop/operator/continuous.py
index e59ccc948d99e4ecd58626ef0f234da7f314c7c0..67d814d8875db3a100f4dd73df7091dc977f5693 100644
--- a/HySoP/hysop/operator/continuous.py
+++ b/HySoP/hysop/operator/continuous.py
@@ -42,6 +42,13 @@ class Operator(object):
         for v in self.variables:
             v.discretize(self.resolutions[v])
 
+    def finalize(self):
+        """
+        Method called on simulation finalization.
+        Clean operator at time loop end.
+        """
+        self.discreteOperator.finalize()
+
     def apply(self, *args):
         """
         Apply operator.
diff --git a/HySoP/hysop/operator/discrete.py b/HySoP/hysop/operator/discrete.py
index 26e94fe1214938ba8736d64d924e5c682ae68277..81931cf1604a567068fa4edc5733877a2e0a02f7 100644
--- a/HySoP/hysop/operator/discrete.py
+++ b/HySoP/hysop/operator/discrete.py
@@ -48,6 +48,13 @@ class DiscreteOperator(object):
         raise NotImplementedError("Need to override method in " +
                                   "a subclass of DiscreteOperator")
 
+    def finalize(self):
+        """
+        Method called on simulation finalization.
+        Clean operator at time loop end.
+        """
+        pass
+
     def __str__(self):
         """ToString method"""
         s = "Applying on following fields : \n"
diff --git a/HySoP/hysop/operator/gpu_particle_advection.py b/HySoP/hysop/operator/gpu_particle_advection.py
new file mode 100644
index 0000000000000000000000000000000000000000..93076531e6d42313da50ca26a7eece11bdfba895
--- /dev/null
+++ b/HySoP/hysop/operator/gpu_particle_advection.py
@@ -0,0 +1,319 @@
+"""
+@package parmepy.operator.gpu_particle_advection_1k
+
+Discrete advection representation
+"""
+from abc import ABCMeta, abstractmethod
+from parmepy.constants import np
+from parmepy.operator.particle_advection import ParticleAdvection
+from parmepy.fields.continuous import Field
+from parmepy.fields.gpu_discrete import GPUVectorField
+from parmepy.fields.gpu_discrete import GPUScalarField
+from parmepy.gpu import PARMES_REAL_GPU, PARMES_DOUBLE_GPU, \
+    PARMES_INTEGER_GPU, GPU_SRC, cl
+from parmepy.gpu.tools import get_opencl_environment, \
+    _check_double_precision_capability
+from parmepy.numerics.gpu_kernel import KernelLauncher
+from parmepy.numerics.gpu_kernel import KernelListLauncher
+from parmepy.numerics.splitting import Splitting
+
+
+class GPUParticleAdvection(ParticleAdvection):
+    """
+    Particle advection operator representation on GPU.
+
+    """
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def __init__(self, advec,
+                 platform_id=0, device_id=0,
+                 device_type='gpu',
+                 method='', src=None, precision=PARMES_REAL_GPU):
+        """
+        Create a Advection operator.
+        Work on a given scalar at a given velocity to produce scalar
+        distribution at new positions.
+
+        @param advec : Transport operator
+        @param idVelocityD : Index of velocity discretisation to use.
+        @param idScalarD : Index of scalar discretisation to use.
+        @param method : the method to use.
+        """
+        ParticleAdvection.__init__(self, advec)
+        self.name = "advection_gpu"
+        self.method = method
+        self.user_gpu_src = src
+        self.gpu_precision = precision
+        self.num_method = None
+        self.resolution = self.scalar.topology.mesh.resolution
+        self.workItemNumber = None
+        self.platform, self.device, self.ctx, self.queue = \
+            get_opencl_environment(platform_id, device_id, device_type)
+
+    def _set_WorkItems(self, vector_width=1):
+        ## Optimal work item number
+        if len(self.resolution) == 3:
+            workItemNumber = 64 if min(self.resolution) >= 64 \
+                else min(self.resolution)
+        else:
+            workItemNumber = 256 if min(self.resolution) >= 256 \
+                else min(self.resolution)
+        ## Change work-item regarding vector_width
+        if workItemNumber * vector_width > self.resolution[0]:
+            if self.resolution[0] % vector_width > 0:
+                raise ValueError(
+                    "Resolution ({0}) must be a multiple of {1}".format(
+                        self.resolution[0], vector_width))
+            workItemNumber = self.resolution[0] // vector_width
+        if len(self.resolution) == 3:
+            gwi = (int(workItemNumber),
+                   int(self.resolution[1]), int(self.resolution[2]))
+            lwi = (int(workItemNumber), 1, 1)
+        else:
+            gwi = (int(workItemNumber),
+                   int(self.resolution[1]))
+            lwi = (int(workItemNumber), 1)
+        return workItemNumber, gwi, lwi
+
+    def setUp(self):
+        self.workItemNumber, self.gwi, self.lwi = self._set_WorkItems()
+        print "Work-items basic config : ",
+        print self.workItemNumber, self.gwi, self.lwi
+        dim = self.scalar.topology.dim
+        self.coord_min = np.ones(4, dtype=self.gpu_precision)
+        self.mesh_size = np.ones(4, dtype=self.gpu_precision)
+        self.coord_min[0:dim] = np.asarray(self.scalar.topology.mesh.origin,
+                                           dtype=self.gpu_precision)
+        self.mesh_size[0:dim] = np.asarray(
+            self.scalar.topology.mesh.space_step,
+            dtype=self.gpu_precision)
+        print "=== Kernel sources ==="
+        self.build_options = ""
+        if _check_double_precision_capability(self.device,
+                                              self.gpu_precision):
+            self.build_options += " -cl-fp32-correctly-rounded-divide-sqrt "
+        if self.gpu_precision is PARMES_REAL_GPU:
+            self.build_options += " -cl-single-precision-constant"
+        self.build_options += " -D WIDTH=" + str(self.resolution[0])
+        self.build_options += " -D WGN=" + str(self.workItemNumber)
+        self.gpu_src = self._collect_usr_cl_src(self.user_gpu_src)
+        self.gpu_src += self._collect_kernels_cl_src()
+        print "===\n"
+        print "=== Kernel sources compiling ==="
+        ## Build code
+        if self.gpu_precision is PARMES_REAL_GPU:
+            prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f'))
+        else:
+            prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double'))
+        ## OpenCL program
+        print self.build_options
+        self.prg = prg.build(self.build_options)
+        print "Compiler options : ",
+        print self.prg.get_build_info(
+            self.device, cl.program_build_info.OPTIONS)
+        print "Compiler status : ",
+        print self.prg.get_build_info(
+            self.device, cl.program_build_info.STATUS)
+        print "Compiler log : ",
+        print self.prg.get_build_info(self.device, cl.program_build_info.LOG)
+        print "===\n"
+        print "=== OpenCL Buffer allocations ==="
+        self._buffer_allocations()
+        print "===\n"
+        print "=== OpenCL Buffer initialisation ==="
+        self._buffer_initialisations()
+        print "===\n"
+        print "=== OpenCL Kernels affectation ==="
+        self._kernel_affectations()
+        print "===\n"
+        self.numMethod = Splitting(self._apply_in_dir, self.scalar.dimension)
+        print "Method used:", self.name
+
+    def _buffer_initialisations(self):
+        data, transfer_time, compute_time = 0, 0., 0.
+        for gpudf in self.variables:
+            match = 'init' + gpudf.name
+            initKernel = None
+            # Looking for initKernel
+            for k in self.prg.all_kernels():
+                k_name = k.get_info(cl.kernel_info.FUNCTION_NAME)
+                if match.find(k_name) >= 0:
+                    initKernel = cl.Kernel(self.prg, k_name)
+            if initKernel is not None:
+                kl = KernelLauncher(initKernel, self.queue,
+                                    self.gwi, self.lwi)
+                if gpudf.vector:
+                    args = [gpudf.gpu_data[d]
+                            for d in xrange(len(self.resolution))]
+                    args.append(self.coord_min)
+                    args.append(self.mesh_size)
+                    evt = kl.launch(*tuple(args))
+                else:
+                    args = [gpudf.gpu_data]
+                    args.append(self.coord_min)
+                    args.append(self.mesh_size)
+                    evt = kl.launch(*tuple(args))
+                self.queue.finish()
+                temp_time = (evt.profile.end - evt.profile.start) * 1e-9
+                print "Done in ", temp_time, "sec"
+                compute_time += temp_time
+                gpudf.contains_data = False
+                gpudf.data_on_device = True
+            else:
+                if gpudf.contains_data:
+                    print gpudf.name, ":",
+                    temp_data, temp_time = gpudf.toDevice()
+                    data += temp_data
+                    transfer_time += temp_time
+        if data > 0:
+            print "Total Transfers : ", data, "Bytes transfered at ",
+            print "{0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time)
+        if compute_time > 0.:
+            print "Total Computing  : ", compute_time, "sec"
+
+    def _collect_kernels_cl_src_copy(self):
+        gpu_src = ""
+        ## copy
+        if len(self.resolution) == 3:
+            if self.gpu_precision is PARMES_REAL_GPU:
+                src_copy = 'copy_3D_opt4.cl'
+                self.build_options += " -D TILE_DIM_COPY=16"
+                self.build_options += " -D BLOCK_ROWS_COPY=8"
+                self.copy_gwi = (int(self.resolution[0] / 4),
+                                 int(self.resolution[1] / 2),
+                                 int(self.resolution[2]))
+                self.copy_lwi = (4, 8, 1)
+            else:
+                src_copy = 'copy_3D_locMem.cl'
+                self.build_options += " -D TILE_DIM_COPY=32"
+                self.build_options += " -D BLOCK_ROWS_COPY=8"
+                self.copy_gwi = (int(self.resolution[0]),
+                                 int(self.resolution[1] / 4),
+                                 int(self.resolution[2]))
+                self.copy_lwi = (32, 8, 1)
+        else:
+            if self.gpu_precision is PARMES_REAL_GPU:
+                src_copy = 'copy_2D_opt2.cl'
+                self.build_options += " -D TILE_DIM_COPY=16"
+                self.build_options += " -D BLOCK_ROWS_COPY=8"
+                self.copy_gwi = (int(self.resolution[0] / 2),
+                                 int(self.resolution[1] / 2))
+                self.copy_lwi = (8, 8)
+            else:
+                src_copy = 'copy_2D_opt2.cl'
+                self.build_options += " -D TILE_DIM_COPY=32"
+                self.build_options += " -D BLOCK_ROWS_COPY=2"
+                self.copy_gwi = (int(self.resolution[0] / 2),
+                                 int(self.resolution[1] / 16))
+                self.copy_lwi = (16, 2)
+        f = open(GPU_SRC + src_copy)
+        print  "   - copy:", f.name
+        gpu_src += "".join(f.readlines())
+        f.close()
+        print "       - config :", self.copy_gwi, self.copy_lwi
+        return gpu_src
+
+    def _collect_kernels_cl_src_transpositions(self):
+        gpu_src = ""
+        ## transpose
+        if len(self.resolution) == 3:
+            if self.gpu_precision is PARMES_REAL_GPU:
+                src_transpose_xy = \
+                    'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl'
+                self.build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8"
+                self.transpose_xy_gwi = (int(self.resolution[0] / 2),
+                                         int(self.resolution[1] / 2),
+                                         int(self.resolution[2]))
+                self.transpose_xy_lwi = (8, 8, 1)
+                src_transpose_xz = \
+                    'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl'
+                self.build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
+                self.transpose_xz_gwi = (int(self.resolution[0]),
+                                         int(self.resolution[1] / 4),
+                                         int(self.resolution[2] / 4))
+                self.transpose_xz_lwi = (16, 4, 4)
+            else:
+                src_transpose_xy = \
+                    'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl'
+                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4"
+                self.transpose_xy_gwi = (int(self.resolution[0] / 4),
+                                         int(self.resolution[1] / 8),
+                                         int(self.resolution[2]))
+                self.transpose_xy_lwi = (8, 4, 1)
+                src_transpose_xz = \
+                    'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl'
+                self.build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2"
+                self.transpose_xz_gwi = (int(self.resolution[0]),
+                                         int(self.resolution[1] / 4),
+                                         int(self.resolution[2] / 4))
+                self.transpose_xz_lwi = (8, 2, 2)
+        else:
+            if self.gpu_precision is PARMES_REAL_GPU:
+                src_transpose_xy = \
+                    'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
+                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
+                self.transpose_xy_gwi = (int(self.resolution[0] / 4),
+                                         int(self.resolution[1]) / 4)
+                self.transpose_xy_lwi = (8, 8)
+            else:
+                src_transpose_xy = \
+                    'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
+                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
+                self.transpose_xy_gwi = (int(self.resolution[0] / 4),
+                                         int(self.resolution[1]) / 16)
+                self.transpose_xy_lwi = (8, 2)
+        f = open(GPU_SRC + src_transpose_xy)
+        print  "   - transpose_xy:", f.name
+        gpu_src += "".join(f.readlines())
+        f.close()
+        print "       - config :", self.transpose_xy_gwi, self.transpose_xy_lwi
+        if len(self.resolution) == 3:
+            print  "   - transpose_xz:", f.name
+            f = open(GPU_SRC + src_transpose_xz)
+            gpu_src += "".join(f.readlines())
+            f.close()
+            print "       - config :",
+            print self.transpose_xz_gwi, self.transpose_xz_lwi
+        return gpu_src
+
+    def _collect_usr_cl_src(self, usr_src):
+        print "usr src", usr_src
+        gpu_src = ""
+        if usr_src is not None:
+            if isinstance(usr_src, list):
+                for src in usr_src:
+                    print "Sources files (user): ", src
+                    f = open(src, 'r')
+                    gpu_src += "".join(f.readlines())
+                    f.close()
+            else:
+                print "Sources files (user): ", usr_src
+                f = open(usr_src, 'r')
+                gpu_src += "".join(f.readlines())
+                f.close()
+        return gpu_src
+
+    def apply(self, t, dt, ite):
+        self.numMethod(t, dt)
+
+    def finalize(self):
+        """
+        Free OpenCL memory.
+        """
+        for gpudf in self.variables:
+            print "deallocate :", gpudf.name
+            if gpudf.vector:
+                [gpudf.gpu_data[d].release()
+                 for d in xrange(len(self.resolution))]
+            else:
+                gpudf.gpu_data.release()
+
+    def __str__(self):
+        s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self)
+        return s
+
+if __name__ == "__main__":
+    print __doc__
+    print "- Provided class : Transport_d"
+    print AdvectionDOp.__doc__
diff --git a/HySoP/hysop/operator/gpu_particle_advection_1k.py b/HySoP/hysop/operator/gpu_particle_advection_1k.py
index 3cc58a390ccd67e825447516971b863625e1d9d6..8a24071199d464c50fd43e1625f2dc595b630672 100644
--- a/HySoP/hysop/operator/gpu_particle_advection_1k.py
+++ b/HySoP/hysop/operator/gpu_particle_advection_1k.py
@@ -4,7 +4,7 @@
 Discrete advection representation
 """
 from parmepy.constants import np
-from parmepy.operator.particle_advection import ParticleAdvection
+from parmepy.operator.gpu_particle_advection import GPUParticleAdvection
 from parmepy.fields.continuous import Field
 from parmepy.fields.gpu_discrete import GPUVectorField
 from parmepy.fields.gpu_discrete import GPUScalarField
@@ -17,11 +17,10 @@ from parmepy.numerics.gpu_kernel import KernelListLauncher
 from parmepy.numerics.splitting import Splitting
 
 
-class GPUParticleAdvection(ParticleAdvection):
+class GPUParticleAdvection1k(GPUParticleAdvection):
     """
     Particle advection operator representation on GPU.
 
-    Advection and Remeshing are performed in a single kernel.
     """
 
     def __init__(self, advec,
@@ -38,83 +37,21 @@ class GPUParticleAdvection(ParticleAdvection):
         @param idScalarD : Index of scalar discretisation to use.
         @param method : the method to use.
         """
-        ParticleAdvection.__init__(self, advec)
-        self.name = "advection_gpu"
-        self.method = method
-        self.user_gpu_src = src
-        self.gpu_precision = precision
-        self.num_method = None
-        self.resolution = self.scalar.topology.mesh.resolution
-        self.compute_time_remesh = [0., 0., 0.]
-        self.platform, self.device, self.ctx, self.queue = \
-            get_opencl_environment(platform_id, device_id, device_type)
+        GPUParticleAdvection.__init__(self, advec,
+                                      platform_id, device_id,
+                                      device_type,
+                                      method, src,
+                                      precision)
 
     def setUp(self):
-        ## Optimal work item number
-        if len(self.resolution) == 3:
-            self.workItemNumber = 64 if min(self.resolution) >= 64 \
-                else min(self.resolution)
-        else:
-            self.workItemNumber = 256 if min(self.resolution) >= 256 \
-                else min(self.resolution)
-        print " Work-Item number: ", self.workItemNumber
-        dim = self.scalar.topology.dim
-        self.coord_min = np.ones(4, dtype=self.gpu_precision)
-        self.mesh_size = np.ones(4, dtype=self.gpu_precision)
-        self.coord_min[0:dim] = np.asarray(self.scalar.topology.mesh.origin,
-                                           dtype=self.gpu_precision)
-        self.mesh_size[0:dim] = np.asarray(
-            self.scalar.topology.mesh.space_step,
-            dtype=self.gpu_precision)
-        if len(self.resolution) == 3:
-            self.gwi = (int(self.workItemNumber),
-                        int(self.resolution[1]), (self.resolution[2]))
-            self.lwi = (int(self.workItemNumber), 1, 1)
-        else:
-            self.gwi = (int(self.workItemNumber),
-                        int(self.resolution[1]))
-            self.lwi = (int(self.workItemNumber), 1)
-        print "=== Kernel sources ==="
-        self.build_options = ""
-        if _check_double_precision_capability(self.device,
-                                              self.gpu_precision):
-            self.build_options += " -cl-fp32-correctly-rounded-divide-sqrt "
-        if self.gpu_precision is PARMES_REAL_GPU:
-            self.build_options += " -cl-single-precision-constant"
-        self.build_options += " -D WIDTH=" + str(self.resolution[0])
-        self.build_options += " -D WGN=" + str(self.workItemNumber)
-        self.gpu_src = self._collect_usr_cl_src(self.user_gpu_src)
-        self.gpu_src += self._collect_kernels_cl_src()
-        print "===\n"
-        print "=== Kernel sources compiling ==="
-        ## Build code
-        if self.gpu_precision is PARMES_REAL_GPU:
-            prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f'))
-        else:
-            prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double'))
-        ## OpenCL program
-        self.prg = prg.build(self.build_options)
-        print "Compiler options : ",
-        print self.prg.get_build_info(
-            self.device, cl.program_build_info.OPTIONS)
-        print "Compiler status : ",
-        print self.prg.get_build_info(
-            self.device, cl.program_build_info.STATUS)
-        print "Compiler log : ",
-        print self.prg.get_build_info(self.device, cl.program_build_info.LOG)
-        print "===\n"
-        print "=== OpenCL Buffer allocations ==="
-        self._buffer_allocations()
-        print "===\n"
-        print "=== OpenCL Buffer initialisation ==="
-        self._buffer_initialisations()
-        print "===\n"
-        print "=== OpenCL Kernels affectation ==="
+        GPUParticleAdvection.setUp(self)
+
+    def _kernel_affectations(self):
         self.num_advec_and_remesh = KernelLauncher(
             self.prg.advection_and_remeshing,
             self.queue,
-            self.gwi,
-            self.lwi)
+            self.gwi_advec_and_remesh,
+            self.lwi_advec_and_remesh)
         self.init_copy = KernelLauncher(self.prg.copy,
                                         self.queue,
                                         self.copy_gwi,
@@ -134,51 +71,6 @@ class GPUParticleAdvection(ParticleAdvection):
                                self.queue,
                                self.transpose_xy_gwi,
                                self.transpose_xy_lwi)
-        print "===\n"
-        self.numMethod = Splitting(self._apply_in_dir, self.scalar.dimension)
-        print "Method used:", self.name
-
-    def _buffer_initialisations(self):
-        data, transfer_time, compute_time = 0, 0., 0.
-        for gpudf in self.variables:
-            match = 'init' + gpudf.name
-            initKernel = None
-            # Looking for initKernel
-            for k in self.prg.all_kernels():
-                k_name = k.get_info(cl.kernel_info.FUNCTION_NAME)
-                if match.find(k_name) >= 0:
-                    initKernel = cl.Kernel(self.prg, k_name)
-            if initKernel is not None:
-                kl = KernelLauncher(initKernel, self.queue,
-                                    self.gwi, self.lwi)
-                if gpudf.vector:
-                    args = [gpudf.gpu_data[d]
-                            for d in xrange(len(self.resolution))]
-                    args.append(self.coord_min)
-                    args.append(self.mesh_size)
-                    evt = kl.launch(*tuple(args))
-                else:
-                    args = [gpudf.gpu_data]
-                    args.append(self.coord_min)
-                    args.append(self.mesh_size)
-                    evt = kl.launch(*tuple(args))
-                self.queue.finish()
-                temp_time = (evt.profile.end - evt.profile.start) * 1e-9
-                print "Done in ", temp_time, "sec"
-                compute_time += temp_time
-                gpudf.contains_data = False
-                gpudf.data_on_device = True
-            else:
-                if gpudf.contains_data:
-                    print gpudf.name, ":",
-                    temp_data, temp_time = gpudf.toDevice()
-                    data += temp_data
-                    transfer_time += temp_time
-        if data > 0:
-            print "Total Transfers : ", data, "Bytes transfered at ",
-            print "{0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time)
-        if compute_time > 0.:
-            print "Total Computing  : ", compute_time, "sec"
 
     def _buffer_allocations(self):
         ## Velocity.
@@ -196,7 +88,8 @@ class GPUParticleAdvection(ParticleAdvection):
             self.queue, particle_scalar.discreteField[0],
             self.gpu_precision)
         self.part_scalar = particle_scalar.discreteField[0]
-        self.variables = [self.scalar, self.velocity, self.part_scalar]
+        self.variables = [self.scalar, self.velocity,
+                          self.part_scalar]
         total_mem_used = 0
         for gpuv in self.variables:
             total_mem_used += gpuv.mem_size
@@ -207,128 +100,91 @@ class GPUParticleAdvection(ParticleAdvection):
 
     def _collect_kernels_cl_src(self):
         gpu_src = ""
+        ## advection and remeshing
+        advec_and_remesh_vector_width = 4
+        src_advec_and_remesh = 'advection_and_remesh_'
         if len(self.resolution) == 3:
             if self.gpu_precision is PARMES_REAL_GPU:
-                if self.method.find('m6prime'):
+                if self.method.find('m4prime') >= 0:
+                    src_advec_and_remesh += \
+                        'm4prime_3D_opt4_builtin_private4.cl'
+                    src_remesh_weights = 'weights_m4prime_opt4_builtin.cl'
+                elif self.method.find('m6prime') >= 0:
+                    src_advec_and_remesh += '3D_opt4_builtin_private4.cl'
                     src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl'
-                else:
+                elif self.method.find('m8prime') >= 0:
+                    src_advec_and_remesh += \
+                        'm8prime_3D_opt4_builtin_private4.cl'
                     src_remesh_weights = 'weights_m8prime_opt4_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl'
+                else:
+                    print 'No remeshing fromula specified, use default m6prime'
+                    src_advec_and_remesh += '3D_opt4_builtin_private4.cl'
+                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
             else:
-                if self.method.find('m6prime'):
+                if self.method.find('m4prime') >= 0:
+                    src_advec_and_remesh += \
+                        'm4prime_3D_opt4_builtin_private4_noBC.cl'
+                    src_remesh_weights = 'weights_m4prime_opt4_builtin.cl'
+                elif self.method.find('m6prime') >= 0:
+                    src_advec_and_remesh += '3D_opt4_builtin_private4_noBC.cl'
                     src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl'
-                else:
+                elif self.method.find('m8prime') >= 0:
+                    src_advec_and_remesh += \
+                        'm8prime_3D_opt4_builtin_private4_noBC.cl'
                     src_remesh_weights = 'weights_m8prime_opt4_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl'
+                else:
+                    print 'No remeshing fromula specified, use default m6prime'
+                    src_advec_and_remesh += '3D_opt4_builtin_private4_noBC.cl'
+                    src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
         else:
             if self.gpu_precision is PARMES_REAL_GPU:
-                if self.method.find('m6prime'):
+                advec_and_remesh_vector_width = 8
+                if self.method.find('m4prime') >= 0:
+                    src_advec_and_remesh += 'm4prime_2D_opt8_builtin_noBC.cl'
+                    src_remesh_weights = 'weights_m4prime_builtin.cl'
+                elif self.method.find('m6prime') >= 0:
+                    src_advec_and_remesh += '2D_opt8_builtin_noBC.cl'
                     src_remesh_weights = 'weights_m6prime_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl'
-                else:
+                elif self.method.find('m8prime') >= 0:
+                    src_advec_and_remesh += 'm8prime_2D_opt8_builtin_noBC.cl'
                     src_remesh_weights = 'weights_m8prime_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl'
+                else:
+                    print 'No remeshing fromula specified, use default m6prime'
+                    src_advec_and_remesh += '2D_opt8_builtin_noBC.cl'
+                    src_remesh_weights = 'weights_m6prime_builtin.cl'
             else:
-                if self.method.find('m6prime'):
+                if self.method.find('m4prime') >= 0:
+                    src_advec_and_remesh += 'm4prime_2D_opt4_builtin_noBC.cl'
+                    src_remesh_weights = 'weights_m4prime_builtin.cl'
+                elif self.method.find('m6prime') >= 0:
+                    src_advec_and_remesh += '2D_opt4_builtin_noBC.cl'
                     src_remesh_weights = 'weights_m6prime_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl'
-                else:
+                elif self.method.find('m8prime') >= 0:
+                    src_advec_and_remesh += 'm8prime_2D_opt4_builtin_noBC.cl'
                     src_remesh_weights = 'weights_m8prime_builtin.cl'
-                    src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl'
+                else:
+                    print 'No remeshing fromula specified, use default m6prime'
+                    src_advec_and_remesh += '2D_opt4_builtin_noBC.cl'
+                    src_remesh_weights = 'weights_m6prime_builtin.cl'
+        self.WINb_advec_and_remesh, self.gwi_advec_and_remesh, \
+            self.lwi_advec_and_remesh = self._set_WorkItems(
+                advec_and_remesh_vector_width)
+        self.build_options += " -D WGNADVECANDREMESH=" +
+        self.build_options += str(self.WINb_advec_and_remesh)
         f = open(GPU_SRC + src_remesh_weights)
-        print  "   - remeshing weights:", f.name
         gpu_src += "".join(f.readlines())
+        print  "   - remeshing weights:", f.name
         f.close()
         f = open(GPU_SRC + src_advec_and_remesh)
-        print  "   - advection and remesh:", f.name
+        print  "   - advection and remeshing:", f.name
         gpu_src += "".join(f.readlines())
         f.close()
-        ## copy
-        if len(self.resolution) == 3:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_copy = 'copy_3D_opt4.cl'
-                self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8"
-                self.copy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 2), int(self.resolution[2]))
-                self.copy_lwi = (4, 8, 1)
-            else:
-                src_copy = 'copy_3D_locMem.cl'
-                self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=8"
-                self.copy_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2]))
-                self.copy_lwi = (32, 8, 1)
-        else:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_copy = 'copy_2D_opt2.cl'
-                self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8"
-                self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2))
-                self.copy_lwi = (8, 8)
-            else:
-                src_copy = 'copy_2D_opt2.cl'
-                self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2"
-                self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 16))
-                self.copy_lwi = (16, 2)
-        f = open(GPU_SRC + src_copy)
-        print  "   - copy:", f.name
-        gpu_src += "".join(f.readlines())
-        f.close()
-        ## transpose
-        if len(self.resolution) == 3:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl'
-                self.build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2), int(self.resolution[2]))
-                self.transpose_xy_lwi = (8, 8, 1)
-                src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl'
-                self.build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
-                self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4))
-                self.transpose_xz_lwi = (16, 4, 4)
-            else:
-                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 8), int(self.resolution[2]))
-                self.transpose_xy_lwi = (8, 4, 1)
-                src_transpose_xz = 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl'
-                self.build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2"
-                self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4))
-                self.transpose_xz_lwi = (8, 2, 2)
-        else:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 4)
-                self.transpose_xy_lwi = (8, 8)
-            else:
-                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 16)
-                self.transpose_xy_lwi = (8, 2)
-        f = open(GPU_SRC + src_transpose_xy)
-        print  "   - transpose_xy:", f.name
-        gpu_src += "".join(f.readlines())
-        f.close()
-        if len(self.resolution) == 3:
-            f = open(GPU_SRC + src_transpose_xz)
-            gpu_src += "".join(f.readlines())
-            f.close()
+        print "       - config :",
+        print self.gwi_advec_and_remesh, self.lwi_advec_and_remesh
+        gpu_src += self._collect_kernels_cl_src_copy()
+        gpu_src += self._collect_kernels_cl_src_transpositions()
         return gpu_src
 
-    def _collect_usr_cl_src(self, usr_src):
-        print "usr src", usr_src
-        gpu_src = ""
-        if usr_src is not None:
-            if isinstance(usr_src, list):
-                for src in usr_src:
-                    print "Sources files (user): ", src
-                    f = open(src, 'r')
-                    gpu_src += "".join(f.readlines())
-                    f.close()
-            else:
-                print "Sources files (user): ", usr_src
-                f = open(usr_src, 'r')
-                gpu_src += "".join(f.readlines())
-                f.close()
-        return gpu_src
 
     def _apply_in_dir(self, t, dt, d, split_id):
         """
@@ -364,16 +220,15 @@ class GPUParticleAdvection(ParticleAdvection):
                                                    self.part_scalar.gpu_data)
                 else:
                     evt_init = \
-                        self.init_transpose.launch(1, self.gpu_scalar.gpu_data,
+                        self.init_transpose.launch(1, self.scalar.gpu_data,
                                                    self.part_scalar.gpu_data)
-        # Advection and remeshing
+        # Advection
         evt_advec = self.num_advec_and_remesh.launch(self.velocity.gpu_data[d],
                                                      self.part_scalar.gpu_data,
                                                      self.scalar.gpu_data,
                                                      self.gpu_precision(dt),
                                                      self.coord_min[d],
                                                      self.mesh_size[d])
-        self.num_advec_and_remesh.finish()
         for df in self.output:
             df.data_on_device = True
             df.contains_data = False
@@ -389,8 +244,6 @@ class GPUParticleAdvection(ParticleAdvection):
         self.total_time += (c_time_advec + c_time_init)
         return (c_time_advec + c_time_init)
 
-    def apply(self, t, dt, ite):
-        self.numMethod(t, dt)
 
     def printComputeTime(self):
         self.time_info[0] = "\"Advection total\" "
diff --git a/HySoP/hysop/operator/gpu_particle_advection_2k.py b/HySoP/hysop/operator/gpu_particle_advection_2k.py
index f94f201a5eb80c1039234e804c19e3f4769ddb51..598eebf5eceb4685c519f4cab0751e499a6e33d3 100644
--- a/HySoP/hysop/operator/gpu_particle_advection_2k.py
+++ b/HySoP/hysop/operator/gpu_particle_advection_2k.py
@@ -4,7 +4,7 @@
 Discrete advection representation
 """
 from parmepy.constants import np
-from parmepy.operator.particle_advection import ParticleAdvection
+from parmepy.operator.gpu_particle_advection import GPUParticleAdvection
 from parmepy.fields.continuous import Field
 from parmepy.fields.gpu_discrete import GPUVectorField
 from parmepy.fields.gpu_discrete import GPUScalarField
@@ -17,7 +17,7 @@ from parmepy.numerics.gpu_kernel import KernelListLauncher
 from parmepy.numerics.splitting import Splitting
 
 
-class GPUParticleAdvection(ParticleAdvection):
+class GPUParticleAdvection2k(GPUParticleAdvection):
     """
     Particle advection operator representation on GPU.
 
@@ -37,125 +37,25 @@ class GPUParticleAdvection(ParticleAdvection):
         @param idScalarD : Index of scalar discretisation to use.
         @param method : the method to use.
         """
-        ParticleAdvection.__init__(self, advec)
-        self.name = "advection_gpu"
-        self.method = method
-        self.user_gpu_src = src
-        self.gpu_precision = precision
-        self.num_method = None
-        self.resolution = self.scalar.topology.mesh.resolution
+        GPUParticleAdvection.__init__(self, advec,
+                                      platform_id, device_id,
+                                      device_type,
+                                      method, src,
+                                      precision)
         self.compute_time_remesh = [0., 0., 0.]
-        print "=== OpenCL environment ==="
-        #Get platform.
-        try:
-            ## OpenCL platform
-            self.platform = cl.get_platforms()[platform_id]
-        except IndexError:
-            print "  Incorrect platform_id :", platform_id, ".",
-            print " Only ", len(cl.get_platforms()), " available.",
-            print " Getting defalut platform. "
-            self.platform = cl.get_platforms()[0]
-        print "  Platform   "
-        print "  - Name       :", self.platform.name
-        print "  - Version    :", self.platform.version
-        #Get device.
-        try:
-            ## OpenCL device
-            self.device = self.platform.get_devices(
-                eval("cl.device_type." + device_type.upper()))[device_id]
-        except cl.RuntimeError as e:
-            print "RuntimeError:", e
-            self.device = cl.create_some_context().devices[0]
-        except AttributeError as e:
-            print "AttributeError:", e
-            self.device = cl.create_some_context().devices[0]
-        print "  Device"
-        print "  - Name                :",
-        print self.device.name
-        print "  - Type                :",
-        print cl.device_type.to_string(self.device.type)
-        print "  - C Version           :",
-        print self.device.opencl_c_version
-        print "  - Global mem size     :",
-        print self.device.global_mem_size / (1024 ** 3), "GB"
-        print "===\n"
-        #Creates GPU Context
-        ## OpenCL context
-        self.ctx = cl.Context([self.device])
-        #Create CommandQueue on the GPU Context
-        ## OpenCL command queue
-        self.queue = cl.CommandQueue(
-            self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     def setUp(self):
-        ## Optimal work item number
-        if len(self.resolution) == 3:
-            self.workItemNumber = 64 if min(self.resolution) >= 64 \
-                else min(self.resolution)
-        else:
-            self.workItemNumber = 256 if min(self.resolution) >= 256 \
-                else min(self.resolution)
-        print " Work-Item number: ", self.workItemNumber
-        dim = self.scalar.topology.dim
-        self.coord_min = np.ones(4, dtype=self.gpu_precision)
-        self.mesh_size = np.ones(4, dtype=self.gpu_precision)
-        self.coord_min[0:dim] = np.asarray(self.scalar.topology.mesh.origin,
-                                           dtype=self.gpu_precision)
-        self.mesh_size[0:dim] = np.asarray(
-            self.scalar.topology.mesh.space_step,
-            dtype=self.gpu_precision)
-        if len(self.resolution) == 3:
-            self.gwi = (int(self.workItemNumber),
-                        int(self.resolution[1]), (self.resolution[2]))
-            self.lwi = (int(self.workItemNumber), 1, 1)
-        else:
-            self.gwi = (int(self.workItemNumber),
-                        int(self.resolution[1]))
-            self.lwi = (int(self.workItemNumber), 1)
-        print "=== Kernel sources ==="
-        self.build_options = ""
-        if _check_double_precision_capability(self.device,
-                                              self.gpu_precision):
-            self.build_options += " -cl-fp32-correctly-rounded-divide-sqrt "
-        if self.gpu_precision is PARMES_REAL_GPU:
-            self.build_options += " -cl-single-precision-constant"
-        self.build_options += " -D WIDTH=" + str(self.resolution[0])
-        self.build_options += " -D WGN=" + str(self.workItemNumber)
-        self.gpu_src = self._collect_usr_cl_src(self.user_gpu_src)
-        self.gpu_src += self._collect_kernels_cl_src()
-        print "===\n"
-        print "=== Kernel sources compiling ==="
-        ## Build code
-        if self.gpu_precision is PARMES_REAL_GPU:
-            prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f'))
-        else:
-            prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double'))
-        ## OpenCL program
-        self.prg = prg.build(self.build_options)
-        print "Compiler options : ",
-        print self.prg.get_build_info(
-            self.device, cl.program_build_info.OPTIONS)
-        print "Compiler status : ",
-        print self.prg.get_build_info(
-            self.device, cl.program_build_info.STATUS)
-        print "Compiler log : ",
-        print self.prg.get_build_info(self.device, cl.program_build_info.LOG)
-        print "===\n"
-        print "=== OpenCL Buffer allocations ==="
-        self._buffer_allocations()
-        print "===\n"
-        print "=== OpenCL Buffer initialisation ==="
-        self._buffer_initialisations()
-        print "===\n"
-        print "=== OpenCL Kernels affectation ==="
+        GPUParticleAdvection.setUp(self)
+
+    def _kernel_affectations(self):
         self.num_advec = KernelLauncher(self.prg.advection,
                                         self.queue,
-                                        self.gwi,
-                                        self.lwi)
+                                        self.gwi_advec,
+                                        self.lwi_advec)
         self.num_remesh = KernelLauncher(self.prg.remeshing,
                                          self.queue,
-                                         self.gwi,
-                                         self.lwi)
+                                         self.gwi_remesh,
+                                         self.lwi_remesh)
         self.init_copy = KernelLauncher(self.prg.copy,
                                         self.queue,
                                         self.copy_gwi,
@@ -175,51 +75,6 @@ class GPUParticleAdvection(ParticleAdvection):
                                self.queue,
                                self.transpose_xy_gwi,
                                self.transpose_xy_lwi)
-        print "===\n"
-        self.numMethod = Splitting(self._apply_in_dir, self.scalar.dimension)
-        print "Method used:", self.name
-
-    def _buffer_initialisations(self):
-        data, transfer_time, compute_time = 0, 0., 0.
-        for gpudf in self.variables:
-            match = 'init' + gpudf.name
-            initKernel = None
-            # Looking for initKernel
-            for k in self.prg.all_kernels():
-                k_name = k.get_info(cl.kernel_info.FUNCTION_NAME)
-                if match.find(k_name) >= 0:
-                    initKernel = cl.Kernel(self.prg, k_name)
-            if initKernel is not None:
-                kl = KernelLauncher(initKernel, self.queue,
-                                    self.gwi, self.lwi)
-                if gpudf.vector:
-                    args = [gpudf.gpu_data[d]
-                            for d in xrange(len(self.resolution))]
-                    args.append(self.coord_min)
-                    args.append(self.mesh_size)
-                    evt = kl.launch(*tuple(args))
-                else:
-                    args = [gpudf.gpu_data]
-                    args.append(self.coord_min)
-                    args.append(self.mesh_size)
-                    evt = kl.launch(*tuple(args))
-                self.queue.finish()
-                temp_time = (evt.profile.end - evt.profile.start) * 1e-9
-                print "Done in ", temp_time, "sec"
-                compute_time += temp_time
-                gpudf.contains_data = False
-                gpudf.data_on_device = True
-            else:
-                if gpudf.contains_data:
-                    print gpudf.name, ":",
-                    temp_data, temp_time = gpudf.toDevice()
-                    data += temp_data
-                    transfer_time += temp_time
-        if data > 0:
-            print "Total Transfers : ", data, "Bytes transfered at ",
-            print "{0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time)
-        if compute_time > 0.:
-            print "Total Computing  : ", compute_time, "sec"
 
     def _buffer_allocations(self):
         ## Velocity.
@@ -258,35 +113,57 @@ class GPUParticleAdvection(ParticleAdvection):
 
     def _collect_kernels_cl_src(self):
         gpu_src = ""
+        print self.method
         if len(self.resolution) == 3:
             if self.gpu_precision is PARMES_REAL_GPU:
                 src_advec = 'advection_3D_opt4_builtin.cl'
+                advec_vector_width = 4
             else:
                 src_advec = 'advection_3D_opt2_builtin.cl'
+                advec_vector_width = 2
         else:
             if self.gpu_precision is PARMES_REAL_GPU:
                 src_advec = 'advection_2D_opt4_builtin.cl'
+                advec_vector_width = 4
             else:
                 src_advec = 'advection_2D_builtin.cl'
+                advec_vector_width = 1
+        self.WINb_advec, self.gwi_advec, \
+            self.lwi_advec = self._set_WorkItems(advec_vector_width)
+        self.build_options += " -D WGNADVEC=" + str(self.WINb_advec)
         f = open(GPU_SRC + src_advec)
         print  "   - advection:", f.name
         gpu_src += "".join(f.readlines())
         f.close()
+        print "       - config :",
+        print self.gwi_advec, self.lwi_advec
         ## remeshing
+        remesh_vector_width = 4
         if len(self.resolution) == 3:
-            if self.method.find('m6prime'):
+            if self.method.find('m6prime') >= 0:
                 src_remesh = 'remeshing_3D_opt4_private4.cl'
                 src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
-            else:
+            elif self.method.find('m8prime') >= 0:
                 src_remesh = 'remeshing_m8prime_3D_opt4_private4.cl'
                 src_remesh_weights = 'weights_m8prime_opt4_builtin.cl'
+            else:
+                print 'No remeshing fromula specified, use default m6prime'
+                src_remesh = 'remeshing_3D_opt4_private4.cl'
+                src_remesh_weights = 'weights_m6prime_opt4_builtin.cl'
         else:
-            if self.method.find('m6prime'):
+            if self.method.find('m6prime') >= 0:
                 src_remesh = 'remeshing_2D_opt4_noBC.cl'
                 src_remesh_weights = 'weights_m6prime_builtin.cl'
-            else:
+            elif self.method.find('m8prime') >= 0:
                 src_remesh = 'remeshing_m8prime_2D_opt4_noBC.cl'
                 src_remesh_weights = 'weights_m8prime_builtin.cl'
+            else:
+                print 'No remeshing fromula specified, use default m6prime'
+                src_remesh = 'remeshing_2D_opt4_noBC.cl'
+                src_remesh_weights = 'weights_m6prime_builtin.cl'
+        self.WINb_remesh, self.gwi_remesh, \
+            self.lwi_remesh = self._set_WorkItems(remesh_vector_width)
+        self.build_options += " -D WGNREMESH=" + str(self.WINb_remesh)
         f = open(GPU_SRC + src_remesh_weights)
         gpu_src += "".join(f.readlines())
         print  "   - remeshing weights:", f.name
@@ -295,90 +172,12 @@ class GPUParticleAdvection(ParticleAdvection):
         print  "   - remeshing:", f.name
         gpu_src += "".join(f.readlines())
         f.close()
-        ## copy
-        if len(self.resolution) == 3:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_copy = 'copy_3D_opt4.cl'
-                self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8"
-                self.copy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 2), int(self.resolution[2]))
-                self.copy_lwi = (4, 8, 1)
-            else:
-                src_copy = 'copy_3D_locMem.cl'
-                self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=8"
-                self.copy_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2]))
-                self.copy_lwi = (32, 8, 1)
-        else:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_copy = 'copy_2D_opt2.cl'
-                self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8"
-                self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2))
-                self.copy_lwi = (8, 8)
-            else:
-                src_copy = 'copy_2D_opt2.cl'
-                self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2"
-                self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 16))
-                self.copy_lwi = (16, 2)
-        f = open(GPU_SRC + src_copy)
-        print  "   - copy:", f.name
-        gpu_src += "".join(f.readlines())
-        f.close()
-        ## transpose
-        if len(self.resolution) == 3:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl'
-                self.build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2), int(self.resolution[2]))
-                self.transpose_xy_lwi = (8, 8, 1)
-                src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl'
-                self.build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
-                self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4))
-                self.transpose_xz_lwi = (16, 4, 4)
-            else:
-                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 8), int(self.resolution[2]))
-                self.transpose_xy_lwi = (8, 4, 1)
-                src_transpose_xz = 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl'
-                self.build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2"
-                self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4))
-                self.transpose_xz_lwi = (8, 2, 2)
-        else:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 4)
-                self.transpose_xy_lwi = (8, 8)
-            else:
-                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 16)
-                self.transpose_xy_lwi = (8, 2)
-        f = open(GPU_SRC + src_transpose_xy)
-        print  "   - transpose_xy:", f.name
-        gpu_src += "".join(f.readlines())
-        f.close()
-        if len(self.resolution) == 3:
-            f = open(GPU_SRC + src_transpose_xz)
-            gpu_src += "".join(f.readlines())
-            f.close()
+        print "       - config :",
+        print self.gwi_remesh, self.lwi_remesh
+        gpu_src += self._collect_kernels_cl_src_copy()
+        gpu_src += self._collect_kernels_cl_src_transpositions()
         return gpu_src
 
-    def _collect_usr_cl_src(self, usr_src):
-        print "usr src", usr_src
-        gpu_src = ""
-        if usr_src is not None:
-            if isinstance(usr_src, list):
-                for src in usr_src:
-                    print "Sources files (user): ", src
-                    f = open(src, 'r')
-                    gpu_src += "".join(f.readlines())
-                    f.close()
-            else:
-                print "Sources files (user): ", usr_src
-                f = open(usr_src, 'r')
-                gpu_src += "".join(f.readlines())
-                f.close()
-        return gpu_src
 
     def _apply_in_dir(self, t, dt, d, split_id):
         """
@@ -414,7 +213,7 @@ class GPUParticleAdvection(ParticleAdvection):
                                                    self.part_scalar.gpu_data)
                 else:
                     evt_init = \
-                        self.init_transpose.launch(1, self.gpu_scalar.gpu_data,
+                        self.init_transpose.launch(1, self.scalar.gpu_data,
                                                    self.part_scalar.gpu_data)
         # Advection
         evt_advec = self.num_advec.launch(self.velocity.gpu_data[d],
@@ -443,12 +242,11 @@ class GPUParticleAdvection(ParticleAdvection):
         if split_id == 0:
             self.compute_time_copy[d] += c_time_init
         else:
-            self.compute_time_swap[min(split_id, d)] += c_time_init
+            if self.scalar.topology.dim == 2:
+                self.compute_time_swap[0] += c_time_init
         self.total_time += (c_time_advec + c_time_remesh + c_time_init)
         return (c_time_advec + c_time_remesh + c_time_init)
 
-    def apply(self, t, dt, ite):
-        self.numMethod(t, dt)
 
     def printComputeTime(self):
         self.time_info[0] = "\"Advection total\" "
diff --git a/HySoP/hysop/operator/monitors/monitoring.py b/HySoP/hysop/operator/monitors/monitoring.py
index ac23e3aaa3e31ae5f1e36757c7077d0dfaa09458..ca7347b927ac2aa115b4f3fe00368890de1999ad 100644
--- a/HySoP/hysop/operator/monitors/monitoring.py
+++ b/HySoP/hysop/operator/monitors/monitoring.py
@@ -19,7 +19,10 @@ class Monitoring(Operator):
         ## Monitor frequency
         self.freq = frequency
 
-    def setUp(self, *spec):
+    def setUp(self):
+        pass
+
+    def finalize(self):
         pass
 
     def printComputeTime(self):
diff --git a/HySoP/hysop/operator/monitors/printer.py b/HySoP/hysop/operator/monitors/printer.py
index 1e6c7c1dda56ba039ed1bb34f362fcb700c86caa..bae205043edc4506508c880f24d1b121d413d903 100644
--- a/HySoP/hysop/operator/monitors/printer.py
+++ b/HySoP/hysop/operator/monitors/printer.py
@@ -92,47 +92,48 @@ class Printer(Monitoring):
             filename = self.outputPrefix + "results_{0:05d}.dat".format(ite)
             print "Print file : " + filename
             ## VTK output \todo: Need fix in 2D, getting an IOError.
-            if self.fields[0].domain.dimension == 3:
+            try:
                 evtk.imageToVTK(filename, pointData=self._build_vtk_dict())
-            else:
-                ## Standard output
-                f = open(filename, 'w')
-                shape = self.fields[0].discreteField[0].topology.mesh.resolution
-                coords = self.fields[0].discreteField[0].topology.mesh.coords
-                if len(shape) == 2:
-                    for i in xrange(shape[0] - 1):
-                        for j in xrange(shape[1] - 1):
-                            f.write("{0:8.12} {1:8.12} ".format(
-                                    coords[0][i, 0], coords[1][0, j]))
-                            for field in self.fields:
-                                if field.vector:
-                                    f.write("{0:8.12} {1:8.12} ".format(
-                                            field.discreteField[0][0][i, j],
-                                            field.discreteField[0][1][i, j]))
-                                else:
-                                    f.write("{0:8.12} ".format(
-                                            field.discreteField[0][i, j]))
-                            f.write("\n")
-                else:
-                    for i in xrange(shape[0] - 1):
-                        for j in xrange(shape[1] - 1):
-                            for k in xrange(shape[2] - 1):
-                                f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
-                                        coords[0][i, 0, 0],
-                                        coords[1][0, j, 0],
-                                        coords[2][0, 0, k]))
+            except IOError:
+                if self.fields[0].domain.dimension == 2:
+                    ## Standard output
+                    f = open(filename, 'w')
+                    shape = self.fields[0].discreteField[0].topology.mesh.resolution
+                    coords = self.fields[0].discreteField[0].topology.mesh.coords
+                    if len(shape) == 2:
+                        for i in xrange(shape[0] - 1):
+                            for j in xrange(shape[1] - 1):
+                                f.write("{0:8.12} {1:8.12} ".format(
+                                        coords[0][i, 0], coords[1][0, j]))
                                 for field in self.fields:
                                     if field.vector:
-                                        f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
-                                                field.discreteField[0][0][i, j, k],
-                                                field.discreteField[0][1][i, j, k],
-                                                field.discreteField[0][2][i, j, k]))
+                                        f.write("{0:8.12} {1:8.12} ".format(
+                                                field.discreteField[0][0][i, j],
+                                                field.discreteField[0][1][i, j]))
                                     else:
                                         f.write("{0:8.12} ".format(
-                                                field.discreteField[0][i, j, k]))
+                                                field.discreteField[0][i, j]))
                                 f.write("\n")
-                f.close()
-                print "==\n"
+                    else:
+                        for i in xrange(shape[0] - 1):
+                            for j in xrange(shape[1] - 1):
+                                for k in xrange(shape[2] - 1):
+                                    f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
+                                            coords[0][i, 0, 0],
+                                            coords[1][0, j, 0],
+                                            coords[2][0, 0, k]))
+                                    for field in self.fields:
+                                        if field.vector:
+                                            f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
+                                                    field.discreteField[0][0][i, j, k],
+                                                    field.discreteField[0][1][i, j, k],
+                                                    field.discreteField[0][2][i, j, k]))
+                                        else:
+                                            f.write("{0:8.12} ".format(
+                                                    field.discreteField[0][i, j, k]))
+                                    f.write("\n")
+                    f.close()
+            print "==\n"
         self.compute_time += (time.time() - t)
 
     def printComputeTime(self):
diff --git a/HySoP/hysop/problem/problem.py b/HySoP/hysop/problem/problem.py
index a793bbca71da48527145fe6c3d6478820b6073ec..69f0ac13b09c1d6f5989ff0571cb8b84ae66627c 100644
--- a/HySoP/hysop/problem/problem.py
+++ b/HySoP/hysop/problem/problem.py
@@ -79,6 +79,10 @@ class Problem(object):
                 op.apply(self.timer.t, self.timer.dt, ite)
             self.timer.step()
         print "\n\n End solving\n"
+        print "==== Finalization ===="
+        for op in self.operators:
+            op.finalize()
+        print "===\n"
         print "=== Timings ==="
         if (self.domain.topologies[0].rank == 0):
             for op in self.operators: