diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
index 87127b396f8f0ec6ec3b9db261f1a6bf8aeb6f66..90b055bad3fbd8e72e56ecc616c9dce232b7fa6f 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl
@@ -2,16 +2,366 @@
 
 
 
+__kernel void buff_advec_and_remesh_l(__global const float* gvelo,
+				      __global float* v_l_buff,
+				      __global const float* pscal,
+				      __global float* s_l_buff,
+				      float dt,
+				      float inv_v_dx_y, float inv_v_dx_z,
+				      __constant struct AdvectionMeshInfo* mesh)
+{
+  int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */
+  int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */
+  int i;			/* Particle index in 1D problem */
+  int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
+  float p,v,c,s,y,w;
+  float2 hY, hZ;
+  int i_ind, i_indY, i_indZ;
+  int ind, index;
+
+
+  float velocity_cache[V_NB_I];
+  float v_l_buff_loc[V_BUFF_WIDTH];
+  float s_l_buff_loc[BUFF_WIDTH];
+  float* loc_ptr;
+
+  // Initialize buffers
+  for (i=0; i<BUFF_WIDTH; i++)
+    s_l_buff_loc[i] = 0.0;
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y;
+  hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z;
+  i_indY = convert_int_rtn(hY.s0);
+  i_indZ = convert_int_rtn(hZ.s0);
+  hY.s0 = hY.s0 - convert_float(i_indY);
+  hZ.s0 = hZ.s0 - convert_float(i_indZ);
+  hY.s1 = (1.0-hY.s0);
+  hZ.s1 = (1.0-hZ.s0);
+
+  i_indY = i_indY + V_GHOSTS_NB;
+  i_indZ = i_indZ + V_GHOSTS_NB;
+
+  for (i=0; i<V_NB_I; i++){
+    velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+  }
+
+  for (i=0; i<V_BUFF_WIDTH; i++){
+    v_l_buff_loc[i] = hY.s1*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
+    v_l_buff_loc[i] += hY.s1*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
+    v_l_buff_loc[i] += hY.s0*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
+    v_l_buff_loc[i] += hY.s0*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
+  }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=0; i<2*BUFF_WIDTH; i++)
+    {
+      c = i * mesh->dx.x + mesh->min_position;
+      // multi-scale : interpolate v from velocity buffer (of length V_NB_I)
+      p = c * mesh->v_invdx;
+      i_ind = convert_int_rtn(p);
+      p = p - convert_float(i_ind);
+      i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB) - MS_INTERPOL_SHIFT;
+      v = mix(velocity_cache[noBC_id(i_ind)],
+	      velocity_cache[noBC_id(i_ind+1)],p);
+      p = (c + 0.5*dt*v) * mesh->v_invdx;
+
+      i_ind = convert_int_rtn(p) - MS_INTERPOL_SHIFT;
+      p = p - convert_float(i_ind);
+      loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_l_buff_loc+i_ind-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1);
+      v = (1.0-p)*(*loc_ptr);
+      i_ind = i_ind + 1;
+      loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_l_buff_loc+i_ind-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1);
+      v += p*(*loc_ptr);
+      p = c + dt * v;
+
+
+      /* Read particle scalar */
+      s = pscal[i + line_index];
+
+
+
+
+      ind = convert_int_rtn(p * mesh->invdx);
+      y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx;
+
+      index = ind - REMESH_SHIFT;
+
+      w = REMESH(alpha)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(beta)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(gamma)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(delta)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REMESH_SHIFT > 1
+      index = index + 1;
+      w = REMESH(eta)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(zeta)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 2
+      index = index + 1;
+      w = REMESH(theta)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(iota)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 3
+      index = index + 1;
+      w = REMESH(kappa)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(mu)(y);
+      if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Store buffers
+  for (i=0; i<BUFF_WIDTH; i++)
+    s_l_buff[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_l_buff_loc[i];
+
+}
+
+__kernel void buff_advec_and_remesh_r(__global const float* gvelo,
+				      __global float* v_r_buff,
+				      __global const float* pscal,
+				      __global float* s_r_buff,
+				      float dt,
+				      float inv_v_dx_y, float inv_v_dx_z,
+				      __constant struct AdvectionMeshInfo* mesh)
+{
+  int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */
+  int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */
+  int i;			/* Particle index in 1D problem */
+  int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
+  float p,v,c,s,y,w;
+  float2 hY, hZ;
+  int i_ind, i_indY, i_indZ;
+  int ind, index;
+
+
+  float velocity_cache[V_NB_I];
+  float v_r_buff_loc[V_BUFF_WIDTH];
+  float s_r_buff_loc[BUFF_WIDTH];
+  float* loc_ptr;
+
+  // Initialize buffers
+  for(i=0; i<BUFF_WIDTH; i++)
+    s_r_buff_loc[i] = 0.0;
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y;
+  hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z;
+  i_indY = convert_int_rtn(hY.s0);
+  i_indZ = convert_int_rtn(hZ.s0);
+  hY.s0 = hY.s0 - convert_float(i_indY);
+  hZ.s0 = hZ.s0 - convert_float(i_indZ);
+  hY.s1 = (1.0-hY.s0);
+  hZ.s1 = (1.0-hZ.s0);
+
+  i_indY = i_indY + V_GHOSTS_NB;
+  i_indZ = i_indZ + V_GHOSTS_NB;
+
+  for(i=0;i<V_NB_I; i++){
+    velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II];
+    velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II];
+  }
+
+  for(i=0;i<V_BUFF_WIDTH; i++){
+    v_r_buff_loc[i] = hY.s1*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)];
+    v_r_buff_loc[i] += hY.s1*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)];
+    v_r_buff_loc[i] += hY.s0*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)];
+    v_r_buff_loc[i] += hY.s0*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)];
+  }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=NB_I-2*BUFF_WIDTH; i<NB_I; i++)
+    {
+      c = i * mesh->dx.x + mesh->min_position;
+      // multi-scale : interpolate v from velocity buffer (of length V_NB_I)
+      p = c * mesh->v_invdx;
+      i_ind = convert_int_rtn(p);
+      p = p - convert_float(i_ind);
+      i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB) - MS_INTERPOL_SHIFT;
+      v = mix(velocity_cache[noBC_id(i_ind)],
+	      velocity_cache[noBC_id(i_ind+1)],p);
+      p = (c + 0.5*dt*v) * mesh->v_invdx;
+
+      i_ind = convert_int_rtn(p) - MS_INTERPOL_SHIFT;
+      p = p - convert_float(i_ind);
+      loc_ptr = (i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_r_buff_loc+i_ind-(V_STOP_INDEX+V_GHOSTS_NB+1) ;
+      v = (1.0-p)*(*loc_ptr);
+      i_ind = i_ind + 1;
+      loc_ptr = (i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_r_buff_loc+i_ind-(V_STOP_INDEX+V_GHOSTS_NB+1) ;
+      v += p*(*loc_ptr);
+      p = c + dt * v;
+
+
+      /* Read particle scalar */
+      s = pscal[i + line_index];
+
+
+
+
+      ind = convert_int_rtn(p * mesh->invdx);
+      y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx;
+
+      index = ind - REMESH_SHIFT;
+
+      w = REMESH(alpha)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(beta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(gamma)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(delta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REMESH_SHIFT > 1
+      index = index + 1;
+      w = REMESH(eta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(zeta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 2
+      index = index + 1;
+      w = REMESH(theta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(iota)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 3
+      index = index + 1;
+      w = REMESH(kappa)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(mu)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Store buffers
+  for(i=0;i<BUFF_WIDTH;i++)
+    s_r_buff[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_r_buff_loc[i];
+
+}
+
 __kernel void buff_advec_and_remesh(__global const float* gvelo,
-				    __global float* v_l_buff,
-				    __global float* v_r_buff,
-				    __global const float* pscal,
-				    __global float* gscal,
-				    __global float* s_l_buff,
-				    __global float* s_r_buff,
-				    float dt,
-				    float inv_v_dx_y, float inv_v_dx_z,
-				    __constant struct AdvectionMeshInfo* mesh)
+				      __global float* v_l_buff,
+				      __global float* v_r_buff,
+				      __global const float* pscal,
+				      __global float* gscal,
+				      float dt,
+				      float inv_v_dx_y, float inv_v_dx_z,
+				      __constant struct AdvectionMeshInfo* mesh)
 {
   int gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
   int gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
@@ -28,16 +378,8 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
   __local float v_l_buff_loc[V_BUFF_WIDTH];
   __local float v_r_buff_loc[V_BUFF_WIDTH];
   __local float gscal_loc[NB_I];
-  __local float s_l_buff_loc[BUFF_WIDTH];
-  __local float s_r_buff_loc[BUFF_WIDTH];
   __local float* loc_ptr;
 
-  // Initialize buffers
-  if(gidX < BUFF_WIDTH)
-    s_l_buff_loc[gidX] = 0.0;
-  if(gidX < BUFF_WIDTH)
-    s_r_buff_loc[gidX] = 0.0;
-
   for(i=gidX; i<NB_I; i+=WI_NB)
     {
       /* Initialize result buffer */
@@ -117,77 +459,77 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
       index = ind - REMESH_SHIFT;
 
       w = REMESH(alpha)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(beta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(gamma)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(delta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
 #if REMESH_SHIFT > 1
       index = index + 1;
       w = REMESH(eta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(zeta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
 #if REMESH_SHIFT > 2
       index = index + 1;
       w = REMESH(theta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(iota)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
 #if REMESH_SHIFT > 3
       index = index + 1;
       w = REMESH(kappa)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(mu)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 #endif
     }
@@ -200,11 +542,4 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
       /* Store result */
       gscal[i + line_index] = gscal_loc[i];
     }
-
-  // Store buffers
-  if (gidX < BUFF_WIDTH)
-    s_l_buff[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_l_buff_loc[gidX];
-  if (gidX < BUFF_WIDTH)
-    s_r_buff[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_r_buff_loc[gidX];
-
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl
index 7db76aab5dc1b7349666484f3abb240abf57eaee..ae019d2aded9acbe527aca653bbb86ea05c84a05 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl
@@ -1,13 +1,317 @@
 
 
 
+__kernel void buff_advec_and_remesh_l(__global const float* gvelo,
+				    __global float* v_buffer_l,
+				    __global const float* pscal,
+				    __global float* s_buffer_l,
+				    float dt, __constant struct AdvectionMeshInfo* mesh)
+{
+  int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */
+  int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */
+  int i;			/* Particle index in 1D problem */
+  int line_index ; /* Current 1D problem index */
+
+  float v,vp,p,c,s,y,w, hdt = 0.5 * dt;
+  int i_ind, i_ind_p, ind, index;
+
+  float velocity_cache[V_NB_I];
+  float v_buff_l_loc[V_BUFF_WIDTH];
+  float s_buff_l_loc[BUFF_WIDTH];
+  float* loc_ptr;
+
+  // Initialize buffers
+  for (i=0;i<BUFF_WIDTH;i++)
+    s_buff_l_loc[i] = 0.0;
+
+  for(i=0; i<V_BUFF_WIDTH; i++)
+    v_buff_l_loc[i] = v_buffer_l[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)];
+
+  line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II;
+  /* Read velocity */
+  /* Fill velocity cache */
+  for(i=0;i<V_NB_I;i++)
+      velocity_cache[i] = gvelo[i+line_index];
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  line_index = gidY*NB_I+gidZ*NB_I*NB_II;
+  for(i=0; i<2*BUFF_WIDTH; i++)
+    {
+      /* Read particle scalar */
+      s = pscal[i + line_index];
+
+      c = i * mesh->dx.x + mesh->min_position;
+      v = velocity_cache[i + V_GHOSTS_NB];
+      p = (c + hdt*v) * mesh->v_invdx;
+
+      i_ind = convert_int_rtn(p);
+      p = p - convert_float(i_ind);
+      i_ind_p = i_ind + 1;
+      loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache + i_ind - (V_START_INDEX-V_GHOSTS_NB) : v_buff_l_loc+i_ind-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1);
+      v = *loc_ptr;
+
+      loc_ptr = (i_ind_p>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache+i_ind_p - (V_START_INDEX-V_GHOSTS_NB) : v_buff_l_loc+i_ind_p-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1);
+      vp = *loc_ptr;
+
+      v = (p*(vp-v) + v);
+      p = c + dt * v;
+
+
+
+      ind = convert_int_rtn(p * mesh->invdx);
+      y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx;
+
+      index = ind - REMESH_SHIFT;
+
+      w = REMESH(alpha)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(beta)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(gamma)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(delta)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REMESH_SHIFT > 1
+      index = index + 1;
+      w = REMESH(eta)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(zeta)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 2
+      index = index + 1;
+      w = REMESH(theta)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(iota)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 3
+      index = index + 1;
+      w = REMESH(kappa)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(mu)(y);
+      if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Store buffers
+  for(i=0;i<BUFF_WIDTH;i++)
+    s_buffer_l[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_l_loc[i];
+}
+
+
+
+
+
+
+
+
+__kernel void buff_advec_and_remesh_r(__global const float* gvelo,
+				    __global float* v_buffer_r,
+				    __global const float* pscal,
+				    __global float* s_buffer_r,
+				    float dt, __constant struct AdvectionMeshInfo* mesh)
+{
+  int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */
+  int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */
+  int i;			/* Particle index in 1D problem */
+  int line_index ; /* Current 1D problem index */
+
+  float v,vp,p,c,s,y,w, hdt = 0.5 * dt;
+  int i_ind, i_ind_p, ind, index;
+
+  float velocity_cache[V_NB_I];
+  float v_buff_r_loc[V_BUFF_WIDTH];
+  float s_buff_r_loc[BUFF_WIDTH];
+  float* loc_ptr;
+
+  // Initialize buffers
+  for(i=0;i<BUFF_WIDTH;i++)
+    s_buff_r_loc[i] = 0.0;
+
+  for(i=0;i<V_BUFF_WIDTH;i++)
+    v_buff_r_loc[i] = v_buffer_r[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)];
+
+  line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II;
+  /* Read velocity */
+  /* Fill velocity cache */
+  for(i=0;i<V_NB_I; i++)
+      velocity_cache[i] = gvelo[i+line_index];
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  line_index = gidY*NB_I+gidZ*NB_I*NB_II;
+  for(i=NB_I-2*BUFF_WIDTH; i<NB_I; i++)
+    {
+      /* Read particle scalar */
+      s = pscal[i + line_index];
+
+      c = i * mesh->dx.x + mesh->min_position;
+      v = velocity_cache[i + V_GHOSTS_NB];
+      p = (c + hdt*v) * mesh->v_invdx;
+
+      i_ind = convert_int_rtn(p);
+      p = p - convert_float(i_ind);
+      i_ind_p = i_ind + 1;
+      loc_ptr = (i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache + i_ind - (V_START_INDEX-V_GHOSTS_NB) : v_buff_r_loc+i_ind-(V_STOP_INDEX+V_GHOSTS_NB+1) ;
+      v = *loc_ptr;
+
+      loc_ptr = (i_ind_p <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+i_ind_p - (V_START_INDEX-V_GHOSTS_NB) : v_buff_r_loc+i_ind_p-(V_STOP_INDEX+V_GHOSTS_NB+1) ;
+      vp = *loc_ptr;
+
+      v = (p*(vp-v) + v);
+      p = c + dt * v;
+
+
+
+      ind = convert_int_rtn(p * mesh->invdx);
+      y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx;
+
+      index = ind - REMESH_SHIFT;
+
+      w = REMESH(alpha)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(beta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(gamma)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(delta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REMESH_SHIFT > 1
+      index = index + 1;
+      w = REMESH(eta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(zeta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 2
+      index = index + 1;
+      w = REMESH(theta)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(iota)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 3
+      index = index + 1;
+      w = REMESH(kappa)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      index = index + 1;
+      w = REMESH(mu)(y);
+      if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1);
+      w = w * s;
+      (*loc_ptr) += w;}
+      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=0;i<BUFF_WIDTH;i++)
+    s_buffer_r[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_r_loc[i];
+
+}
+
+
 __kernel void buff_advec_and_remesh(__global const float* gvelo,
 				    __global float* v_buffer_l,
 				    __global float* v_buffer_r,
 				    __global const float* pscal,
 				    __global float* gscal,
-				    __global float* s_buffer_l,
-				    __global float* s_buffer_r,
 				    float dt, __constant struct AdvectionMeshInfo* mesh)
 {
   int gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
@@ -23,37 +327,24 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
   __local float v_buff_l_loc[V_BUFF_WIDTH];
   __local float v_buff_r_loc[V_BUFF_WIDTH];
   __local float gscal_loc[NB_I];
-  __local float s_buff_l_loc[BUFF_WIDTH];
-  __local float s_buff_r_loc[BUFF_WIDTH];
   __local float* loc_ptr;
 
-  // Initialize buffers
-  if(gidX < BUFF_WIDTH)
-    s_buff_l_loc[gidX] = 0.0;
-  if(gidX < BUFF_WIDTH)
-    s_buff_r_loc[gidX] = 0.0;
 
   for(i=gidX; i<NB_I; i+=WI_NB)
-    {
-      /* Initialize result buffer */
-      gscal_loc[i] = 0.0;
-    }
+    /* Initialize result buffer */
+    gscal_loc[i] = 0.0;
 
-    for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){
-      v_buff_l_loc[i] = v_buffer_l[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)];
-    }
+  for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB))
+    v_buff_l_loc[i] = v_buffer_l[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)];
 
-    for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){
-      v_buff_r_loc[i] = v_buffer_r[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)];
-    }
+  for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB))
+    v_buff_r_loc[i] = v_buffer_r[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)];
 
   line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II;
+  /* Read velocity */
+  /* Fill velocity cache */
   for(i=gidX; i<V_NB_I; i+=(WI_NB))
-    {
-      /* Read velocity */
-      /* Fill velocity cache */
       velocity_cache[i] = gvelo[i+line_index];
-    }
 
   /* Synchronize work-group */
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -88,77 +379,77 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
       index = ind - REMESH_SHIFT;
 
       w = REMESH(alpha)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(beta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(gamma)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(delta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
 #if REMESH_SHIFT > 1
       index = index + 1;
       w = REMESH(eta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(zeta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
 #if REMESH_SHIFT > 2
       index = index + 1;
       w = REMESH(theta)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(iota)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
 #if REMESH_SHIFT > 3
       index = index + 1;
       w = REMESH(kappa)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 
       index = index + 1;
       w = REMESH(mu)(y);
-      loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) );
+      if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX;
       w = w * s;
-      (*loc_ptr) += w;
+      (*loc_ptr) += w;}
       barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
@@ -172,11 +463,4 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo,
       /* Store result */
       gscal[i + line_index] = gscal_loc[i];
     }
-
-  // Store buffers
-  if (gidX < BUFF_WIDTH)
-    s_buffer_l[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_l_loc[gidX];
-  if (gidX < BUFF_WIDTH)
-    s_buffer_r[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_r_loc[gidX];
-
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl
index ad4d4dd9e8ddee507d7a8141ab2943d74cb3d5b2..4d12e508826ebc3f4f19b1e0beae31ee75c78297 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl
@@ -176,7 +176,7 @@ __kernel void buff_remesh_r(__global const float* ppos,
   float* loc_ptr;
 
   // Initialize buffers
-    for(i=0; i<BUFF_WIDTH; i++)
+  for(i=0; i<BUFF_WIDTH; i++)
     r_buff_loc[i] = 0.0;
 
   /* Synchronize work-group */
diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
index 126b4e686d1cd61ae17f139acf2b482cdfd90ce6..cf87210d6dcc8ec974ed9cb1dd31c9305a8ab96e 100644
--- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py
+++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py
@@ -355,6 +355,12 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         build_options += " -D BUFF_WIDTH=" + str(self._s_buff_width)
         prg = self.cl_env.build_src(
             src, build_options, 1)
+        self.num_advec_and_remesh_comm_l = KernelLauncher(
+            prg.buff_advec_and_remesh_l, self.cl_env.queue,
+            (gwi[1], gwi[2]), (32, 1))
+        self.num_advec_and_remesh_comm_r = KernelLauncher(
+            prg.buff_advec_and_remesh_r, self.cl_env.queue,
+            (gwi[1], gwi[2]), (32, 1))
         self.num_advec_and_remesh = KernelLauncher(
             prg.buff_advec_and_remesh, self.cl_env.queue,
             gwi, lwi)
@@ -734,31 +740,63 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         wait_evts = self.velocity.events + self._evt_l_v + self._evt_r_v + \
             self._init_events[self.fields_on_grid[0]]
         if self._isMultiScale:
-            evt_num = self.num_advec_and_remesh(
+            evt_comm_l = self.num_advec_and_remesh_comm_l(
                 self.velocity.gpu_data[self.direction],
                 self._cl_v_l_buff,
-                self._cl_v_r_buff,
                 self.fields_on_part[self.fields_on_grid[0]][0],
-                self.fields_on_grid[0].gpu_data[0],
                 self._cl_s_l_buff,
+                self.gpu_precision(dt),
+                self.gpu_precision(1. / self._v_mesh_size[1]),
+                self.gpu_precision(1. / self._v_mesh_size[2]),
+                self._cl_mesh_info,
+                wait_for=wait_evts)
+            evt_comm_r = self.num_advec_and_remesh_comm_r(
+                self.velocity.gpu_data[self.direction],
+                self._cl_v_r_buff,
+                self.fields_on_part[self.fields_on_grid[0]][0],
                 self._cl_s_r_buff,
                 self.gpu_precision(dt),
                 self.gpu_precision(1. / self._v_mesh_size[1]),
                 self.gpu_precision(1. / self._v_mesh_size[2]),
                 self._cl_mesh_info,
                 wait_for=wait_evts)
-        else:
             evt_num = self.num_advec_and_remesh(
                 self.velocity.gpu_data[self.direction],
                 self._cl_v_l_buff,
                 self._cl_v_r_buff,
                 self.fields_on_part[self.fields_on_grid[0]][0],
                 self.fields_on_grid[0].gpu_data[0],
+                self.gpu_precision(dt),
+                self.gpu_precision(1. / self._v_mesh_size[1]),
+                self.gpu_precision(1. / self._v_mesh_size[2]),
+                self._cl_mesh_info,
+                wait_for=wait_evts)
+        else:
+            evt_comm_l = self.num_advec_and_remesh_comm_l(
+                self.velocity.gpu_data[self.direction],
+                self._cl_v_l_buff,
+                self.fields_on_part[self.fields_on_grid[0]][0],
                 self._cl_s_l_buff,
+                self.gpu_precision(dt),
+                self._cl_mesh_info,
+                wait_for=wait_evts)
+            evt_comm_r = self.num_advec_and_remesh_comm_r(
+                self.velocity.gpu_data[self.direction],
+                self._cl_v_r_buff,
+                self.fields_on_part[self.fields_on_grid[0]][0],
                 self._cl_s_r_buff,
                 self.gpu_precision(dt),
                 self._cl_mesh_info,
                 wait_for=wait_evts)
+            evt_num = self.num_advec_and_remesh(
+                self.velocity.gpu_data[self.direction],
+                self._cl_v_l_buff,
+                self._cl_v_r_buff,
+                self.fields_on_part[self.fields_on_grid[0]][0],
+                self.fields_on_grid[0].gpu_data[0],
+                self.gpu_precision(dt),
+                self._cl_mesh_info,
+                wait_for=wait_evts)
         self.evt_num_remesh = [evt_num]
 
         # Prepare the MPI receptions
@@ -783,7 +821,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
                 buffer_pitches=(self._s_l_buff.nbytes, 0),
                 region=(self._s_block_size, 1, 1),
                 is_blocking=False,
-                wait_for=[evt_num])
+                wait_for=[evt_comm_l])
 
         # Send the left buffer
         ctime = MPI.Wtime()
@@ -805,7 +843,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
                 buffer_pitches=(self._s_l_buff.nbytes, 0),
                 region=(self._s_block_size, 1, 1),
                 is_blocking=False,
-                wait_for=[evt_num])
+                wait_for=[evt_comm_r])
         # Send the right buffer
         for b in xrange(self._s_n_blocks):
             self._evt_get_r[b].wait()
@@ -926,7 +964,10 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
         for k in [self.copy, self.transpose_xy, self.transpose_xy_r,
                   self.transpose_xz, self.transpose_xz_r,
                   self.num_advec_and_remesh,
-                  self.num_advec, self.num_remesh, self.num_remesh_comm_l, self.num_remesh_comm_r]:
+                  self.num_advec, self.num_remesh, self.num_remesh_comm_l,
+                  self.num_remesh_comm_r, self.num_advec_and_remesh,
+                  self.num_advec_and_remesh_comm_r,
+                  self.num_advec_and_remesh_comm_l]:
             if k is not None:
                 for p in k.profile:
                     self.profiler += p