From 05c0f5e2e835abedbd4e2b2b650adbcad83155de Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr>
Date: Wed, 14 Aug 2013 16:50:53 +0000
Subject: [PATCH] GPU configuration improvements

---
 HySoP/CMake/ParmesTests.cmake                 |   3 +-
 HySoP/CMakeLists.txt                          |   3 +-
 HySoP/hysop/fields/continuous.py              |   2 +-
 HySoP/hysop/gpu/cl_src/advection/basic.cl     |  56 +++
 .../hysop/gpu/cl_src/advection/basic_noVec.cl |  51 +++
 HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl |  93 +++++
 .../gpu/cl_src/advection/basic_rk4_noVec.cl   |  82 +++++
 HySoP/hysop/gpu/cl_src/advection/builtin.cl   |  27 +-
 .../gpu/cl_src/advection/builtin_noVec.cl     |  40 ++-
 .../hysop/gpu/cl_src/advection/builtin_rk4.cl |  87 +++++
 .../gpu/cl_src/advection/builtin_rk4_noVec.cl |  78 +++++
 HySoP/hysop/gpu/cl_src/common.cl              |   6 +-
 .../kernels/advection_and_remeshing_noVec.cl  |   9 +-
 .../gpu/cl_src/kernels/advection_noVec.cl     |  14 +-
 HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl  |   2 +-
 .../gpu/cl_src/kernels/remeshing_noVec.cl     |  66 ++++
 .../hysop/gpu/cl_src/kernels/transpose_xy.cl  |  14 +-
 .../gpu/cl_src/kernels/transpose_xy_noVec.cl  |  79 +++++
 .../hysop/gpu/cl_src/kernels/transpose_xz.cl  |  25 +-
 .../gpu/cl_src/kernels/transpose_xz_noVec.cl  |  77 +++++
 .../gpu/cl_src/kernels/transpose_xz_slice.cl  |  74 ++++
 .../kernels/transpose_xz_slice_noVec.cl       |  70 ++++
 .../hysop/gpu/cl_src/remeshing/basic_noVec.cl |   2 +
 .../gpu/cl_src/remeshing/private_noVec.cl     |  95 ++++++
 HySoP/hysop/gpu/cl_src/remeshing/weights.cl   |   2 +-
 .../gpu/cl_src/remeshing/weights_builtin.cl   |   2 +-
 .../gpu/cl_src/remeshing/weights_noVec.cl     |   2 +-
 .../cl_src/remeshing/weights_noVec_builtin.cl |   2 +-
 HySoP/hysop/gpu/config_cayman.py              | 155 +++++++++
 HySoP/hysop/gpu/config_k20m.py                | 161 +++++++++
 HySoP/hysop/gpu/gpu_discrete.py               |  14 +-
 HySoP/hysop/gpu/gpu_particle_advection.py     | 117 ++-----
 HySoP/hysop/gpu/gpu_particle_advection_1k.py  |  37 +-
 HySoP/hysop/gpu/gpu_particle_advection_2k.py  |  87 ++---
 HySoP/hysop/gpu/kernel_benchmark.py           | 320 ++++++++----------
 HySoP/hysop/gpu/tools.py                      |  86 ++---
 HySoP/hysop/numerics/tests/__init__.py        |   0
 HySoP/hysop/tools/timers.py                   |  46 ++-
 HySoP/setup.py.in                             |   1 +
 39 files changed, 1602 insertions(+), 485 deletions(-)
 create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
 create mode 100644 HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl
 create mode 100644 HySoP/hysop/gpu/config_cayman.py
 create mode 100644 HySoP/hysop/gpu/config_k20m.py
 create mode 100644 HySoP/hysop/numerics/tests/__init__.py

diff --git a/HySoP/CMake/ParmesTests.cmake b/HySoP/CMake/ParmesTests.cmake
index af9db544a..87d3f8874 100644
--- a/HySoP/CMake/ParmesTests.cmake
+++ b/HySoP/CMake/ParmesTests.cmake
@@ -9,6 +9,7 @@ set(py_src_dirs
   operator
   problem
   tools
+  numerics
   )
 
 if(USE_MPI)
@@ -61,7 +62,7 @@ endforeach()
 ## that names are not __init__ or test_ and that contains '>>>'
 set(py_doctest_files)
 foreach(testdir ${py_src_dirs})
-  file(GLOB testfiles parmepy/${testdir}/*.py)
+  file(GLOB testfiles parmepy/${testdir}/[a-zA-Z]*.py)
   foreach(testfile ${testfiles})
     file(STRINGS ${testfile} test_doctest REGEX ">>>")
     if(NOT "${test_doctest}" STREQUAL "")
diff --git a/HySoP/CMakeLists.txt b/HySoP/CMakeLists.txt
index 6d3af4846..045fd87e2 100644
--- a/HySoP/CMakeLists.txt
+++ b/HySoP/CMakeLists.txt
@@ -56,7 +56,7 @@ endif()
 # 3  cmake -DCMAKE_INSTALL_PREFIX = some_path ==> install in some_path
 # Note that case 2 and 3 require a proper set of PYTHONPATH environment var for parmepy to work.
 if(USER_INSTALL)
-  # Unset CMAKE_INSTALL_PREFIX to allow user to reset it and to use python default (--user) rather than cmake default. 
+  # Unset CMAKE_INSTALL_PREFIX to allow user to reset it and to use python default (--user) rather than cmake default.
   set(CMAKE_INSTALL_PREFIX "" CACHE PATH "install path")
 endif()
 
@@ -131,6 +131,7 @@ string(STRIP ${${PROJECT_NAME}_PYTHON_BUILD_DIR} ${PROJECT_NAME}_PYTHON_BUILD_DI
 # --- OpenCL ---
 if(WITH_GPU)
   find_python_module(pyopencl REQUIRED)
+  find_python_module(sympy REQUIRED)
 endif()
 
 # --- MPI ---
diff --git a/HySoP/hysop/fields/continuous.py b/HySoP/hysop/fields/continuous.py
index c1ffa113a..e3913378a 100644
--- a/HySoP/hysop/fields/continuous.py
+++ b/HySoP/hysop/fields/continuous.py
@@ -150,7 +150,7 @@ class Field(object):
             s += 'scalar field '
         s += 'with the following (local) discretisations :\n'
         for f in self.discreteFields.values():
-            s += f.__str__(True)
+            s += f.__str__()
         return s
 
     def norm(self, topo=None):
diff --git a/HySoP/hysop/gpu/cl_src/advection/basic.cl b/HySoP/hysop/gpu/cl_src/advection/basic.cl
new file mode 100644
index 000000000..2bc92e4f0
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection/basic.cl
@@ -0,0 +1,56 @@
+/**
+ * @file basic.cl
+ * Advection function, vectorized version, no use of builtins functions.
+ */
+
+float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
+
+
+/**
+ * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
+ * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types.
+ *
+ * @param i Particle index.
+ * @param dt Time step.
+ * @param dx Space step.
+ * @param invdx 1/dx.
+ * @param gvelo_loc Local velocity cache.
+ * @return Particle position.
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ */
+float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
+{
+  float__N__ v,        		/* Velocity at point */
+    vp,				/* Velocity at right point */
+    p,				/* Intermediary position */
+    c,				/* initial coordinate */
+    hdt = (float__N__)(0.5*dt);	/* half time step */
+  int__N__ i_ind,		/* Interpolation left point */
+    i_ind_p;			/* Interpolation right point */
+
+  c = (float__N__)((i+__NN__)*dx,
+		   );
+
+  v = (float__N__)(gvelo_loc[noBC_id(i+__NN__)],
+		   );
+
+  p = (c + hdt * v) * invdx;
+
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  v = (p*(vp-v) + v);
+
+  return c + dt * v;
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl
new file mode 100644
index 000000000..2be4a657f
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl
@@ -0,0 +1,51 @@
+/**
+ * @file basic_noVec.cl
+ * Advection function, basic version
+ */
+
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
+
+
+/**
+ * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
+ * Use of builtin OpenCL functions fma and mix.
+ *
+ * @param i Particle index.
+ * @param dt Time step.
+ * @param dx Space step.
+ * @param invdx 1/dx.
+ * @param gvelo Global velocity field.
+ * @return Particle position
+ *
+ * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last.
+ */
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
+{
+  float v, 			/* Velocity at point */
+    vp,				/* Velocity at right point */
+    p,				/* Normalized intermediary position */
+    c = i * dx,			/* initial coordinate */
+    hdt = 0.5 * dt;		/* half time step */
+  int i_ind,			/* Interpolation left point */
+    i_ind_p;			/* Interpolation right point */
+
+  v = gvelo_loc[noBC_id(i)];
+
+  p = (c + hdt*v) * invdx;
+
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+
+  v = gvelo_loc[noBC_id(i_ind)];
+  vp = gvelo_loc[noBC_id(i_ind_p)];
+  v = (p*(vp-v) + v);
+
+  return c + dt * v;
+}
+/* Operations number :  */
+/*   - 2 positions = 2 * 2 */
+/*   - 1 iterpolation = 9 */
+/* Total = 13 */
diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl
new file mode 100644
index 000000000..90b7592d5
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl
@@ -0,0 +1,93 @@
+/**
+ * @file basic_rk4.cl
+ * Advection function (RK4 scheme), vectorized version, no use of builtins functions.
+ */
+
+float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
+
+
+/**
+ * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
+ * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types.
+ *
+ * @param i Particle index.
+ * @param dt Time step.
+ * @param dx Space step.
+ * @param invdx 1/dx.
+ * @param gvelo_loc Local velocity cache.
+ * @return Particle position.
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ */
+float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
+{
+  float__N__ v,        		/* Velocity at point */
+    vp,				/* Velocity at right point */
+    p,				/* Intermediary position */
+    k,				/* rk averaged velocity */
+    kn,				/* rk intermediate velocity */
+    c,				/* initial coordinate */
+    hdt = (float__N__)(0.5*dt);	/* half time step */
+  int__N__ i_ind,		/* Interpolation left point */
+    i_ind_p;			/* Interpolation right point */
+
+  c = (float__N__)((i+__NN__)*dx,
+		   );
+
+  //k1 = f(t,y)
+  //k2 = f(t + dt/2, y + dt/2 * k1)
+  //k3 = f(t + dt/2, y + dt/2 * k2)
+  //k4 = f(t + dt, y + dt * k3)
+  //result = y + dt/6( k1 + 2 * k2 + 2 * k3 + k4)
+
+  k = (float__N__)(gvelo_loc[noBC_id(i+__NN__)],
+		   );
+
+  p = (c + hdt * k) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  kn = p*(vp-v) + v;
+
+  k += 2.0 * kn;
+
+  p = (c + hdt * kn) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  kn = p*(vp-v) + v;
+
+  k += 2.0 * kn;
+
+  p = (c + dt * kn) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  kn = p*(vp-v) + v;
+
+  k += kn;
+
+
+  return c + (float__N__)(dt / 6.0) * k;
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl
new file mode 100644
index 000000000..b571c4bc4
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl
@@ -0,0 +1,82 @@
+/**
+ * @file basic_rk4_noVec.cl
+ * Advection function (RK4 scheme), basic version
+ */
+
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
+
+
+/**
+ * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
+ * Use of builtin OpenCL functions fma and mix.
+ *
+ * @param i Particle index.
+ * @param dt Time step.
+ * @param dx Space step.
+ * @param invdx 1/dx.
+ * @param gvelo Global velocity field.
+ * @return Particle position
+ *
+ * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last.
+ */
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
+{
+  float v, 			/* Velocity at point */
+    vp,				/* Velocity at right point */
+    p,				/* Intermediary position */
+    k,				/* rk averaged velocity */
+    kn,				/* rk intermediate velocity */
+    c = i * dx,			/* initial coordinate */
+    hdt = 0.5 * dt;		/* half time step */
+  int i_ind,			/* Interpolation left point */
+    i_ind_p;			/* Interpolation right point */
+
+  //k1 = f(t,y)
+  //k2 = f(t + dt/2, y + dt/2 * k1)
+  //k3 = f(t + dt/2, y + dt/2 * k2)
+  //k4 = f(t + dt, y + dt * k3)
+  //result = y + dt/6( k1 + 2 * k2 + 2 * k3 + k4)
+
+  k = gvelo_loc[noBC_id(i)]; 	/* k = k1 */
+
+  p = (c + hdt * k) * invdx;
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = gvelo_loc[noBC_id(i_ind)];
+  vp = gvelo_loc[noBC_id(i_ind_p)];
+  kn = p*(vp-v) + v;		/* kn = k2 */
+
+  k += 2.0 * kn;		/* k = k1 + 2*k2 */
+
+  p = (c + hdt * kn) * invdx;
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = gvelo_loc[noBC_id(i_ind)];
+  vp = gvelo_loc[noBC_id(i_ind_p)];
+  kn = p*(vp-v) + v;		/* kn = k3 */
+
+  k += 2.0 * kn;		/* k = k1 + 2*k2 + 2*k3 */
+
+  p = (c + dt * kn) * invdx;
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = gvelo_loc[noBC_id(i_ind)];
+  vp = gvelo_loc[noBC_id(i_ind_p)];
+  kn = p*(vp-v) + v;		/* kn = k4 */
+
+  k += kn;			/* k = k1 + 2*k2 + 2*k3 + k4 */
+
+  return c + dt * k / 6.0;
+}
+/* Operations number :  */
+/*   - 4 positions = 4 * 2 + 3 */
+/*   - 3 iterpolation = 3 * 9 */
+/*   - velocity weights = 5*/
+/* Total = 41 */
+
diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin.cl b/HySoP/hysop/gpu/cl_src/advection/builtin.cl
index cadc6085b..80757b014 100644
--- a/HySoP/hysop/gpu/cl_src/advection/builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/advection/builtin.cl
@@ -25,25 +25,28 @@ float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gve
 float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
 {
   float__N__ v,        		/* Velocity at point */
-    p,				/* Intermediary position */
     vp,				/* Velocity at right point */
-    y,				/* Normalized distance to left point */
-    coord;			/* Grid point coordinate */
-  int__N__ ind,        		/* Integer coordinate (cell id) */
-    i_ind,			/* Interpolation left point */
+    p,				/* Intermediary position */
+    c,				/* initial coordinate */
+    hdt = (float__N__)(0.5*dt);	/* half time step */
+  int__N__ i_ind,		/* Interpolation left point */
     i_ind_p;			/* Interpolation right point */
+
+  c = (float__N__)((i+__NN__)*dx,
+		       );
+
   v = (float__N__)(gvelo_loc[noBC_id(i+__NN__)],
 		   );
-  coord = (float__N__)((i+__NN__)*dx,
-		       );
-  p = fma((float__N__)(0.5*dt),v,coord);
-  ind = convert_int__N___rtn(p * invdx);
-  y = (fma(- convert_float__N__(ind),dx,p))* invdx ;
-  i_ind = ((ind + NB_I) % NB_I);
+
+  p = fma(hdt, v, c) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
   i_ind_p = ((i_ind + 1) % NB_I);
   v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
 		   );
   vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
 		    );
-  return fma(mix(v,vp,y),dt,coord);
+  return fma(mix(v,vp,p),dt,c);
 }
diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl
index 5460cd3fe..a6ba8b975 100644
--- a/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl
@@ -3,14 +3,14 @@
  * Advection function, basic version
  */
 
-float advection(uint i, float dt, float dx, float invdx, __global float* gvelo);
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
 
 
 /**
  * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
  * Use of builtin OpenCL functions fma and mix.
  *
- * @param i Velocity.
+ * @param i Particle index.
  * @param dt Time step.
  * @param dx Space step.
  * @param invdx 1/dx.
@@ -19,23 +19,31 @@ float advection(uint i, float dt, float dx, float invdx, __global float* gvelo);
  *
  * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last.
  */
-float advection(uint i, float dt, float dx, float invdx, __global float* gvelo)
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
 {
   float v, 			/* Velocity at point */
     p,				/* Intermediary position */
-    vp,				/* Velocity at right point */
-    y;				/* Normalized distance to left point */
-  int ind, 			/* Integer coordinate (cell id) */
-    i_ind,			/* Interpolation left point */
+    c = i * dx,			/* initial coordinate */
+    hdt = 0.5 * dt;		/* half time step */
+  int i_ind,			/* Interpolation left point */
     i_ind_p;			/* Interpolation right point */
-  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
-  v = gvelo[i+line_index];
-  p = fma((float)(0.5*dt),v,i*dx);
-  ind = convert_int_rtn(p * invdx);
-  y = (fma(- convert_float(ind),dx,p))* invdx ;
-  i_ind = ((ind + NB_I) % NB_I);
+
+  v = gvelo_loc[noBC_id(i)];
+
+  p = fma(hdt, v, c) * invdx;
+
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
   i_ind_p = ((i_ind + 1) % NB_I);
-  return fma(mix(gvelo[i_ind+line_index],
-                gvelo[i_ind_p+line_index],y),
-            dt,i*dx);
+  v = mix(gvelo_loc[noBC_id(i_ind)],
+	  gvelo_loc[noBC_id(i_ind_p)],p);
+
+  return fma(dt, v, c);
 }
+/* Operations number :  */
+/*   - 2 positions = 2 * fma */
+/*   - 1 iterpolation = 6 + 1 * mix */
+/* Total = 2 fma + 1 mix + 6 */
+
diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl
new file mode 100644
index 000000000..a67c69c16
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl
@@ -0,0 +1,87 @@
+/**
+ * @file builtin_rk4.cl
+ * Advection function, vectorized version.
+ */
+
+float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
+
+
+/**
+ * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
+ * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types.
+ *
+ * @param i Particle index.
+ * @param dt Time step.
+ * @param dx Space step.
+ * @param invdx 1/dx.
+ * @param gvelo_loc Local velocity cache.
+ * @return Particle position.
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ */
+float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
+{
+  float__N__ v,        		/* Velocity at point */
+    vp,				/* Velocity at right point */
+    p,				/* Intermediary position */
+    k,				/* rk averaged velocity */
+    kn,				/* rk intermediate velocity */
+    c,				/* initial coordinate */
+    hdt = (float__N__)(0.5*dt);	/* half time step */
+  int__N__ i_ind,		/* Interpolation left point */
+    i_ind_p;			/* Interpolation right point */
+
+  c = (float__N__)((i+__NN__)*dx,
+		       );
+
+  k = (float__N__)(gvelo_loc[noBC_id(i+__NN__)],
+		   );
+
+  p = fma(hdt, k, c) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  kn = mix(v,vp,p);
+
+  k += 2.0 * kn;
+
+  p = fma(hdt, kn, c) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  kn = mix(v,vp,p);
+
+  k += 2.0 * kn;
+
+  p = fma((float__N__)(dt), kn, c) * invdx;
+  i_ind = convert_int__N___rtn(p);
+  p = p - convert_float__N__(i_ind);
+
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)],
+		   );
+  vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)],
+		    );
+  kn = mix(v,vp,p);
+
+  k += kn;
+
+
+  return fma(k,(float__N__)(dt/6.0),c);
+}
diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl
new file mode 100644
index 000000000..da79c8950
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl
@@ -0,0 +1,78 @@
+/**
+ * @file builtin_rk4_noVec.cl
+ * Advection function, basic version
+ */
+
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc);
+
+
+/**
+ * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field.
+ * Use of builtin OpenCL functions fma and mix.
+ *
+ * @param i Particle index.
+ * @param dt Time step.
+ * @param dx Space step.
+ * @param invdx 1/dx.
+ * @param gvelo Global velocity field.
+ * @return Particle position
+ *
+ * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last.
+ */
+float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc)
+{
+  float p,		       /* Intermediary position */
+    k,			       /* rk averaged velocity */
+    kn,			       /* rk intermediate velocity */
+    c = i * dx,		       /* initial coordinate */
+    hdt = 0.5 * dt;	       /* half time step */
+  int i_ind,		       /* Interpolation left point */
+    i_ind_p;		       /* Interpolation right point */
+
+  //k1 = f(t,y)
+  //k2 = f(t + dt/2, y + dt/2 * k1)
+  //k3 = f(t + dt/2, y + dt/2 * k2)
+  //k4 = f(t + dt, y + dt * k3)
+  //result = y + dt/6( k1 + 2 * k2 + 2 * k3 + k4)
+
+  k = gvelo_loc[noBC_id(i)]; 	/* k = k1 */
+
+
+  p = fma(hdt, k, c) * invdx;
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  kn = mix(gvelo_loc[noBC_id(i_ind)],
+	   gvelo_loc[noBC_id(i_ind_p)],p);		/* kn = k2 */
+
+  k += 2.0 * kn;		/* k = k1 + 2*k2 */
+
+  p = fma(hdt, kn, c) * invdx;
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  kn = mix(gvelo_loc[noBC_id(i_ind)],
+	   gvelo_loc[noBC_id(i_ind_p)],p);		/* kn = k3 */
+
+  k += 2.0 * kn;		/* k = k1 + 2*k2 + 2*k3 */
+
+  p = fma(dt, kn, c) * invdx;
+  i_ind = convert_int_rtn(p);
+  p = p - convert_float(i_ind);
+  i_ind = ((i_ind + NB_I) % NB_I);
+  i_ind_p = ((i_ind + 1) % NB_I);
+  kn = mix(gvelo_loc[noBC_id(i_ind)],
+	   gvelo_loc[noBC_id(i_ind_p)],p);		/* kn = k4 */
+
+  k += kn;			/* k = k1 + 2*k2 + 2*k3 + k4 */
+
+  return fma(k, dt/6.0, c);
+}
+
+/* Operations number :  */
+/*   - 4 positions = 4 * fma  + 3 */
+/*   - 3 iterpolation = 3 * (1 * mix + 6) */
+/*   - velocity weights = 5*/
+/* Total = 4 fma + 3 mix + 19 */
diff --git a/HySoP/hysop/gpu/cl_src/common.cl b/HySoP/hysop/gpu/cl_src/common.cl
index ba88801f0..db36ee0c8 100644
--- a/HySoP/hysop/gpu/cl_src/common.cl
+++ b/HySoP/hysop/gpu/cl_src/common.cl
@@ -32,7 +32,7 @@ inline uint noBC_id(int id){
 
 /**
  * Constants for remeshing formulas:
- *   - M4PRIME 1
+ *   - L2STAR 1
  *   - L2STAR_C2 2
  *   - L2STAR_C3 3
  *   - L2STAR_C4 4
@@ -46,7 +46,7 @@ inline uint noBC_id(int id){
  *   - M8PRIME 12
  *   - L8STAR 13
  */
-#define M4PRIME 1
+#define L2STAR 1
 #define L2STAR_C2 2
 #define L2STAR_C3 3
 #define L2STAR_C4 4
@@ -63,7 +63,7 @@ inline uint noBC_id(int id){
 /**
  * Shift to left grid point
  */
-#if FORMULA == M4PRIME
+#if FORMULA == L2STAR
 #define REMESH_SHIFT 1
 #elif FORMULA == L2STAR_C2
 #define REMESH_SHIFT 1
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl
index e147fb18f..225476a85 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl
@@ -34,17 +34,16 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
   float invdx = 1.0/dx;		/* Space step inverse */
   uint i;			/* Particle index in 1D problem */
   float p,			/* Particle position */
-    s,				/* Particle scalar */
-    v;				/* Particle velocity */
+    s;				/* Particle scalar */
   uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
   __local float gscal_loc[NB_I]; /* Local buffer for result */
+  __local float gvelo_loc[NB_I]; /* Velocity cache */
 
   for(i=gidX; i<NB_I; i+=(WI_NB))
     {
       /* Read velocity */
-      v = gvelo[i+line_index];
       /* Fill velocity cache */
-      gvelo_loc[noBC_id(i)] = v;
+      gvelo_loc[noBC_id(i)] = gvelo[i+line_index];
       /* Initialize result buffer */
       gscal_loc[noBC_id(i)] = 0.0;
     }
@@ -57,7 +56,7 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       /* Read Particle scalar */
       s = pscal[i + line_index];
       /* Compute particle position */
-      p = advection(i, dt, dx, invdx, gvelo);
+      p = advection(i, dt, dx, invdx, gvelo_loc);
       /* Remesh particle */
       remesh(i, dx, invdx, s, p, gscal_loc);
     }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl
index ef2371559..6a27cdd91 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl
@@ -26,9 +26,21 @@ __kernel void advection_kernel(__global const float* gvelo,
   uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
   float invdx = 1.0/dx;		/* Space step inverse */
   uint i;			/* Particle index in 1D problem */
+  uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
+
+  __local float velocity_cache[NB_I]; /* Velocity cache */
+
+  for(i=gidX; i<NB_I; i+=WI_NB)
+    {
+      /* Fill the cache */
+      velocity_cache[noBC_id(i)] = gvelo[i+line_index];
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
 
   for(i=gidX; i<NB_I; i+=WI_NB)
     {
-      ppos[i+gidY*NB_I+gidZ*NB_I*NB_II] = advection(i, dt, dx, invdx, gvelo);
+      ppos[i+line_index] = advection(i, dt, dx, invdx, velocity_cache) + min_position;
     }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl
index 869ce36a7..e9d79c51b 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl
@@ -13,7 +13,7 @@
 __kernel void copy(__global const float* in,
 		   __global float* out)
 {
-  uint xIndex = (get_group_id(0) * TILE_DIM_COPY + get_local_id(0)*__N__);
+  uint xIndex = get_group_id(0) * TILE_DIM_COPY + get_local_id(0);
   uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1);
   uint zIndex = get_global_id(2);
   uint index = xIndex + yIndex * NB_I + zIndex*NB_I*NB_II;
diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl
new file mode 100644
index 000000000..188fc0fa9
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl
@@ -0,0 +1,66 @@
+/**
+ * @file remeshing.cl
+ * Remeshing kernel.
+ */
+
+/**
+ * Performs remeshing of the particles' scalar.
+ * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group.
+ * Each work-item computes <code>NB_I/WI_NB</code> particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles.
+ * Particle are computed through OpenCL vector types of lenght 2, 4 or 8.
+ * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer.
+ *
+ * @param ppos Particle position
+ * @param pscal Particle scalar
+ * @param gscal Grid scalar
+ * @param min_position Domain lower coordinate
+ * @param dx Space step
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>WI_NB</code> corresponds to the work-item number.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ */
+__kernel void remeshing_kernel(__global const float* ppos,
+			       __global const float* pscal,
+			       __global float* gscal, float min_position, float dx)
+{
+  uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
+  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
+  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  float invdx = 1.0/dx;         /* Space step inverse */
+  uint i;			/* Particle index in 1D problem */
+  float p,			/* Particle position */
+    s;				/* Particle scalar */
+  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
+  __local float gscal_loc[NB_I]; /* Local buffer for result */
+
+  for(i=gidX; i<NB_I; i+=WI_NB)
+    {
+      /* Initialize result buffer */
+      gscal_loc[i] = 0.0;
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1)
+    {
+      /* Read particle position */
+      p = ppos[i + line_index] - min_position;
+      /* Read particle scalar */
+      s = pscal[i + line_index];
+      /* Remesh particle */
+      remesh(i, dx, invdx, s, p, gscal_loc);
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=gidX; i<NB_I; i+=WI_NB)
+    {
+      /* Store result */
+      gscal[i + line_index] = gscal_loc[noBC_id(i)];
+  }
+}
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
index acbd8405f..418a9d1ec 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
@@ -37,6 +37,8 @@ __kernel void transpose_xy(__global const float* in,
   float__N__ temp;			/* Temporary variable */
   uint group_id_x;			/* Work-group coordinate in global space index X */
   uint group_id_y;			/* Work-group coordinate in global space index Y */
+  uint lid_x = get_local_id(0);
+  uint lid_y = get_local_id(1);
   /* Use of diagonal coordinates */
 #if NB_II == NB_I
   group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
@@ -48,14 +50,14 @@ __kernel void transpose_xy(__global const float* in,
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*__N__;
-  uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1);
+  uint xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__;
+  uint yIndex = group_id_y * TILE_DIM_XY + lid_y;
   uint zIndex = get_global_id(2);
   uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
 
   /* Global output index */
-  xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*__N__;
-  yIndex = group_id_x * TILE_DIM_XY + get_local_id(1);
+  xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__;
+  yIndex = group_id_x * TILE_DIM_XY + lid_y;
   uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
   __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
@@ -64,7 +66,7 @@ __kernel void transpose_xy(__global const float* in,
     {
       /* Fill the tile */
       temp = vload__N__((index_in + i * NB_II)/__N__, in);
-      tile[get_local_id(1) + i][get_local_id(0)*__N__+__NN__] = temp.s__NN__;
+      tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
     }
 
   /* Synchronize work-group */
@@ -73,7 +75,7 @@ __kernel void transpose_xy(__global const float* in,
   for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
     {
       /* Write transposed data */
-      temp = (float__N__)(tile[get_local_id(0)*__N__+__NN__][get_local_id(1) + i],
+      temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i],
 			  );
       vstore__N__(temp, (index_out + i*NB_I)/__N__, out);
     }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
new file mode 100644
index 000000000..87e72c1e8
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
@@ -0,0 +1,79 @@
+/**
+ * @file transpose_xy.cl
+ * Transposition in XY plane, coalesced, diagonal coordinates, vectorized version.
+ */
+
+/**
+ * Performs a transposition in xy plane.
+ * Optimizations used are:
+ *   - Coalesced reads and writes by means of local memory buffer (tile),
+ *   - Local memory padding to avoir banck conflicts (optional),
+ *   - Work groups are mapped to diagonal coordinates in global memory,
+ *   - Reads and writes are performed by OpenCL vector types.
+ *
+ * A work group handle transposition for a tile. Transposition is done when reading data in tile.
+ * Work-group layout: \code
+ * ________________________
+ * |0,0 | 1,0 | ...
+ * |N,0 | 0,1 | 1,2 | ...
+ * | .     .  | 0,2 | ...
+ * | .     .
+ * | .     .
+ * |
+ * \endcode
+ *
+ * @param in Input data
+ * @param out Output data
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>.
+ * @remark <code>PADDING_XY</code> : local memory padding width.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ */
+__kernel void transpose_xy(__global const float* in,
+			   __global float* out)
+{
+  uint group_id_x;			/* Work-group coordinate in global space index X */
+  uint group_id_y;			/* Work-group coordinate in global space index Y */
+  uint lid_x = get_local_id(0);
+  uint lid_y = get_local_id(1);
+  /* Use of diagonal coordinates */
+#if NB_II == NB_I
+  group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+  group_id_y = get_group_id(0);
+#else
+  uint bid = get_group_id(0) + get_group_id(1) * get_num_groups(0);
+  group_id_y = bid%get_num_groups(1);
+  group_id_x = ((bid/get_num_groups(1)) + group_id_y)%get_num_groups(0);
+#endif
+
+  /* Global input index for work-item */
+  uint xIndex = group_id_x * TILE_DIM_XY + lid_x;
+  uint yIndex = group_id_y * TILE_DIM_XY + lid_y;
+  uint zIndex = get_global_id(2);
+  uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
+
+  /* Global output index */
+  xIndex = group_id_y * TILE_DIM_XY + lid_x;
+  yIndex = group_id_x * TILE_DIM_XY + lid_y;
+  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
+
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      /* Fill the tile */
+      tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
+    }
+
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
+    {
+      /* Write transposed data */
+      out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
+    }
+}
+
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
index 743b4db43..b116a9551 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
@@ -25,8 +25,12 @@
 __kernel void transpose_xz(__global const float* in,
 			   __global float* out)
 {
+  float__N__ temp;			/* Temporary variable */
   uint group_id_x;			/* Work-group coordinate in global space index X */
   uint group_id_z;			/* Work-group coordinate in global space index Y */
+  uint lid_x = get_local_id(0);
+  uint lid_y = get_local_id(1);
+  uint lid_z = get_local_id(2);
 
   /* Use of diagonal coordinates */
 #if NB_III == NB_I
@@ -39,36 +43,39 @@ __kernel void transpose_xz(__global const float* in,
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XZ + get_local_id(0);
-  uint yIndex = get_group_id(1) * TILE_DIM_XZ + get_local_id(1);
-  uint zIndex = group_id_z * TILE_DIM_XZ + get_local_id(2);
+  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
+  uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y;
+  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
   uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
   /* Global output index */
-  xIndex = group_id_z * TILE_DIM_XZ + get_local_id(0);
-  zIndex = group_id_x * TILE_DIM_XZ + get_local_id(2);
+  xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
+  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
   uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
   __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ)
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
     {
       for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
 	{
 	  /* Fill the tile */
-	  tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*NB_III + j*NB_III*NB_II];
+	  temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
+	  tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
 	}
 
     }
   /* Synchronize work-group */
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ)
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
     {
       for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
 	{
 	  /* Write transposed data */
-	  out[index_out + i*NB_I + j*NB_I*NB_II] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j];
+	  temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j],
+			      );
+	  vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out);
 	}
     }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
new file mode 100644
index 000000000..28181ea0f
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
@@ -0,0 +1,77 @@
+/**
+ * @file transpose_xz.cl
+ * Transposition in XZ plane, coalesced, diagonal coordinates, 3D tiles.
+ */
+
+/**
+ * Perfoms a transposition in XZ plane. As data have to be contiguously read an write in global memory, we use a 3D tile.
+ * Optimizations used are:
+ *   - Coalesced reads and writes by means of local memory buffer (tile),
+ *   - Local memory padding to avoir banck conflicts (optional),
+ *   - Work groups are mapped to diagonal coordinates in global memory,
+ *   - Reads and writes are performed by OpenCL vector types.
+ *
+ *
+ * @param in Input data
+ * @param out Output data
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>.
+ * @remark <code>PADDING_XZ</code> : local memory padding width.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ * @see transpose_xy.cl
+ */
+__kernel void transpose_xz(__global const float* in,
+			   __global float* out)
+{
+  uint group_id_x;			/* Work-group coordinate in global space index X */
+  uint group_id_z;			/* Work-group coordinate in global space index Y */
+  uint lid_x = get_local_id(0);
+  uint lid_y = get_local_id(1);
+  uint lid_z = get_local_id(2);
+
+  /* Use of diagonal coordinates */
+#if NB_III == NB_I
+  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
+  group_id_z = get_group_id(0);
+#else
+  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
+  group_id_z = bid%get_num_groups(2);
+  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+#endif
+
+  /* Global input index for work-item */
+  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+  uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y;
+  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+
+  /* Global output index */
+  xIndex = group_id_z * TILE_DIM_XZ + lid_x;
+  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
+    {
+      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
+	{
+	  /* Fill the tile */
+	  tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II];
+	}
+
+    }
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
+    {
+      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
+	{
+	  /* Write transposed data */
+	  out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j];
+	}
+    }
+}
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
new file mode 100644
index 000000000..611aa465b
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
@@ -0,0 +1,74 @@
+/**
+ * @file transpose_xz.cl
+ * Transposition in XZ plane, coalesced, diagonal coordinates, 3D tiles.
+ */
+
+/**
+ * Perfoms a transposition in XZ plane. As data have to be contiguously read an write in global memory, we use a 3D tile.
+ * Optimizations used are:
+ *   - Coalesced reads and writes by means of local memory buffer (tile),
+ *   - Local memory padding to avoir banck conflicts (optional),
+ *   - Work groups are mapped to diagonal coordinates in global memory,
+ *   - Reads and writes are performed by OpenCL vector types.
+ *
+ *
+ * @param in Input data
+ * @param out Output data
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>.
+ * @remark <code>PADDING_XZ</code> : local memory padding width.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ * @see transpose_xy.cl
+ */
+__kernel void transpose_xz(__global const float* in,
+			   __global float* out)
+{
+  float__N__ temp;			/* Temporary variable */
+  uint group_id_x;			/* Work-group coordinate in global space index X */
+  uint group_id_z;			/* Work-group coordinate in global space index Y */
+  uint lid_x = get_local_id(0);
+  uint lid_z = get_local_id(2);
+
+  /* Use of diagonal coordinates */
+#if NB_III == NB_I
+  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
+  group_id_z = get_group_id(0);
+#else
+  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
+  group_id_z = bid%get_num_groups(2);
+  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+#endif
+
+  /* Global input index for work-item */
+  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
+  uint yIndex = get_global_id(1);
+  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+
+  /* Global output index */
+  xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
+  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
+    {
+      /* Fill the tile */
+      temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in);
+      tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__;
+
+    }
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
+    {
+      /* Write transposed data */
+      temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j],
+			  );
+      vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out);
+    }
+}
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
new file mode 100644
index 000000000..fc4b1322a
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
@@ -0,0 +1,70 @@
+/**
+ * @file transpose_xz.cl
+ * Transposition in XZ plane, coalesced, diagonal coordinates, 3D tiles.
+ */
+
+/**
+ * Perfoms a transposition in XZ plane. As data have to be contiguously read an write in global memory, we use a 3D tile.
+ * Optimizations used are:
+ *   - Coalesced reads and writes by means of local memory buffer (tile),
+ *   - Local memory padding to avoir banck conflicts (optional),
+ *   - Work groups are mapped to diagonal coordinates in global memory,
+ *   - Reads and writes are performed by OpenCL vector types.
+ *
+ *
+ * @param in Input data
+ * @param out Output data
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>.
+ * @remark <code>PADDING_XZ</code> : local memory padding width.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @see parmepy.gpu.tools.parse_file
+ * @see transpose_xy.cl
+ */
+__kernel void transpose_xz(__global const float* in,
+			   __global float* out)
+{
+  uint group_id_x;			/* Work-group coordinate in global space index X */
+  uint group_id_z;			/* Work-group coordinate in global space index Y */
+  uint lid_x = get_local_id(0);
+  uint lid_z = get_local_id(2);
+
+  /* Use of diagonal coordinates */
+#if NB_III == NB_I
+  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
+  group_id_z = get_group_id(0);
+#else
+  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
+  group_id_z = bid%get_num_groups(2);
+  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+#endif
+
+  /* Global input index for work-item */
+  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+  uint yIndex = get_global_id(1);
+  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+
+  /* Global output index */
+  xIndex = group_id_z * TILE_DIM_XZ + lid_x;
+  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
+    {
+      /* Fill the tile */
+      tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
+
+    }
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
+    {
+      /* Write transposed data */
+      out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
+    }
+}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl
index b43a4adb7..a9d67638d 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl
@@ -31,6 +31,8 @@ void remesh(uint i, float dx, float invdx, float s, float p, __local float* gsca
   int ind;			/* Integer coordinate */
   uint index;		/* Remeshing index */
 
+
+
   ind = convert_int_rtn(p * invdx);
   y = (p - convert_float(ind) * dx) * invdx;
 
diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl
new file mode 100644
index 000000000..df1538d86
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl
@@ -0,0 +1,95 @@
+/**
+ * @file private.cl
+ * Remeshing function, vectorized, private variable.
+ */
+
+void remesh(uint i, float dx, float invdx, float s, float p, __local float* gscal_loc);
+
+
+/**
+ * Remesh particles in local buffer.
+ *
+ * Remeshing formula is given a compiling time.
+ * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types.
+ * Use of a private temporary variable for remeshing weights.
+ *
+ * @param i Particle index
+ * @param dx Space step
+ * @param invdx 1/dx
+ * @param s Particle scalar
+ * @param p Particle position
+ * @param gscal_loc Local buffer for result
+ *
+ * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>__N__</code> is expanded at compilation time by vector width.
+ * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
+ * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>}
+ * @see parmepy.gpu.tools.parse_file
+ * @see parmepy.gpu.cl_src.common
+ */
+void remesh(uint i, float dx, float invdx, float s, float p, __local float* gscal_loc){
+  float y,			/* Normalized distance to nearest left grid point */
+    temp;			/* Temporary remeshing weights */
+  int ind;			/* Integer coordinate */
+  uint index;		/* Remeshing index */
+
+  ind = convert_int_rtn(p * invdx);
+  y = (p - convert_float(ind) * dx) * invdx;
+
+  index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I);
+
+  temp = alpha(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  index = (index + 1) % NB_I;
+  temp = beta(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  index = (index + 1) % NB_I;
+  temp = gamma(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  index = (index + 1) % NB_I;
+  temp = delta(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REMESH_SHIFT > 1
+  index = (index + 1) % NB_I;
+  temp = eta(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  index = (index + 1) % NB_I;
+  temp = zeta(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 2
+  index = (index + 1) % NB_I;
+  temp = theta(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  index = (index + 1) % NB_I;
+  temp = iota(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#if REMESH_SHIFT > 3
+  index = (index + 1) % NB_I;
+  temp = kappa(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  index = (index + 1) % NB_I;
+  temp = mu(y) * s;
+  gscal_loc[noBC_id(index)] += temp;
+  barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl
index fec13ff85..4e352a7e5 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl
@@ -4,7 +4,7 @@
  * Polynomials under Horner form.
  */
 
-#if FORMULA == M4PRIME
+#if FORMULA == L2STAR
 
 inline float__N__ alpha(float__N__ y){
   return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl
index 910aafda3..7f8bf2a80 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl
@@ -4,7 +4,7 @@
  * Polynomials under Horner form.
  */
 
-#if FORMULA == M4PRIME
+#if FORMULA == L2STAR
 
 inline float__N__ alpha(float__N__ y){
   return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl
index 70782b26a..f3e2e950f 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl
@@ -4,7 +4,7 @@
  * Polynomials under Horner form.
  */
 
-#if FORMULA == M4PRIME
+#if FORMULA == L2STAR
 
 inline float alpha(float y){
   return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);}
diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl
index 69de73a5e..0a703d65a 100644
--- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl
+++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl
@@ -4,7 +4,7 @@
  * Polynomials under Horner form.
  */
 
-#if FORMULA == M4PRIME
+#if FORMULA == L2STAR
 
 inline float alpha(float y){
   return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);}
diff --git a/HySoP/hysop/gpu/config_cayman.py b/HySoP/hysop/gpu/config_cayman.py
new file mode 100644
index 000000000..d537bec97
--- /dev/null
+++ b/HySoP/hysop/gpu/config_cayman.py
@@ -0,0 +1,155 @@
+"""
+@file config_cayman.py
+
+OpenCL kernels configurations.
+"""
+from parmepy.gpu import PARMES_REAL_GPU, PARMES_DOUBLE_GPU
+
+#build empty dictionaries
+kernels_config = {}
+kernels_config[2] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}}
+kernels_config[3] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}}
+
+# Copy kernel:
+def copy_space_index_2d(size, t_dim, b_rows, vec):
+    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1)
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+def copy_space_index_3d(size, t_dim, b_rows, vec):
+    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2]))
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+# Configs : sources, tile size, block rows, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['copy'] = \
+    ('kernels/copy.cl', 16, 8, 4, copy_space_index_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['copy'] = \
+    ('kernels/copy_locMem.cl', 32, 8, 1, copy_space_index_3d)
+kernels_config[2][PARMES_REAL_GPU]['copy'] = \
+    ('kernels/copy.cl', 16, 8, 2, copy_space_index_2d)
+kernels_config[2][PARMES_DOUBLE_GPU]['copy'] = \
+    ('kernels/copy.cl', 32, 2, 2, copy_space_index_2d)
+
+# Transpositions kernels:
+# XY transposition
+# Settings are taken from destination layout as current layout.
+# gwi is computed form input layout (appears as transposed layout)
+def xy_space_index_2d(size, t_dim, b_rows, vec):
+    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1)
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+def xy_space_index_3d(size, t_dim, b_rows, vec):
+    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2]))
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+# Configs : sources, tile size, block rows, is padding, vector size,
+#              index space function
+kernels_config[3][PARMES_REAL_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy.cl', 16, 8, True, 2, xy_space_index_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy.cl', 32, 4, True, 4, xy_space_index_3d)
+kernels_config[2][PARMES_REAL_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy.cl', 32, 8, True, 4, xy_space_index_2d)
+kernels_config[2][PARMES_DOUBLE_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy.cl', 32, 2, True, 4, xy_space_index_2d)
+
+# XZ transposition
+# Settings are taken from destination layout as current layout.
+# gwi is computed form input layout (appears as transposed layout)
+def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec):
+    gwi = (int(size[2] / vec), int(b_rows * size[1] / t_dim), int(b_deph * size[0] / t_dim))
+    lwi = (t_dim / vec, b_rows, b_deph)
+    return gwi, lwi
+# Configs : sources, tile size, block rows, block depth, is padding,
+#              vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['transpose_xz'] = \
+    ('kernels/transpose_xz.cl', 16, 4, 4, True, 1, xy_space_index_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xz'] = \
+    ('kernels/transpose_xz.cl', 8, 2, 2, False, 1, xy_space_index_3d)
+
+
+def computational_kernels_index_space(size, vec):
+    dim = len(size)
+    if dim == 3:
+        wi = 64
+    if dim == 2:
+        wi = 256
+    # Change work-item regarding problem size
+    if size[0] % wi > 0:
+        if dim == 3:
+            print "Warning : GPU best performances obtained for",
+            print "problem sizes multiples of 64"
+        else:
+            print "Warning : GPU best performances obtained for",
+            print "problem sizes multiples of 256"
+    while(size[0] % wi > 0):
+        wi = wi / 2
+    # Change work-item regarding vector_width
+    if wi * vec > size[0]:
+        if size[0] % vec > 0:
+            raise ValueError(
+                "Resolution ({0}) must be a multiple of {1}".format(
+                    size[0], vec))
+        wi = size[0] // vec
+    if dim == 3:
+        gwi = (int(wi), int(size[1]), int(size[2]))
+        lwi = (int(wi), 1, 1)
+    else:
+        gwi = (int(wi), int(size[1]))
+        lwi = (int(wi), 1)
+    return gwi, lwi
+
+# Advection kernel
+# Configs sources, is noBC, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['advec'] = \
+    (["common.cl", "advection/builtin.cl", "kernels/advection.cl"],
+     False, 4, computational_kernels_index_space)
+kernels_config[3][PARMES_DOUBLE_GPU]['advec'] = \
+    (["common.cl", "advection/builtin.cl", "kernels/advection.cl"],
+     False, 2, computational_kernels_index_space)
+kernels_config[2][PARMES_REAL_GPU]['advec'] = \
+    (["common.cl", "advection/builtin.cl", "kernels/advection.cl"],
+     False, 4, computational_kernels_index_space)
+kernels_config[2][PARMES_DOUBLE_GPU]['advec'] = \
+    (["common.cl", "advection/builtin_noVec.cl", "kernels/advection_noVec.cl"],
+     False, 1, computational_kernels_index_space)
+
+# Remeshing kernel
+# Configs sources, is noBC, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl",
+      "kernels/remeshing.cl"],
+     False, 4, computational_kernels_index_space)
+kernels_config[3][PARMES_DOUBLE_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl",
+      "kernels/remeshing.cl"],
+     False, 4, computational_kernels_index_space)
+kernels_config[2][PARMES_REAL_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl",
+      "kernels/remeshing.cl"],
+     True, 4, computational_kernels_index_space)
+kernels_config[2][PARMES_DOUBLE_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl",
+      "kernels/remeshing.cl"],
+     True, 4, computational_kernels_index_space)
+
+# Advection and remeshing kernel
+# Configs sources, is noBC, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl",
+      "advection/builtin.cl", "kernels/advection_and_remeshing.cl"],
+     False, 4, computational_kernels_index_space)
+kernels_config[3][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl",
+      "advection/builtin.cl", "kernels/advection_and_remeshing.cl"],
+     True, 4, computational_kernels_index_space)
+kernels_config[2][PARMES_REAL_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl",
+      "advection/builtin.cl", "kernels/advection_and_remeshing.cl"],
+     True, 8, computational_kernels_index_space)
+kernels_config[2][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl",
+      "advection/builtin.cl", "kernels/advection_and_remeshing.cl"],
+     True, 4, computational_kernels_index_space)
+
+
+
diff --git a/HySoP/hysop/gpu/config_k20m.py b/HySoP/hysop/gpu/config_k20m.py
new file mode 100644
index 000000000..726a79b12
--- /dev/null
+++ b/HySoP/hysop/gpu/config_k20m.py
@@ -0,0 +1,161 @@
+"""
+@file config_k20m.py
+
+OpenCL kernels configurations.
+"""
+from parmepy.gpu import PARMES_REAL_GPU, PARMES_DOUBLE_GPU
+
+#build empty dictionaries
+kernels_config = {}
+kernels_config[2] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}}
+kernels_config[3] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}}
+
+# Copy kernel:
+def copy_space_index_2d(size, t_dim, b_rows, vec):
+    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1)
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+def copy_space_index_3d(size, t_dim, b_rows, vec):
+    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2]))
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+# Configs : sources, tile size, block rows, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['copy'] = \
+    ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['copy'] = \
+    ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_3d)
+kernels_config[2][PARMES_REAL_GPU]['copy'] = \
+    ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_2d)
+kernels_config[2][PARMES_DOUBLE_GPU]['copy'] = \
+    ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_2d)
+
+# Transpositions kernels:
+# XY transposition
+# Settings are taken from destination layout as current layout.
+# gwi is computed form input layout (appears as transposed layout)
+def xy_space_index_2d(size, t_dim, b_rows, vec):
+    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1)
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+def xy_space_index_3d(size, t_dim, b_rows, vec):
+    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2]))
+    lwi = (t_dim / vec, b_rows, 1)
+    return gwi, lwi
+# Configs : sources, tile size, block rows, is padding, vector size,
+#              index space function
+kernels_config[3][PARMES_REAL_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy_noVec.cl', 32, 4, True, 1, xy_space_index_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy_noVec.cl', 32, 16, True, 1, xy_space_index_3d)
+kernels_config[2][PARMES_REAL_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy_noVec.cl', 32, 2, True, 1, xy_space_index_2d)
+kernels_config[2][PARMES_DOUBLE_GPU]['transpose_xy'] = \
+    ('kernels/transpose_xy_noVec.cl', 32, 8, True, 1, xy_space_index_2d)
+
+# XZ transposition
+# Settings are taken from destination layout as current layout.
+# gwi is computed form input layout (appears as transposed layout)
+def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec):
+    gwi = (int(size[2] / vec), int(size[1]), int(b_deph * size[0] / t_dim))
+    lwi = (t_dim / vec, 1, b_deph)
+    return gwi, lwi
+# Configs : sources, tile size, block rows, is padding, vector size,
+#              index space function
+kernels_config[3][PARMES_REAL_GPU]['transpose_xz'] = \
+    ('kernels/transpose_xz_slice_noVec.cl', 32, 0, 8, True, 1, xz_space_index_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xz'] = \
+    ('kernels/transpose_xz_slice_noVec.cl', 32, 0, 4, True, 1, xz_space_index_3d)
+
+def computational_kernels_index_space(wi, size, vec):
+    # Change work-item regarding vector_width
+    if wi * vec > size[0]:
+        if size[0] % vec > 0:
+            raise ValueError(
+                "Resolution ({0}) must be a multiple of {1}".format(
+                    size[0], vec))
+        wi = size[0] // vec
+
+    if len(size) == 3:
+        gwi = (int(wi), int(size[1]), int(size[2]))
+        lwi = (int(wi), 1, 1)
+    else:
+        gwi = (int(wi), int(size[1]))
+        lwi = (int(wi), 1)
+    return gwi, lwi
+
+def advection_index_space_3d(size, vec):
+    wi = min(max(64, size[0] / 4), 1024)
+    return computational_kernels_index_space(wi, size, vec)
+def advection_index_space_2d_SP(size, vec):
+    wi = min(size[0] / 8, 1024)
+    return computational_kernels_index_space(wi, size, vec)
+def advection_index_space_2d_DP(size, vec):
+    wi = min(size[0] / 4, 1024)
+    return computational_kernels_index_space(wi, size, vec)
+
+def remeshing_index_space_3d(size, vec):
+    wi = min(size[0] / 2, 1024)
+    return computational_kernels_index_space(wi, size, vec)
+def remeshing_index_space_2d(size, vec):
+    wi = min(size[0] / 4, 1024)
+    return computational_kernels_index_space(wi, size, vec)
+
+def advection_and_remeshing_index_space(size, vec):
+    wi = min(size[0] / 2, 1024)
+    return computational_kernels_index_space(wi, size, vec)
+
+
+# Advection kernel
+# Configs sources, is noBC, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['advec'] = \
+    (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"],
+     False, 1, advection_index_space_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['advec'] = \
+    (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"],
+     False, 1, advection_index_space_3d)
+kernels_config[2][PARMES_REAL_GPU]['advec'] = \
+    (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"],
+     False, 1, advection_index_space_2d_SP)
+kernels_config[2][PARMES_DOUBLE_GPU]['advec'] = \
+    (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"],
+     False, 1, advection_index_space_2d_DP)
+
+# Remeshing kernel
+# Configs sources, is noBC, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic_noVec.cl",
+      "kernels/remeshing_noVec.cl"],
+     False, 1, remeshing_index_space_3d)
+kernels_config[3][PARMES_DOUBLE_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic_noVec.cl",
+      "kernels/remeshing_noVec.cl"],
+     False, 1, remeshing_index_space_3d)
+kernels_config[2][PARMES_REAL_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl",
+      "kernels/remeshing.cl"],
+     True, 2, remeshing_index_space_2d)
+kernels_config[2][PARMES_DOUBLE_GPU]['remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl",
+      "kernels/remeshing.cl"],
+     True, 2, remeshing_index_space_2d)
+
+
+# Advection and remeshing kernel
+# Configs sources, is noBC, vector size, index space function
+kernels_config[3][PARMES_REAL_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl",
+      "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"],
+     False, 1, advection_and_remeshing_index_space)
+kernels_config[3][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl",
+      "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"],
+     False, 1, advection_and_remeshing_index_space)
+kernels_config[2][PARMES_REAL_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl",
+      "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"],
+     False, 1, advection_and_remeshing_index_space)
+kernels_config[2][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \
+    (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl",
+      "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"],
+     False, 1, advection_and_remeshing_index_space)
+
diff --git a/HySoP/hysop/gpu/gpu_discrete.py b/HySoP/hysop/gpu/gpu_discrete.py
index 08e007365..9c6ad5071 100644
--- a/HySoP/hysop/gpu/gpu_discrete.py
+++ b/HySoP/hysop/gpu/gpu_discrete.py
@@ -51,7 +51,7 @@ class GPUDiscreteField(DiscreteField):
             self.gpu_data[d] = cl.Buffer(self.queue.context,
                                          cl.mem_flags.READ_WRITE,
                                          size=self.data[d].nbytes)
-            self.mem_size += self.gpu_data[d].size
+            self.mem_size += self.data[d].nbytes
         if __VERBOSE__:
             print self.name, self.mem_size, "Bytes (",
             print self.mem_size / (1024 ** 2), "MB)"
@@ -74,18 +74,6 @@ class GPUDiscreteField(DiscreteField):
                 vfield, queue,
                 vfield.topology, vfield.isVector, vfield.name,
                 precision)
-            vfield.gpu_data = [None] * vfield.nbComponents
-            vfield.layout = layout
-            for d in xrange(vfield.nbComponents):
-                vfield.data[d] = np.asarray(vfield.data[d],
-                                            dtype=precision, order=ORDER)
-                vfield.gpu_data[d] = cl.Buffer(vfield.queue.context,
-                                               cl.mem_flags.READ_WRITE,
-                                               size=vfield.data[d].nbytes)
-                vfield.mem_size += vfield.gpu_data[d].size
-            if __VERBOSE__:
-                print vfield.name, vfield.mem_size, "Bytes (",
-                print vfield.mem_size / (1024 ** 2), "MB)"
 
     def setInitializationKernel(self, kernel):
         """
diff --git a/HySoP/hysop/gpu/gpu_particle_advection.py b/HySoP/hysop/gpu/gpu_particle_advection.py
index 2d218b7e5..be278dc25 100644
--- a/HySoP/hysop/gpu/gpu_particle_advection.py
+++ b/HySoP/hysop/gpu/gpu_particle_advection.py
@@ -10,6 +10,7 @@ from parmepy.operator.discrete.particle_advection import ParticleAdvection
 from parmepy.gpu import PARMES_REAL_GPU, cl
 from parmepy.gpu.tools import get_opencl_environment
 from parmepy.gpu.gpu_kernel import KernelLauncher
+from parmepy.tools.timers import Timer
 
 
 class GPUParticleAdvection(ParticleAdvection):
@@ -50,23 +51,27 @@ class GPUParticleAdvection(ParticleAdvection):
         ParticleAdvection.__init__(self, velocity, advectedFields, d,
                                    part_position, part_advectedFields, method)
         self.available_methods = [
-            'l2star_c4', 'l2star_c3', 'l2star_c2', 'm4prime',
+            'l2star_c4', 'l2star_c3', 'l2star_c2', 'l2star',
             'l4star_c4', 'l4star_c3', 'l4star',
             'l6star_c6', 'l6star_c5', 'l6star_c4', 'l6star', 'm8prime',
             'l8star']
-        self.method = 'l4star'
+        self.method = None
         for m in self.available_methods:
             if method.find(m) >= 0:
                 self.method = m
                 break
+        if self.method is None:
+            print "Unknown remeshing method : use default l4star"
+            self.method = 'l4star'
         self.name += ' GPU ' + self.method
         self.user_gpu_src = src
         self.num_method = None
+        self.dim = self.advectedFields[0].dimension
 
         # Compute resolutions for kernels for each direction.
         resol = self.advectedFields[0].topology.mesh.resolution
         self.resolution = resol.copy()
-        if len(resol) == 2:
+        if self.dim == 2:
             if self.dir == 1:
                 self.resolution[0] = resol[1]
                 self.resolution[1] = resol[0]
@@ -85,6 +90,8 @@ class GPUParticleAdvection(ParticleAdvection):
                                              resol)
         self.warning = True
         self.prg = None
+        ## Object to store computational times of OpenCL kernels
+        self.kernels_timer = Timer(self)
 
     @debug
     def setUp(self):
@@ -103,13 +110,12 @@ class GPUParticleAdvection(ParticleAdvection):
         if __VERBOSE__:
             print "Work-items basic config : ",
             print self.workItemNumber, self.gwi, self.lwi
-        dim = self.advectedFields[0].dimension
         self.coord_min = np.ones(4, dtype=self.gpu_precision)
         self.mesh_size = np.ones(4, dtype=self.gpu_precision)
-        self.coord_min[0:dim] = np.asarray(
+        self.coord_min[0:self.dim] = np.asarray(
             self.advectedFields[0].topology.mesh.origin,
             dtype=self.gpu_precision)
-        self.mesh_size[0:dim] = np.asarray(
+        self.mesh_size[0:self.dim] = np.asarray(
             self.advectedFields[0].topology.mesh.space_step,
             dtype=self.gpu_precision)
         self.build_options = ""
@@ -199,40 +205,15 @@ class GPUParticleAdvection(ParticleAdvection):
         """
         build_options = self.build_options
         # copy settings
-        if len(self.resolution) == 3:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                vec = 4
-                src_copy = 'kernels/copy.cl'
-                t_dim = 16
-                b_rows = 8
-            else:
-                vec = 1
-                src_copy = 'kernels/copy_locMem.cl'
-                t_dim = 32
-                b_rows = 8
-        else:
-            vec = 2
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_copy = 'kernels/copy.cl'
-                t_dim = 16
-                b_rows = 8
-            else:
-                src_copy = 'kernels/copy.cl'
-                t_dim = 32
-                b_rows = 2
+        src, t_dim, b_rows, vec, f_space = \
+            self.cl_env.kernels_config[self.dim][self.gpu_precision]['copy']
+        gwi, lwi = f_space(self.resolution, t_dim, b_rows, vec)
+
         # Build code
         build_options += " -D TILE_DIM_COPY={0}".format(t_dim)
         build_options += " -D BLOCK_ROWS_COPY={0}".format(b_rows)
-        # Direction specific settings
-        # prg = len(self.resolution) * [None]
-        # gwi = len(self.resolution) * [None]
-        # lwi = len(self.resolution) * [None]
-        gwi = (int(self.resolution[0] / vec),
-               int(b_rows * self.resolution[1] / t_dim),
-               int(1 if len(self.resolution) == 2 else self.resolution[2]))
-        lwi = (t_dim / vec, b_rows, 1)
         prg = self.cl_env.build_src(
-            src_copy,
+            src,
             build_options + self.cl_env.default_sizes_constants[self.dir],
             vec)
         self.init_copy = KernelLauncher(prg.copy,
@@ -313,63 +294,36 @@ class GPUParticleAdvection(ParticleAdvection):
         """
         # XY transposition settings
         build_options = self.build_options
-        src_transpose_xy = 'kernels/transpose_xy.cl'
-        build_options += " -D PADDING_XY=1"
-        if len(self.resolution) == 3:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                vec = 2
-                t_dim = 16
-                b_rows = 8
-            else:
-                vec = 4
-                t_dim = 32
-                b_rows = 4
+        src, t_dim, b_rows, is_padding, vec, f_space = \
+            self.cl_env.kernels_config[self.dim][self.gpu_precision]['transpose_xy']
+        gwi, lwi = f_space(self.resolution, t_dim, b_rows, vec)
+
+        if is_padding:
+            build_options += " -D PADDING_XY=1"
         else:
-            vec = 4
-            if self.gpu_precision is PARMES_REAL_GPU:
-                t_dim = 32
-                b_rows = 8
-            else:
-                t_dim = 32
-                b_rows = 2
+            build_options += " -D PADDING_XY=0"
         build_options += " -D TILE_DIM_XY={0}".format(t_dim)
         build_options += " -D BLOCK_ROWS_XY={0}".format(b_rows)
-        # Settings are taken from destination layout as current layout.
-        # gwi is computed form input layout (appears as transposed layout)
-        gwi = (int(self.resolution[1] / vec),
-               int(b_rows * self.resolution[0] / t_dim),
-               int(1 if len(self.resolution) == 2 else self.resolution[2]))
-        lwi = (t_dim / vec, b_rows, 1)
         prg = self.cl_env.build_src(
-            src_transpose_xy,
+            src,
             build_options + self.cl_env.default_sizes_constants[self.dir], vec)
 
         self.init_transpose_xy = KernelLauncher(
             prg.transpose_xy, self.cl_env.queue, gwi, lwi)
 
-        if len(self.resolution) == 3:
+        if self.dim == 3:
             # XY transposition settings
             build_options = self.build_options
-            src_transpose_xz = 'kernels/transpose_xz.cl'
-            vec = 1
-            if self.gpu_precision is PARMES_REAL_GPU:
+            src, t_dim, b_rows, b_deph, is_padding, vec, f_space = \
+                self.cl_env.kernels_config[self.dim][self.gpu_precision]['transpose_xz']
+            gwi, lwi = f_space(self.resolution, t_dim, b_rows, b_deph, vec)
+
+            if is_padding:
                 build_options += " -D PADDING_XZ=1"
-                t_dim = 16
-                b_rows = 4
             else:
                 build_options += " -D PADDING_XZ=0"
-                t_dim = 8
-                b_rows = 2
-            build_options += " -D TILE_DIM_XZ={0}".format(t_dim)
-            build_options += " -D BLOCK_ROWS_XZ={0}".format(b_rows)
-            # Settings are taken from destination layout as current layout.
-            # gwi is computed form input layout (appears as transposed layout)
-            gwi = (int(self.resolution[2] / vec),
-                   int(b_rows * self.resolution[1] / t_dim),
-                   int(b_rows * self.resolution[0] / t_dim))
-            lwi = (t_dim / vec, b_rows, b_rows)
             prg = self.cl_env.build_src(
-                src_transpose_xz,
+                src,
                 build_options + self.cl_env.default_sizes_constants[self.dir],
                 vec)
             self.init_transpose_xz = KernelLauncher(
@@ -429,13 +383,14 @@ class GPUParticleAdvection(ParticleAdvection):
             gpudf.release()
         if self.init_copy.f_timer is not None:
             for f_timer in self.init_copy.f_timer:
-                self.timer.addFunctionTimer(f_timer)
+                self.kernels_timer.addFunctionTimer(f_timer)
             for f_timer in self.init_transpose_xy.f_timer:
-                self.timer.addFunctionTimer(f_timer)
+                self.kernels_timer.addFunctionTimer(f_timer)
             if len(self.resolution) == 3:
                 for f_timer in self.init_transpose_xz.f_timer:
-                    self.timer.addFunctionTimer(f_timer)
+                    self.kernels_timer.addFunctionTimer(f_timer)
         ParticleAdvection.finalize(self)
+        self.timer.addSubTimer(self.kernels_timer, ' Details:')
 
     def __str__(self):
         s = "GPUParticleAdvection (DiscreteOperator). "
diff --git a/HySoP/hysop/gpu/gpu_particle_advection_1k.py b/HySoP/hysop/gpu/gpu_particle_advection_1k.py
index 68b045655..c7b4708db 100644
--- a/HySoP/hysop/gpu/gpu_particle_advection_1k.py
+++ b/HySoP/hysop/gpu/gpu_particle_advection_1k.py
@@ -138,43 +138,20 @@ class GPUParticleAdvection1k(GPUParticleAdvection):
         </table>
         """
         build_options = self.build_options
-        file_list = ["common.cl"]
-        with_noBC = True
-        vec = 4
-        if len(self.resolution) == 3:
-            file_list.append("remeshing/weights_builtin.cl")
-            if self.advectedFields[0].isVector:
-                file_list.append("remeshing/private_vector_3d.cl")
-            else:
-                file_list.append("remeshing/private.cl")
-            if self.gpu_precision is PARMES_REAL_GPU:
-                with_noBC = False
-        else:
-            file_list.append("remeshing/weights_noVec_builtin.cl")
-            if self.advectedFields[0].isVector:
-                file_list.append("remeshing/basic_vector_2d.cl")
-            else:
-                file_list.append("remeshing/basic.cl")
-            if self.gpu_precision is PARMES_REAL_GPU:
-                vec = 8
-        file_list.append("advection/builtin.cl")
-        if self.advectedFields[0].nbComponents == 3:
-            file_list.append("kernels/advection_and_remeshing_vector_3d.cl")
-        elif self.advectedFields[0].nbComponents == 2:
-            file_list.append("kernels/advection_and_remeshing_vector_2d.cl")
-        else:
-            file_list.append("kernels/advection_and_remeshing.cl")
+        src, is_noBC, vec, f_space = \
+            self.cl_env.kernels_config[self.dim][self.gpu_precision]['advec_and_remesh']
+        gwi, lwi = f_space(self.resolution, vec)
+        WINb = lwi[0]
         build_options += " -D FORMULA=" + self.method.upper()
-        if with_noBC:
+        if is_noBC:
             build_options += " -D WITH_NOBC=1"
-        WINb, gwi, lwi = self.cl_env.get_WorkItems(self.resolution, vec)
         build_options_d = " -D WI_NB=" + str(WINb)
         build_options_d += " -D PART_NB_PER_WI="
         build_options_d += str(self.resolution[0] / WINb)
         build_options_d += self.cl_env.default_sizes_constants[self.dir]
         ## Build code
         prg = self.cl_env.build_src(
-            file_list, build_options + build_options_d, vec)
+            src, build_options + build_options_d, vec)
         self.num_advec_and_remesh = KernelLauncher(
             prg.advection_and_remeshing, self.cl_env.queue, gwi, lwi)
 
@@ -240,7 +217,7 @@ class GPUParticleAdvection1k(GPUParticleAdvection):
     def finalize(self):
         if self.num_advec_and_remesh.f_timer is not None:
             for f_timer in self.num_advec_and_remesh.f_timer:
-                self.timer.addFunctionTimer(f_timer)
+                self.kernels_timer.addFunctionTimer(f_timer)
         GPUParticleAdvection.finalize(self)
 
     def __str__(self):
diff --git a/HySoP/hysop/gpu/gpu_particle_advection_2k.py b/HySoP/hysop/gpu/gpu_particle_advection_2k.py
index f3ec47922..02037b03b 100644
--- a/HySoP/hysop/gpu/gpu_particle_advection_2k.py
+++ b/HySoP/hysop/gpu/gpu_particle_advection_2k.py
@@ -62,13 +62,12 @@ class GPUParticleAdvection2k(GPUParticleAdvection):
         Allocates a buffer for storing particle positions.
         """
         ## Velocity.
-        GPUDiscreteField.fromField(self.cl_env.queue, self.velocity,
-                                   self.gpu_precision)
+        GPUDiscreteField.fromField( self.cl_env.queue,self.velocity,
+                         self.gpu_precision)
         ## Transported field.
-        GPUDiscreteField.fromField(self.cl_env.queue,
-                                   self.advectedFields[0],
-                                   self.gpu_precision,
-                                   layout=False)
+        GPUDiscreteField.fromField( self.cl_env.queue,self.advectedFields[0],
+                         self.gpu_precision,
+                         layout=False)
         ## Particle position
         if self.part_position is None:
             self.part_position = \
@@ -76,9 +75,8 @@ class GPUParticleAdvection2k(GPUParticleAdvection):
                       name="Particle_Position",
                       isVector=False
                       ).discretize(self.advectedFields[0].topology)
-        GPUDiscreteField.fromField(
-            self.cl_env.queue, self.part_position,
-            self.gpu_precision)
+        GPUDiscreteField.fromField( self.cl_env.queue,self.part_position,
+                         self.gpu_precision)
         ## Result scalar
         if self.part_advectedFields is None:
             self.part_advectedFields = [
@@ -86,10 +84,8 @@ class GPUParticleAdvection2k(GPUParticleAdvection):
                       name="Particle_AdvectedFields",
                       isVector=self.advectedFields[0].isVector
                       ).discretize(self.advectedFields[0].topology)]
-        GPUDiscreteField.fromField(
-            self.cl_env.queue, self.part_advectedFields[0],
-            self.gpu_precision,
-            layout=False)
+        GPUDiscreteField.fromField( self.cl_env.queue,self.part_advectedFields[0],
+                         self.gpu_precision, layout=False)
 
         self.variables = [self.advectedFields[0], self.velocity,
                           self.part_position, self.part_advectedFields[0]]
@@ -183,31 +179,17 @@ class GPUParticleAdvection2k(GPUParticleAdvection):
         </table>
         """
         build_options = self.build_options
-        file_list = ["common.cl"]
-        with_noBC = False
-        if len(self.resolution) == 3:
-            file_list.append("advection/builtin.cl")
-            file_list.append("kernels/advection.cl")
-            if self.gpu_precision is PARMES_REAL_GPU:
-                vec = 4
-            else:
-                vec = 2
-        else:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                file_list.append("advection/builtin.cl")
-                file_list.append("kernels/advection.cl")
-                vec = 4
-            else:
-                file_list.append("advection/builtin_noVec.cl")
-                file_list.append("kernels/advection_noVec.cl")
-                vec = 1
-        if with_noBC:
+        src, is_noBC, vec, f_space = \
+            self.cl_env.kernels_config[self.dim][self.gpu_precision]['advec']
+        gwi, lwi = f_space(self.resolution, vec)
+        WINb = lwi[0]
+
+        if is_noBC:
             build_options += " -D WITH_NOBC=1"
-        WINb, gwi, lwi = self.cl_env.get_WorkItems(self.resolution, vec)
         build_options_d = " -D WI_NB=" + str(WINb)
         ## Build code
         prg = self.cl_env.build_src(
-            file_list,
+            src,
             build_options_d + self.cl_env.default_sizes_constants[self.dir],
             vec)
         self.num_advec = KernelLauncher(
@@ -215,40 +197,21 @@ class GPUParticleAdvection2k(GPUParticleAdvection):
 
         ## remeshing
         build_options = self.build_options
-        file_list = ["common.cl"]
-        vec = 4
-        if len(self.resolution) == 3:
-            file_list.append("remeshing/weights_builtin.cl")
-            if self.advectedFields[0].isVector:
-                file_list.append("remeshing/private_vector_3d.cl")
-            else:
-                file_list.append("remeshing/private.cl")
-            with_noBC = False
-        else:
-            file_list.append("remeshing/weights_noVec_builtin.cl")
-            if self.advectedFields[0].isVector:
-                file_list.append("remeshing/basic_vector_2d.cl")
-            else:
-                file_list.append("remeshing/basic.cl")
-            with_noBC = True
-        if self.advectedFields[0].nbComponents == 3:
-            file_list.append("kernels/remeshing_vector_3d.cl")
-        elif self.advectedFields[0].nbComponents == 2:
-            file_list.append("kernels/remeshing_vector_2d.cl")
-        else:
-            file_list.append("kernels/remeshing.cl")
+        src, is_noBC, vec, f_space = \
+            self.cl_env.kernels_config[self.dim][self.gpu_precision]['remesh']
+        gwi, lwi = f_space(self.resolution, vec)
+        WINb = lwi[0]
+
         build_options += " -D FORMULA=" + self.method.upper()
-        if with_noBC:
+        if is_noBC:
             build_options += " -D WITH_NOBC=1"
-        WINb, gwi, lwi = self.cl_env.get_WorkItems(
-            self.resolution, vec)
         build_options_d = " -D WI_NB=" + str(WINb)
         build_options_d += " -D PART_NB_PER_WI="
         build_options_d += str(self.resolution[0] / WINb)
         build_options_d += self.cl_env.default_sizes_constants[self.dir]
         ## Build code
         prg = self.cl_env.build_src(
-            file_list, build_options + build_options_d, vec)
+            src, build_options + build_options_d, vec)
         self.num_remesh = KernelLauncher(
             prg.remeshing_kernel, self.cl_env.queue, gwi, lwi)
 
@@ -315,9 +278,9 @@ class GPUParticleAdvection2k(GPUParticleAdvection):
     def finalize(self):
         if self.num_advec.f_timer is not None:
             for f_timer in self.num_advec.f_timer:
-                self.timer.addFunctionTimer(f_timer)
+                self.kernels_timer.addFunctionTimer(f_timer)
             for f_timer in self.num_remesh.f_timer:
-                self.timer.addFunctionTimer(f_timer)
+                self.kernels_timer.addFunctionTimer(f_timer)
         GPUParticleAdvection.finalize(self)
 
 
diff --git a/HySoP/hysop/gpu/kernel_benchmark.py b/HySoP/hysop/gpu/kernel_benchmark.py
index 93333bf55..4f2212695 100644
--- a/HySoP/hysop/gpu/kernel_benchmark.py
+++ b/HySoP/hysop/gpu/kernel_benchmark.py
@@ -3,7 +3,8 @@
 
 Package for benchmarking OpenCL kernels.
 """
-import pyopencl as cl
+from parmepy.gpu import PARMES_DOUBLE_GPU, cl
+from parmepy.gpu.tools import get_opencl_environment
 import numpy as np
 import pickle
 
@@ -11,21 +12,26 @@ import pickle
 class BenchmarkSuite:
     """Benchark suite management"""
 
-    def __init__(self, sizes, kernel_name,
-                 versions, configs, test=False, true_res=None, arg_to_test=0,
+    def __init__(self, sizes, name,
+                 kernels, configs, versions, setupFunction,
+                 test=False, true_res=None, arg_to_test=0,
                  inputs={}, file_name="Benchmarks_data",
-                 precision='SP', nb_run=20):
+                 precision=PARMES_DOUBLE_GPU, nb_run=20):
         """
         Creates a benchmak suite, that consists in a list of Benchmark.
 
         @param sizes : list of different problem sizes to benchmark.
-        @param kernel_name : name of the kernel to benchmark.
-        @param versions : list of tuples containing kernel versions
-        (kernel sources, kernel OpenCL name ).
+        @param name : name of the kernel to benchmark.
+        @param kernels : list of tuples containing kernel versions
+        (kernel sources file, OpenCL kernel name ).
         @param configs : dictionary of configurations.
         keys are kernel OpenCL name,
-        values are tuples containing (compilation flags function, kernel
-        arguments settings, config name, condition related to problem size)
+        values are tuples containing (kernel parameters vectorization
+        and identifier in last position).
+        @param versions : kernel versions to bench (used as dictionaries keys)
+        @param setupFunction : Function that returns building options and
+        kernel arguments (assuming arrays are numpy arrays) depending on a
+        given config, size and input dictionary.
         @param test : by default no results tests are performed
         @param true_res : function to compute true results
         @param arg_to_test : index of kernel arguments that contains result
@@ -39,20 +45,22 @@ class BenchmarkSuite:
         If no such file, a new database is created.
         """
         self.pickle_file_name = file_name
-        if precision == 'SP':
-            self.pickle_file_name += '_SP'
-        else:
+        if precision == PARMES_DOUBLE_GPU:
             self.pickle_file_name += '_DP'
+        else:
+            self.pickle_file_name += '_SP'
         self.sizes = sizes
         self.versions = versions
+        self.kernels_files = kernels
         self.configs = configs
         self.inputs = inputs
         self.test = test
         self.compute_true_res = true_res
         self.arg_to_test = arg_to_test
-        self.kernel_name = kernel_name
+        self.kernel_name = name
         self.precision = precision
         self.nb_run = nb_run
+        self.setupFunction = setupFunction
 
         if not self.test:
             try:
@@ -65,6 +73,8 @@ class BenchmarkSuite:
                 print 'start new database'
                 self.timings = {}
         else:
+            assert not true_res is None
+            assert arg_to_test >= 2
             self.timings = {}
         self.complete_timings()
 
@@ -77,16 +87,11 @@ class BenchmarkSuite:
         if self.kernel_name not in self.timings.keys():
             self.timings[self.kernel_name] = {}
         for v in self.versions:
-            if not v[1] in self.timings[self.kernel_name].keys():
-                self.timings[self.kernel_name][v[1]] = {}
-            if isinstance(self.configs, dict):
-                for c in self.configs[v[1]]:
-                    if not c[2] in self.timings[self.kernel_name][v[1]].keys():
-                        self.timings[self.kernel_name][v[1]][c[2]] = {}
-            else:
-                for c in self.configs:
-                    if not c[2] in self.timings[self.kernel_name][v[1]].keys():
-                        self.timings[self.kernel_name][v[1]][c[2]] = {}
+            if not v in self.timings[self.kernel_name].keys():
+                self.timings[self.kernel_name][v] = {}
+            for c in self.configs[v]:
+                if not c[-1] in self.timings[self.kernel_name][v].keys():
+                    self.timings[self.kernel_name][v][c[-1]] = {}
 
     def launch(self):
         """
@@ -101,34 +106,19 @@ class BenchmarkSuite:
             self.true_res = {}
             self.compute_true_res(self.sizes, self.true_res, self.inputs)
         for v in self.versions:
-            if isinstance(self.configs, dict):
-                conf_list = self.configs[v[1]]
-            else:
-                conf_list = self.configs
+            conf_list = self.configs[v]
             for conf in conf_list:
-                try:
-                    allowed_size = conf[3]
-                except IndexError:
-                    allowed_size = None
-                if callable(conf[0]):
-                    b = Benchmark(
-                        v[0], v[1], self.sizes,
-                        lambda s: "-D WIDTH=" + str(s[0]) + conf[0](s),
-                        inputs=self.inputs, allowed_size=allowed_size,
-                        precision=self.precision, nb_run=self.nb_run)
-                else:
-                    b = Benchmark(
-                        v[0], v[1], self.sizes,
-                        lambda s: "-D WIDTH=" + str(s[0]) + conf[0],
-                        inputs=self.inputs, allowed_size=allowed_size,
-                        precision=self.precision, nb_run=self.nb_run)
-                b.kernelSetup = conf[1]
-                print self.kernel_name, v[1], conf[2]
+                b = Benchmark(
+                    self.kernels_files[v][0], self.kernels_files[v][1],
+                    self.sizes, conf, self.setupFunction,
+                    inputs=self.inputs,
+                    precision=self.precision, nb_run=self.nb_run)
+                print self.kernel_name, v, conf[-1]
                 if self.test:
                     b.test(self.true_res, self.arg_to_test)
                 else:
                     b.launch()
-                    [self.timings[self.kernel_name][v[1]][conf[2]].__setitem__(
+                    [self.timings[self.kernel_name][v][conf[-1]].__setitem__(
                         t[0], t[1]) for t in b.timings.items()]
                     pickle.dump(
                         self.timings, open(
@@ -208,9 +198,16 @@ class BenchmarkSuite:
                         f.write(v + " ")
                         f.write(c + " ")
                         #print c
-                        cs = c.replace(
-                            'Nx', str(s[0])).replace('x', '*').split('_')
-                        cse = cs[0] + '/' + cs[1][1] if len(cs) == 2 else cs[0]
+                        # Compute work-item number from configuration string:
+                        # If config, start with 'wi=N', work-item number is set to N
+                        # Else, it assume a configuration matching 'A[xB]+[_fn]?'
+                        # It replace 'x' by '*' and divide by n. String is evaluated as python instruction
+                        if c[0:3] == 'wi=':
+                            cse = c.split('=')[1].split('_')[0]
+                        else:
+                            cs = c.replace(
+                                'Nx', str(s[0])).replace('x', '*').split('_')
+                            cse = cs[0] + '/' + cs[1][1] if len(cs) == 2 else cs[0]
                         #print cse
                         f.write(str(eval(cse)) + ' ')
                         try:
@@ -243,92 +240,30 @@ def find_min(filename, kernel=None, version=None, config=None, size=None):
 class Benchmark:
     """Benchmark management"""
 
-    def __init__(self, code, kernel_name, sizes, build_opt, nb_run=20,
-                 inputs=None, allowed_size=None,
-                 precision='SP'):
+    def __init__(self, kernel_file, kernel_name, sizes, config, setupFunction,
+                 nb_run=20, inputs=None, precision=PARMES_DOUBLE_GPU):
         """
-        Creates a benchmark for a given source code, kernel for differnet
-        problem sizes.
+        Creates a benchmark for a given source kernel_file, kernel for
+        different problem sizes.
 
-        @param code : kernels source code as a string
+        @param kernel_file : kernels source file
         @param kernel_name : name of the kernel to benchmark as a string
         @param sizes : list of different problem sizes to launch kernel
-        @param build_opt : OpenCL compiler options
+        @param config : list of kernel parameters
+        @param setupFunction : Function that returns building options and
+        kernel arguments (assuming arrays are numpy arrays) depending on a
+        given config, size and input dictionary.
         @param nb_run : number of launches to average time (default = 20)
         @param inputs : input data
-        @param allowed_size : boolean function that allows benchmarks
-        regarding problem size (depends on configuration)
         @param precision : Floating point precision for kernels
         """
-        #Get platform.
-        try:
-            ## OpenCL platform
-            self.platform = cl.get_platforms()[0]
-        except IndexError:
-            print "  Incorrect platform_id :", platform_id, ".",
-            print " Only ", len(cl.get_platforms()), " available.",
-            print " Getting defalut platform. "
-            self.platform = cl.get_platforms()[0]
-        #Get device.
-        try:
-            ## OpenCL device
-            self.device = self.platform.get_devices(cl.device_type.GPU)[0]
-        except cl.RuntimeError as e:
-            print "RuntimeError:", e
-            self.device = cl.create_some_context().devices[0]
-        except AttributeError as e:
-            print "AttributeError:", e
-            self.device = cl.create_some_context().devices[0]
-        print "\n  Benchmark on :   ", self.platform.name,
-        print " with ", self.platform.version,
-        print " over : ", self.device.name,
-        print " with ", self.device.opencl_c_version
-        print " Precision capability  ",
-        self.correctlyroundeddividesqrt = True
-        if precision == 'SP':
-            print "for Single Precision: ",
-            for v in ['DENORM', 'INF_NAN',
-                      'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
-                      'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
-                try:
-                    if eval('(self.device.single_fp_config' +
-                            ' & cl.device_fp_config.' +
-                            v + ') == cl.device_fp_config.' + v):
-                        print v,
-                except AttributeError as ae:
-                    if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
-                        self.correctlyroundeddividesqrt = False
-                    print '\n', v, 'is not supported in OpenCL C 1.2. ',
-                    print 'Exception catched : ', ae
-        else:
-            print "for Double Precision: ",
-            if self.device.double_fp_config > 0:
-                for v in ['DENORM', 'INF_NAN',
-                          'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
-                          'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT',
-                          'SOFT_FLOAT']:
-                    try:
-                        if eval('(self.device.double_fp_config' +
-                                ' & cl.device_fp_config.' +
-                                v + ') == cl.device_fp_config.' + v):
-                            print v,
-                    except AttributeError as ae:
-                        if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
-                            self.correctlyroundeddividesqrt = False
-                        print '\n', v, 'is supported in OpenCL C 1.2. ',
-                        print 'Exception catched : ', ae
-            else:
-                raise ValueError("Double Precision is not supported by device")
-        print ""
-        #Creates GPU Context
-        ## OpenCL context
-        self.ctx = cl.Context([self.device])
-        #Create CommandQueue on the GPU Context
-        ## OpenCL command queue
-        self.queue = cl.CommandQueue(
-            self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
-        ## OpenCL Source code
-        self.code = code
+        self.cl_env = get_opencl_environment(0, 0, 'gpu', precision, sizes[0])
+        self.platform = self.cl_env.platform
+        self.device = self.cl_env.device
+        self.ctx = self.cl_env.ctx
+        self.queue = self.cl_env.queue
+        ## OpenCL Source kernel_file
+        self.kernel_file = kernel_file
         ## Kernel name
         self.kernel = kernel_name
         ## Compiled programs
@@ -344,28 +279,39 @@ class Benchmark:
         ## Problems inputs
         self.inputs = inputs
         ## Function to test size
-        self.is_size_allowed = allowed_size
         self.precision = precision
-        if self.code is not None:
+        self.setupFunction = setupFunction
+
+        self.setup = {}
+        if self.kernel_file is not None:
             for size in self.sizes:
-                if self.is_size_allowed is None or self.is_size_allowed(size):
-                    build_options = ""  # " -cl-opt-disable"
-                    if self.correctlyroundeddividesqrt:
-                        build_options += \
-                            " -cl-fp32-correctly-rounded-divide-sqrt "
-                    if self.precision == 'SP':
-                        prg = cl.Program(
-                            self.ctx, self.code.replace('.0', '.0f'))
-                        build_options += " -cl-single-precision-constant "
-                    else:
-                        prg = cl.Program(
-                            self.ctx, self.code.replace('float', 'double'))
-                    prg.build(build_options + build_opt(size))
-                    print prg.get_build_info(
-                        self.device, cl.program_build_info.OPTIONS)
-                    self.prg[size] = prg
-        ## Function to setup kernels arguments
-        self.kernelSetup = None
+                self.setup[size] = self.setupFunction(config, size, self.inputs)
+                if self.setup[size] is not None:
+                    toDelete = False
+                    #print np.prod(self.setup[size][1][1]), "WI (",self.device.max_work_group_size ," max )"
+                    if np.prod(self.setup[size][1][1]) > self.device.max_work_group_size:
+                        toDelete = True
+                    global_mem_used=0
+                    for arg in self.setup[size][1]:
+                        if isinstance(arg, np.ndarray) and \
+                                len(arg.shape) > 1:
+                                #print "Alloc : ", arg.nbytes, "Bytes (", self.device.max_mem_alloc_size, "max)"
+                            if arg.nbytes > self.device.max_mem_alloc_size:
+                                toDelete = True
+                            global_mem_used += arg.nbytes
+                            #print "Total Alloc : ", global_mem_used, "Bytes (", self.device.global_mem_size, "max)"
+                    if global_mem_used > self.device.global_mem_size:
+                        toDelete = True
+                        #print "Local Alloc : ", self.setup[size][2], "Bytes (", self.device.local_mem_size, "max)"
+                    if self.setup[size][2] > self.device.local_mem_size:
+                        toDelete = True
+                    if toDelete:
+                        self.setup[size] = None
+        if self.kernel_file is not None:
+            for size in self.sizes:
+                if self.setup[size] is not None:
+                    self.prg[size] = self.cl_env.build_src(
+                        kernel_file, self.setup[size][0], config[-2])
 
     def test(self, true_res, ind_res):
         """
@@ -376,9 +322,9 @@ class Benchmark:
         """
         print "Testing : "
         for size in self.sizes:
-            if self.is_size_allowed is None or self.is_size_allowed(size):
+            if self.setup[size] is not None:
                 kernel = eval('self.prg[size].' + self.kernel)
-                kernelArgs = self.kernelSetup(size, self.inputs)
+                kernelArgs = self.setup[size][1]
                 res = np.empty_like(kernelArgs[ind_res])
                 mem_used = 0
                 for i in xrange(len(kernelArgs)):
@@ -408,10 +354,10 @@ class Benchmark:
                         res = res[:size[0], :size[1], :size[2]]
                     else:
                         res = res[:size[0], :size[1]]
-                    if self.precision == 'SP':
-                        exp = 6
-                    else:
+                    if self.precision == PARMES_DOUBLE_GPU:
                         exp = 15
+                    else:
+                        exp = 6
                     np.testing.assert_array_almost_equal(
                         res, true_res[size], decimal=exp)
                     print 'Ok'
@@ -442,41 +388,41 @@ class Benchmark:
         """
         print "\nRunning : "
         for size in self.sizes:
-            if self.is_size_allowed is None or self.is_size_allowed(size):
-                print size,
+            print size,
+            if self.setup[size] is not None:
                 kernel = eval('self.prg[size].' + self.kernel)
-                if not self.kernelSetup is None:
-                    kernelArgs = self.kernelSetup(size, self.inputs)
-                    mem_used = 0
-                    for i in xrange(len(kernelArgs)):
-                        if isinstance(kernelArgs[i], np.ndarray) and \
-                                len(kernelArgs[i].shape) > 1:
-                            buff = cl.Buffer(
-                                self.ctx, cl.mem_flags.READ_WRITE,
-                                size=kernelArgs[i].nbytes)
-                            cl.enqueue_copy(self.queue, buff, kernelArgs[i])
-                            mem_used += kernelArgs[i].nbytes
-                            kernelArgs[i] = buff
-                    print "Memory used : {0:.5f} GiB ({1:.2f}%)".format(
-                        mem_used / (1024. ** 3),
-                        100. * mem_used / (self.device.global_mem_size * 1.),
-                        mem_used / (1024. ** 3))
-                    self.queue.finish()
-                    print kernelArgs[0:2]
+                kernelArgs = self.setup[size][1]
+                mem_used = 0
+                for i in xrange(len(kernelArgs)):
+                    if isinstance(kernelArgs[i], np.ndarray) and \
+                            len(kernelArgs[i].shape) > 1:
+                        buff = cl.Buffer(
+                            self.ctx, cl.mem_flags.READ_WRITE,
+                            size=kernelArgs[i].nbytes)
+                        cl.enqueue_copy(self.queue, buff, kernelArgs[i])
+                        mem_used += kernelArgs[i].nbytes
+                        kernelArgs[i] = buff
+                print "Memory used : {0:.5f} GiB ({1:.2f}%)".format(
+                    mem_used / (1024. ** 3),
+                    100. * mem_used / (self.device.global_mem_size * 1.),
+                    mem_used / (1024. ** 3)),
+                self.queue.finish()
+                print kernelArgs[0:2],
+                evt = kernel(self.queue, *tuple(kernelArgs))
+                self.queue.finish()
+                evts = []
+                for i in xrange(self.nb_run):
                     evt = kernel(self.queue, *tuple(kernelArgs))
+                    evts.append(evt)
                     self.queue.finish()
-                    evts = []
-                    for i in xrange(self.nb_run):
-                        evt = kernel(self.queue, *tuple(kernelArgs))
-                        evts.append(evt)
-                        self.queue.finish()
-                    time = 0.
-                    for evt in evts:
-                        time += (evt.profile.end - evt.profile.start) * 1e-9
-                    self.timings[size] = time / self.nb_run
-                    self.kernel_args[size] = kernelArgs[0:2]
-                    print self.timings[size], "args : ", kernelArgs[0:2]
-                    for i in xrange(len(kernelArgs)):
-                        if isinstance(kernelArgs[i], cl.MemoryObject):
-                            kernelArgs[i].release()
-
+                time = 0.
+                for evt in evts:
+                    time += (evt.profile.end - evt.profile.start) * 1e-9
+                self.timings[size] = time / self.nb_run
+                self.kernel_args[size] = kernelArgs[0:2]
+                print self.timings[size]
+                for i in xrange(len(kernelArgs)):
+                    if isinstance(kernelArgs[i], cl.MemoryObject):
+                        kernelArgs[i].release()
+            else:
+                print "Incompatible sizes"
diff --git a/HySoP/hysop/gpu/tools.py b/HySoP/hysop/gpu/tools.py
index 674010ea1..affed92bf 100644
--- a/HySoP/hysop/gpu/tools.py
+++ b/HySoP/hysop/gpu/tools.py
@@ -5,7 +5,8 @@ Tools for gpu management.
 """
 from parmepy import __VERBOSE__
 from parmepy.constants import S_DIR
-from parmepy.gpu import cl, PARMES_REAL_GPU, GPU_SRC, CL_PROFILE
+from parmepy.gpu import cl, PARMES_REAL_GPU, PARMES_DOUBLE_GPU, \
+    GPU_SRC, CL_PROFILE
 
 ## Global variable handling an OpenCL Environment instance
 __cl_env = None
@@ -40,7 +41,7 @@ class OpenCLEnvironment(object):
         ## Floating point precision
         self.precision = precision
         self.macros = {}
-        self.default_build_opts = ""
+        self.default_build_opts = "-Werror"
         if not self.precision is None:
             self.default_build_opts += self._get_precision_opts()
         if resolution is not None:
@@ -61,6 +62,19 @@ class OpenCLEnvironment(object):
         else:
             self.default_sizes_constants = None
 
+        print self.device.name
+        ## Kernels configuration dictionary
+        if self.device.name == "Cayman":
+            from config_cayman import kernels_config as kernel_cfg
+        elif self.device.name == "Tesla K20m":
+            from config_k20m import kernels_config as kernel_cfg
+        else:
+            # from config_default import kernels_config as kernel_cfg
+            raise ValueError(
+                "Unknown device. Default configuration is not yet implemented")
+        self.kernels_config = kernel_cfg
+
+
     def modify(self, platform_id, device_id, device_type,
                precision, resolution, gl_sharing=False):
         """
@@ -129,7 +143,7 @@ class OpenCLEnvironment(object):
         >>> clenv = OpenCLEnvironment(0,0,'gpu', None, None)
         >>> clenv._get_platform(0) == cl.get_platforms()[0]
         True
-        >>> clenv._get_platform(-1) == cl.get_platforms()[0]
+        >>> clenv._get_platform(-1) == cl.get_platforms()[-1]
         True
         >>>
         \endcode
@@ -282,48 +296,35 @@ class OpenCLEnvironment(object):
         # Precision supported
         if __VERBOSE__:
             print " Precision capability  ",
-        capable = True
+        fp32_rounding_flag = True
         if self.precision is PARMES_REAL_GPU:
             opts += " -cl-single-precision-constant"
-            if __VERBOSE__:
-                print "for Single Precision: "
-            for v in ['DENORM', 'INF_NAN',
-                      'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
-                      'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
-                try:
-                    if eval('(self.device.single_fp_config &' +
-                            ' cl.device_fp_config.' +
-                            v + ') == cl.device_fp_config.' + v):
-                        if __VERBOSE__:
-                            print v
-                except AttributeError as ae:
-                    if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
-                        capable = False
-                    if __VERBOSE__:
-                        print v, 'is not supported in OpenCL C 1.2.\n',
-                        print '   Exception catched : ', ae
+            prec = "single"
         else:
-            if __VERBOSE__:
-                print "for Double Precision: "
-            if self.device.double_fp_config > 0:
-                for v in ['DENORM', 'INF_NAN', 'FMA',
-                          'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
-                          'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
-                    try:
-                        if eval('(self.device.double_fp_config &' +
-                                ' cl.device_fp_config.' +
-                                v + ') == cl.device_fp_config.' + v):
-                            if __VERBOSE__:
-                                print v
-                    except AttributeError as ae:
-                        if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
-                            capable = False
-                        if __VERBOSE__:
-                            print  v, 'is supported in OpenCL C 1.2.\n',
-                            print '   Exception catched : ', ae
-            else:
+            if self.device.double_fp_config <= 0:
                 raise ValueError("Double Precision is not supported by device")
-        if capable:
+            prec = "double"
+        if __VERBOSE__:
+            print "for " + prec + " Precision: "
+        for v in ['DENORM', 'INF_NAN',
+                  'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF',
+                  'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']:
+            try:
+                if eval('(self.device.' + prec + '_fp_config &' +
+                        ' cl.device_fp_config.' +
+                        v + ') == cl.device_fp_config.' + v):
+                    if __VERBOSE__:
+                        print v
+                else:
+                    if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
+                        fp32_rounding_flag = False
+            except AttributeError as ae:
+                if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT':
+                    fp32_rounding_flag = False
+                if __VERBOSE__:
+                    print v, 'is not supported in OpenCL C 1.2.\n',
+                    print '   Exception catched : ', ae
+        if fp32_rounding_flag:
             opts += " -cl-fp32-correctly-rounded-divide-sqrt"
         return opts
 
@@ -338,6 +339,8 @@ class OpenCLEnvironment(object):
         Parse the sources to handle single and double precision.
         """
         gpu_src = ""
+        if self.precision is PARMES_DOUBLE_GPU:
+            gpu_src += "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n"
         if isinstance(files, list):
             file_list = files
         else:
@@ -370,6 +373,7 @@ class OpenCLEnvironment(object):
             for sf in file_list:
                 print "   - ", sf
             print "Build options : ", self.default_build_opts + options
+            print "Vectorization : ", vector_width
             raise e
         if __VERBOSE__:
             #print options
diff --git a/HySoP/hysop/numerics/tests/__init__.py b/HySoP/hysop/numerics/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/HySoP/hysop/tools/timers.py b/HySoP/hysop/tools/timers.py
index a3861eb69..418bfb2ee 100644
--- a/HySoP/hysop/tools/timers.py
+++ b/HySoP/hysop/tools/timers.py
@@ -6,6 +6,7 @@ thanks to decorator.
 """
 from parmepy.mpi import MPI, main_rank
 from parmepy import __VERBOSE__
+ftime = MPI.Wtime
 
 
 def timed_function(f):
@@ -57,9 +58,9 @@ class FunctionTimer(object):
         @return wrapped function.
         """
         def f(*args, **kwargs):
-            t0 = MPI.Wtime()
+            t0 = ftime()
             res = func(*args, **kwargs)
-            t = MPI.Wtime() - t0
+            t = ftime() - t0
             self.t += t
             self.ncalls += 1
             #self.times.append(t)
@@ -121,9 +122,12 @@ class Timer(object):
         t._compute_summary()
         self.t += t.t
         for t_k in t.f_timers.keys():
-            t.f_timers[t_k].func_name = \
-                t._obj.name + "_" + t.f_timers[t_k].func_name
-            self.f_timers[t._obj.name + "_" + t_k] = t.f_timers[t_k]
+            if isinstance(t.f_timers[t_k], Timer):
+                self.f_timers[t_k] = t.f_timers[t_k].f_timers
+            else:
+                t.f_timers[t_k].func_name = \
+                    t._obj.name + "_" + t.f_timers[t_k].func_name
+                self.f_timers[t._obj.name + "_" + t_k] = t.f_timers[t_k]
         return self
 
     def getFunctionTimer(self, func):
@@ -148,20 +152,35 @@ class Timer(object):
         """
         self.f_timers[ft.func_name] = ft
 
+    def addSubTimer(self, t, prefix):
+        if len(t.f_timers.keys()) > 0:
+            self.f_timers[t._obj.name + prefix] = t
+
+
     def _compute_summary(self):
         """
-        Copute a summary of the different functions referenced herein.
+        Compute a summary of the different functions referenced herein.
         """
         self.t = 0.
         for f in self.f_timers.keys():
-            self.t += self.f_timers[f].t
+            try:
+                self.t += self.f_timers[f].t
+            except AttributeError:
+                # self.f_timers[f] is a sub-Timer times are details of
+                # functions times. these times are already in sum.
+                pass
 
     def __str__(self):
         self._compute_summary()
         s = "[" + str(main_rank) + "] Time spent in "
         s += self._obj.name + ' = ' + str(self.t) + ' s'
         for f in sorted(self.f_timers.keys()):
-            s += "\n" + str(self.f_timers[f])
+            if isinstance(self.f_timers[f], dict):
+                s += "\n[" + str(main_rank) + "]                |-- " + f
+                for sf in sorted(self.f_timers[f].keys()):
+                    s += "\n" + str(self.f_timers[f][sf]).replace('|--', '    |--')
+            else:
+                s += "\n" + str(self.f_timers[f])
         return s
 
     def toString(self):
@@ -171,7 +190,12 @@ class Timer(object):
         """
         h = self._obj.name.replace(' ', '_') + ' '
         t = str(self.t) + ' '
-        l = sorted(self.f_timers.keys())
-        h += ' '.join(l) + ' '
-        t += ' '.join([str(self.f_timers[k].t) for k in l]) + ' '
+        for f in sorted(self.f_timers.keys()):
+            if isinstance(self.f_timers[f], dict):
+                for sf in sorted(self.f_timers[f].keys()):
+                    h += (f + sf).replace(' ', '_') + ' '
+                    t += str(self.f_timers[f][sf].t) + ' '
+            else:
+                h += f.replace(' ', '_') + ' '
+                t += str(self.f_timers[f].t) + ' '
         return h, t
diff --git a/HySoP/setup.py.in b/HySoP/setup.py.in
index 436340c16..2db7c5451 100644
--- a/HySoP/setup.py.in
+++ b/HySoP/setup.py.in
@@ -30,6 +30,7 @@ packages_for_tests = ['parmepy.domain.tests',
                       'parmepy.operator.tests',
                       'parmepy.tools.tests',
                       'parmepy.problem.tests',
+                      'parmepy.numerics.tests',
                       ]
 
 if("@USE_MPI@" is "ON"):
-- 
GitLab