From 5134bc319b746c3e4b5e7eccd57443b1069330f9 Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr>
Date: Thu, 16 Apr 2015 16:06:10 +0200
Subject: [PATCH] GPU code improvement for large problems

---
 HySoP/hysop/gpu/cl_src/kernels/advection.cl   |  54 +++--
 .../cl_src/kernels/advection_and_remeshing.cl |  84 +++++---
 .../kernels/advection_and_remeshing_noVec.cl  |  17 +-
 .../advection_euler_and_remeshing_noVec.cl    |  70 +++---
 .../cl_src/kernels/advection_euler_noVec.cl   |  28 ++-
 .../gpu/cl_src/kernels/advection_noVec.cl     |  35 ++-
 HySoP/hysop/gpu/cl_src/kernels/diffusion.cl   | 203 ++++++++++--------
 HySoP/hysop/gpu/cl_src/kernels/remeshing.cl   |  17 +-
 .../gpu/cl_src/kernels/remeshing_noVec.cl     |  17 +-
 .../hysop/gpu/cl_src/kernels/transpose_xy.cl  |  42 ++--
 .../gpu/cl_src/kernels/transpose_xy_noVec.cl  |  61 ++++--
 .../hysop/gpu/cl_src/kernels/transpose_xz.cl  |  36 ++--
 .../gpu/cl_src/kernels/transpose_xz_noVec.cl  |  36 ++--
 .../gpu/cl_src/kernels/transpose_xz_slice.cl  |  36 ++--
 .../kernels/transpose_xz_slice_noVec.cl       |  57 ++---
 HySoP/hysop/gpu/config_default.py             |  91 +++++---
 HySoP/hysop/gpu/config_k20m.py                |  87 +++++---
 HySoP/hysop/gpu/gpu_diffusion.py              |   7 +-
 HySoP/hysop/gpu/gpu_kernel.py                 |  11 +-
 HySoP/hysop/gpu/gpu_particle_advection.py     |   8 +-
 .../tests/test_advection_randomVelocity.py    |  34 ++-
 HySoP/hysop/gpu/tests/test_transposition.py   |   8 +
 22 files changed, 668 insertions(+), 371 deletions(-)

diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection.cl b/HySoP/hysop/gpu/cl_src/kernels/advection.cl
index 29890ba9c..2e9e341e3 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection.cl
@@ -5,19 +5,23 @@
 
 /**
  * Computes particles positions from the velocity field.
- * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group.
- * Each work-item computes <code>NB_I/WI_NB</code> particles positions.
- * Particle are computed through OpenCL vector types of lenght 2, 4 or 8.
+ * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. Computations of 1D problems are placed in loops over gidY and gidZ to adjust local workload and handle the work-item maximum size.
+ * Each work-item computes <code>NB_I/WI_NB</code> particles positions in each 1D problem.
+ * Particle are computed through OpenCL vector types of length 2, 4 or 8.
  * Velocity data are copied to a local buffer as a cache.
  *
  * @param gvelo Velocity.
  * @param ppos Particle position.
  * @param dt Time step.
- * @param min_position Domain lower coordinate.
- * @param dx Space step.
+ * @param mesh Mesh description.
+ * @param inv_v_dx_y velocity grid 1/dy
+ * @param inv_v_dx_z velocity grid 1/dz
  *
  * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>NB_X</code>, <code>NB_Y</code>, <code>NB_Z</code> : points number in physical space directions.
  * @remark <code>WI_NB</code> corresponds to the work-item number.
+ * @remark <code>ADVEC_IS_MULTISCALE</code> is a flag for multiscale.
+ * @remark <code>V_NB_I</code>, <code>V_NB_II</code>, <code>V_NB_III</code> : points number for velocity grid in directions from 1st varying index to last.
  * @remark <code>__N__</code> is expanded at compilation time by vector width.
  * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
  * @see hysop.gpu.tools.parse_file
@@ -31,29 +35,43 @@ __kernel void advection_kernel(__global const float* gvelo,
 			       __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
   float__N__ p;				/* Particle position */
-  uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __local float velocity_cache[V_NB_I]; /* Velocity cache */
 
+  for(gidZ=get_global_id(2);
+#ifdef NB_Z
+      gidZ<NB_III;
+#else
+      gidZ<=get_global_id(2); // Single element loop
+#endif
+      gidZ+=get_global_size(2)) {
+    for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+
+      // 1D problem computations
+      line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
 
 #if ADVEC_IS_MULTISCALE
-  fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
+      fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
 #else
-  fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh);
+      fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh);
 #endif
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      for(i=gidX*__N__; i<NB_I; i+=WI_NB*__N__) {
+	/* Compute position */
+	p = advection(i, dt, velocity_cache, mesh);
+	/* Store result */
+	vstore__N__(p, (i+line_index)/__N__, ppos);
+      }
 
-  for(i=gidX*__N__; i<NB_I; i+=WI_NB*__N__)
-    {
-      /* Compute position */
-      p = advection(i, dt, velocity_cache, mesh);
-      /* Store result */
-      vstore__N__(p, (i+line_index)/__N__, ppos);
+      barrier(CLK_LOCAL_MEM_FENCE);
     }
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl
index b1b998913..c9fb52ebd 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl
@@ -1,11 +1,11 @@
 /**
  * @file advection_and_remeshing.cl
- * Advection and remeshing kernel.
+ * Advection and remeshing kernel, vectorized version.
  */
 
 /**
- * Performs advection and then remeshing of the particles' scalar.
- * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group.
+ * Performs advection and then remeshing of the particles scalar.
+ * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. Computations of 1D problems are placed in loops over gidY and gidZ to adjust local workload and handle the work-item maximum size.
  * Each work-item computes NB_I/WI_NB particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles.
  * Particle are computed through OpenCL vector types of lenght 2, 4 or 8.
  * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer.
@@ -14,11 +14,15 @@
  * @param pscal Particle scalar
  * @param gscal Grid scalar
  * @param dt Time step
- * @param min_position Domain lower coordinate
- * @param dx Space step
+ * @param mesh Mesh description.
+ * @param inv_v_dx_y velocity grid 1/dy
+ * @param inv_v_dx_z velocity grid 1/dz
  *
  * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last.
+ * @remark <code>NB_X</code>, <code>NB_Y</code>, <code>NB_Z</code> : points number in physical space directions.
  * @remark <code>WI_NB</code> corresponds to the work-item number.
+ * @remark <code>ADVEC_IS_MULTISCALE</code> is a flag for multiscale.
+ * @remark <code>V_NB_I</code>, <code>V_NB_II</code>, <code>V_NB_III</code> : points number for velocity grid in directions from 1st varying index to last.
  * @remark <code>__N__</code> is expanded at compilation time by vector width.
  * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component.
  * @remark <code>__RCOMP_I</code> flag is for instruction expansion for the different remeshed components.
@@ -36,48 +40,62 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
 				      __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
   float__N__ p;			/* Particle position */
   __RCOMP_I float__N__ s__ID__; /* Particle scalar */
-  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */
   __local float velocity_cache[V_NB_I]; /* Velocity cache */
 
+  for(gidZ=get_global_id(2);
+#ifdef NB_Z
+      gidZ<NB_III;
+#else
+      gidZ<=get_global_id(2); // Single element loop
+#endif
+      gidZ+=get_global_size(2)) {
+    for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+
+      // 1D problem computations
+      line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
+
 #if ADVEC_IS_MULTISCALE
-  fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
+      fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
 #else
-  fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh);
+      fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh);
 #endif
 
-  for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__))
-    {
-      /* Initialize result buffer */
-      __RCOMP_Igscal_loc__ID__[noBC_id(i+__NN__)] = 0.0;
-    }
+      for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) {
+	/* Initialize result buffer */
+	__RCOMP_Igscal_loc__ID__[noBC_id(i+__NN__)] = 0.0;
+      }
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__)
-    {
-      /* Read Particle scalar */
-      __RCOMP_Is__ID__ = vload__N__((i + line_index)/__N__, pscal__ID__);
-      /* Compute particle position */
-      p = advection(i, dt, velocity_cache, mesh);
-      /* Remesh particle */
-      remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh);
-    }
+      for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__) {
+	/* Read Particle scalar */
+	__RCOMP_Is__ID__ = vload__N__((i + line_index)/__N__, pscal__ID__);
+	/* Compute particle position */
+	p = advection(i, dt, velocity_cache, mesh);
+	/* Remesh particle */
+	remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh);
+      }
+
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+      for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) {
+	/* Store result */
+	__RCOMP_Ivstore__N__((float__N__)(gscal_loc__ID__[noBC_id(i+__NN__)],
+					  ), (i + line_index)/__N__, gscal__ID__);
+      }
 
-  for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__))
-    {
-      /* Store result */
-      __RCOMP_Ivstore__N__((float__N__)(gscal_loc__ID__[noBC_id(i+__NN__)],
-			       ), (i + line_index)/__N__, gscal__ID__);
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
     }
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl
index 19dc24f54..5759dc6f5 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl
@@ -36,16 +36,23 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
 				      __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
   float p;			/* Particle position */
   __RCOMP_I float s__ID__;	/* Particle scalar */
-  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */
   __local float velocity_cache[V_NB_I]; /* Velocity cache */
 
+#ifdef NB_Z
+  for(gidZ=get_global_id(2); gidZ<NB_III; gidZ+=get_global_size(2)) {
+#else
+  gidZ=get_global_id(2); {
+#endif
+  for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+  line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
 
 #if ADVEC_IS_MULTISCALE
   fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
@@ -80,4 +87,8 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
       /* Store result */
       __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)];
     }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl
index 47fda2027..99565be69 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl
@@ -1,11 +1,11 @@
 /**
  * @file advection_and_remeshing.cl
- * Advection and remeshing kernel.
+ * Euler advection and remeshing kernel.
  */
 
 /**
- * Performs advection and then remeshing of the particles' scalar.
- * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group.
+ * Performs advection and then remeshing of the particles scalar.
+ * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. Computations of 1D problems are placed in loops over gidY and gidZ to adjust local workload and handle the work-item maximum size.
  * Each work-item computes NB_I/WI_NB particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles.
  * Particle are computed through OpenCL vector types of lenght 2, 4 or 8.
  * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer.
@@ -33,41 +33,53 @@ __kernel void advection_and_remeshing(__global const float* gvelo,
 				      __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
   float p,c;			/* Particle position */
   __RCOMP_I float s__ID__;	/* Particle scalar */
-  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */
 
-  for(i=gidX; i<NB_I; i+=(WI_NB))
-    {
-      /* Initialize result buffer */
-      __RCOMP_Igscal_loc__ID__[noBC_id(i)] = 0.0;
-    }
+  for(gidZ=get_global_id(2);
+#ifdef NB_Z
+      gidZ<NB_III;
+#else
+      gidZ<=get_global_id(2); // Single element loop
+#endif
+      gidZ+=get_global_size(2)) {
+    for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+      // 1D computations
+      line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
 
-  for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1)
-    {
-      /* Read Particle scalar */
-      __RCOMP_Is__ID__ = pscal__ID__[i + line_index];
-      /* Compute particle position */
-      c = fma(i, mesh->dx.x, mesh->min_position);
-      p = fma(dt, gvelo[i+line_index], c);
-      /* Remesh particle */
-      remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh);
-    }
+      for(i=gidX; i<NB_I; i+=(WI_NB)) {
+	/* Initialize result buffer */
+	__RCOMP_Igscal_loc__ID__[noBC_id(i)] = 0.0;
+      }
+
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1) {
+	/* Read Particle scalar */
+	__RCOMP_Is__ID__ = pscal__ID__[i + line_index];
+	/* Compute particle position */
+	c = fma(i, mesh->dx.x, mesh->min_position);
+	p = fma(dt, gvelo[i+line_index], c);
+	/* Remesh particle */
+	remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh);
+      }
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX; i<NB_I; i+=(WI_NB))
-    {
-      /* Store result */
-      __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)];
+      for(i=gidX; i<NB_I; i+=(WI_NB)) {
+	/* Store result */
+	__RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)];
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
     }
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl
index 091f57060..df90575a2 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl
@@ -23,15 +23,29 @@ __kernel void advection_kernel(__global const float* gvelo,
 			       __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
-  uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
   float c;
 
-  for(i=gidX; i<NB_I; i+=WI_NB)
-    {
-      c = fma(i, mesh->dx.x, mesh->min_position);
-      ppos[i+line_index] =  fma(dt, gvelo[i+line_index], c);
+  for(gidZ=get_global_id(2);
+#ifdef NB_Z
+      gidZ<NB_III;
+#else
+      gidZ<=get_global_id(2); // Single element loop
+#endif
+      gidZ+=get_global_size(2)) {
+    for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+
+      //1D computations
+      line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
+
+      for(i=gidX; i<NB_I; i+=WI_NB) {
+	c = fma(i, mesh->dx.x, mesh->min_position);
+	ppos[i+line_index] =  fma(dt, gvelo[i+line_index], c);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
     }
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl
index d116c3cce..78ca64d61 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl
@@ -26,25 +26,38 @@ __kernel void advection_kernel(__global const float* gvelo,
 			       __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
-  uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __local float velocity_cache[V_NB_I]; /* Velocity cache */
 
-#if ADVEC_IS_MULTISCALE
-  fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
+  for(gidZ=get_global_id(2);
+#ifdef NB_Z
+      gidZ<NB_III;
 #else
-  fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh);
+      gidZ<=get_global_id(2);  // Single element loop
 #endif
+      gidZ+=get_global_size(2)) {
+    for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+
+      // 1D computation
+      line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
 
+#if ADVEC_IS_MULTISCALE
+      fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh);
+#else
+      fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh);
+#endif
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+      /* Synchronize work-group */
+      barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=gidX; i<NB_I; i+=WI_NB)
-    {
-      ppos[i+line_index] = advection(i, dt, velocity_cache, mesh);
+      for(i=gidX; i<NB_I; i+=WI_NB) {
+	ppos[i+line_index] = advection(i, dt, velocity_cache, mesh);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
     }
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl
index 7bd4ef31c..edbfa572a 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl
@@ -1,4 +1,26 @@
-
+/**
+ * @file diffusion.cl
+ * Diffusion kernel.
+ */
+
+/**
+ * Computes diffusion operator with finite differences.
+ * Stencil computation is performed within a 2D index space of size <code>TILE_SIZE</code> by a work-group. The 3rd direction is traversed in a loop for data reuse.
+ *
+ * @param scal_in Input scalar field
+ * @param ghostsX Ghosts array if X is a communication direction
+ * @param ghostsY Ghosts array if Y is a communication direction
+ * @param ghostsZ Ghosts array if Z is a communication direction
+ * @param scal_out Output scalar field
+ * @param nudt Diffusion coefficient
+ * @param dx Mesh space step
+ *
+ * @remark <code>NB_X</code>, <code>NB_Y</code>, <code>NB_Z</code> : points number in physical space directions.
+ * @remark <code>NB_PART</code> Particles number per work-item in computing direction
+ * @remark <code>CUT_DIT_X</code>, <code>CUT_DIT_Y</code> and <code>CUT_DIT_Z</code> : flags for communication direction
+ * @remark <code>NB_GROUPS_I</code> and <code>NB_GROUPS_II</code> : tiles number in X and Y directions.
+ * @remark <code>L_WIDTH</code> : work-item number in tile.
+ */
 
 __kernel void diffusion(__global const float* scal_in,
 #if CUT_DIR_X == 1
@@ -11,47 +33,104 @@ __kernel void diffusion(__global const float* scal_in,
 			__global const float* ghostsZ,
 #endif
 			__global float* scal_out,
-			float nudt, float4 dx)
+			float nudt,
+			float4 dx)
 {
-  int t_gidX = get_group_id(0);
-  int t_gidY = get_group_id(1);
-  int lidX = get_local_id(0);
-  int lidY = get_local_id(1);
-  int gidX = t_gidX*TILE_SIZE + lidX;	/* OpenCL work-item global index (X) */
-  int gidY = t_gidY*TILE_SIZE + lidY; /* OpenCL work-item global index (Y) */
-  int gidZ;
-  float cx = nudt/(dx.x*dx.x);
-  float cy = nudt/(dx.y*dx.y);
-  float cz = nudt/(dx.z*dx.z);
+  int t_gidX, t_gidY;
+  int lidX, lidY;
+  int gidX, gidY, gidZ;
+  float cx, cy, cz;
   float scal_z_m[NB_PART];
   float scal_z[NB_PART];
   float scal_z_p[NB_PART];
   float s;
   uint i;
 
-  for(i=0;i<NB_PART;i++){
+  __local float tile_XY[TILE_SIZE+2][TILE_SIZE+2];
+
+  for (t_gidX=get_group_id(0); t_gidX<NB_GROUPS_I; t_gidX+=get_num_groups(0)) {
+    for (t_gidY=get_group_id(1); t_gidY<NB_GROUPS_II; t_gidY+=get_num_groups(1)) {
+
+      // Tile computation
+      lidX = get_local_id(0);
+      lidY = get_local_id(1);
+      gidX = t_gidX*TILE_SIZE + lidX; /* OpenCL work-item global index (X) */
+      gidY = t_gidY*TILE_SIZE + lidY; /* OpenCL work-item global index (Y) */
+      cx = nudt/(dx.x*dx.x);
+      cy = nudt/(dx.y*dx.y);
+      cz = nudt/(dx.z*dx.z);
+
+      for(i=0;i<NB_PART;i++) {
 #if CUT_DIR_Z == 1
-    scal_z_m[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X + NB_X*NB_Y];
+	scal_z_m[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X + NB_X*NB_Y];
 #else
-    scal_z_m[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (NB_Z-1)*NB_X*NB_Y];
+	scal_z_m[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (NB_Z-1)*NB_X*NB_Y];
 #endif
-    scal_z[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X];
-  }
+	scal_z[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X];
+      }
 
-  /* scal_z[nb_parts] */
-  /* for i in xrange(nb_parts) */
-  /* lidY+i*L_WIDTH */
-  /* gidY+i*L_WIDTH */
+      lidX += 1;
+      lidY += 1;
 
-  __local float tile_XY[TILE_SIZE+2][TILE_SIZE+2];
+      // loop over Z indices but last.
+      for (gidZ=0; gidZ<(NB_Z-1); gidZ++) {
+	for(i=0;i<NB_PART;i++) {
+	  // fill the tile
+	  tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
 
-  lidX += 1;
-  lidY += 1;
+	  /* // fill tile edges */
+#if CUT_DIR_X == 1
+	  tile_XY[0][lidY+i*L_WIDTH] = (t_gidX*TILE_SIZE>=1) ? scal_in[t_gidX*TILE_SIZE-1 + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] : ghostsX[1 + (gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y];
+	  tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = ((t_gidX+1)*TILE_SIZE<NB_X) ? scal_in[(t_gidX+1)*TILE_SIZE + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]: ghostsX[(gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y];
+#else
+	  tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
+	  tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
+#endif
+	}
+#if CUT_DIR_Y == 1
+	tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + (t_gidY*TILE_SIZE-1)*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + NB_X + gidZ*NB_X*2];
+	tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE<NB_Y) ? scal_in[gidX + (t_gidY+1)*TILE_SIZE*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + gidZ*NB_X*2];
+#else
+	tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
+	tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
+#endif
+
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	for(i=0;i<NB_PART;i++) {
+	  /* get scalar value in Z direction */
+	  scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (gidZ+1)*NB_X*NB_Y];
+
+	  // Compute stencil
+	  // central point
+	  s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz));
+
+	  s += cz*(scal_z_m[i] + scal_z_p[i]);
+
+	  s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1];
+	  s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1];
+	  s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH];
+	  s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH];
 
-  // loop over Z indices but last.
-  for (gidZ=0; gidZ<(NB_Z-1); gidZ++)
-    {
-      for(i=0;i<NB_PART;i++){
+	  // write result
+	  scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s;
+	}
+
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	for(i=0;i<NB_PART;i++) {
+	  // Shift Z values
+	  scal_z_m[i] = scal_z[i];
+	  scal_z[i] = scal_z_p[i];
+	}
+      }
+
+      // Compute last point (from ghosts)
+      gidZ = NB_Z - 1;
+
+      for(i=0;i<NB_PART;i++) {
 	// fill the tile
 	tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
 
@@ -75,12 +154,16 @@ __kernel void diffusion(__global const float* scal_in,
       /* Synchronize work-group */
       barrier(CLK_LOCAL_MEM_FENCE);
 
-      for(i=0;i<NB_PART;i++){
-	/* get scalar value in Z direction */
-	scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (gidZ+1)*NB_X*NB_Y];
+      for(i=0;i<NB_PART;i++) {
+	/* // get scalar value in Z direction */
+#if CUT_DIR_Z == 1
+	scal_z_p[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X];
+#else
+	scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X];
+#endif
 
 	// Compute stencil
-	// central point
+	/* // central point */
 	s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz));
 
 	s += cz*(scal_z_m[i] + scal_z_p[i]);
@@ -93,64 +176,8 @@ __kernel void diffusion(__global const float* scal_in,
 	// write result
 	scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s;
       }
-
       /* Synchronize work-group */
       barrier(CLK_LOCAL_MEM_FENCE);
-
-      for(i=0;i<NB_PART;i++){
-	// Shift Z values
-	scal_z_m[i] = scal_z[i];
-	scal_z[i] = scal_z_p[i];
-      }
     }
-
-  // Compute last point (from ghosts)
-  gidZ = NB_Z - 1;
-
-  for(i=0;i<NB_PART;i++){
-    // fill the tile
-    tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
-
-    /* // fill tile edges */
-#if CUT_DIR_X == 1
-    tile_XY[0][lidY+i*L_WIDTH] = (t_gidX*TILE_SIZE>=1) ? scal_in[t_gidX*TILE_SIZE-1 + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] : ghostsX[1 + (gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y];
-    tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = ((t_gidX+1)*TILE_SIZE<NB_X) ? scal_in[(t_gidX+1)*TILE_SIZE + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]: ghostsX[(gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y];
-#else
-    tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
-    tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
-#endif
-  }
-#if CUT_DIR_Y == 1
-  tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + (t_gidY*TILE_SIZE-1)*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + NB_X + gidZ*NB_X*2];
-  tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE<NB_Y) ? scal_in[gidX + (t_gidY+1)*TILE_SIZE*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + gidZ*NB_X*2];
-#else
-  tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
-  tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
-#endif
-
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  for(i=0;i<NB_PART;i++){
-    /* // get scalar value in Z direction */
-#if CUT_DIR_Z == 1
-    scal_z_p[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X];
-#else
-    scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X];
-#endif
-
-    // Compute stencil
-    /* // central point */
-    s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz));
-
-    s += cz*(scal_z_m[i] + scal_z_p[i]);
-
-    s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1];
-    s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1];
-    s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH];
-    s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH];
-
-    // write result
-    scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s;
   }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl
index 2e2774527..809c5ad32 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl
@@ -31,16 +31,24 @@ __kernel void remeshing_kernel(__global const float* ppos,
 			       __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   //  float invdx = 1.0/dx;         /* Space step inverse */
   uint i;			/* Particle index in 1D problem */
   float__N__ p;			/* Particle position */
   __RCOMP_I float__N__ s__ID__; /* Particle scalar */
-  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */
 
+#ifdef NB_Z
+  for(gidZ=get_global_id(2); gidZ<NB_III; gidZ+=get_global_size(2)) {
+#else
+  gidZ=get_global_id(2); {
+#endif
+  for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+  line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
+
   for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__))
     {
       /* Initialize result buffer */
@@ -69,4 +77,7 @@ __kernel void remeshing_kernel(__global const float* ppos,
       __RCOMP_Ivstore__N__((float__N__)(gscal_loc__ID__[noBC_id(i+__NN__)],
 			       ),(i + line_index)/__N__, gscal__ID__);
   }
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl
index c8dbd3a8a..15db5730c 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl
@@ -31,15 +31,23 @@ __kernel void remeshing_kernel(__global const float* ppos,
 			       __constant struct AdvectionMeshInfo* mesh)
 {
   uint gidX = get_global_id(0);	/* OpenCL work-itme global index (X) */
-  uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */
-  uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */
+  uint gidY; /* OpenCL work-itme global index (Y) */
+  uint gidZ; /* OpenCL work-itme global index (Z) */
   uint i;			/* Particle index in 1D problem */
   float p;			/* Particle position */
   __RCOMP_I float s__ID__;      /* Particle scalar */
-  uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */
+  uint line_index; /* Current 1D problem index */
 
   __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */
 
+#ifdef NB_Z
+  for(gidZ=get_global_id(2); gidZ<NB_III; gidZ+=get_global_size(2)) {
+#else
+  gidZ=get_global_id(2); {
+#endif
+  for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) {
+  line_index = gidY*NB_I+ gidZ*NB_I*NB_II;
+
   for(i=gidX; i<NB_I; i+=WI_NB)
     {
       /* Initialize result buffer */
@@ -67,4 +75,7 @@ __kernel void remeshing_kernel(__global const float* ppos,
       /* Store result */
       __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)];
   }
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
index 8e1725737..c430da22e 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
@@ -39,28 +39,40 @@ __kernel void transpose_xy(__global const float* in,
   uint group_id_y;			/* Work-group coordinate in global space index Y */
   uint lid_x = get_local_id(0);
   uint lid_y = get_local_id(1);
+
+  unint xIndex, yIndex, zIndex;
+  uint index_in, index_out;
+  uint gidI, gidII, i;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
+
+#ifdef NB_Z
+  for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) {
+#else
+  zIndex=get_global_id(2); {
+#endif
+  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+  for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
+
   /* Use of diagonal coordinates */
 #if NB_II == NB_I
-  group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
-  group_id_y = get_group_id(0);
+  group_id_x = (gidI + gidII) % NB_GROUPS_I;
+  group_id_y = gidI;
 #else
-  uint bid = get_group_id(0) + get_group_id(1) * get_num_groups(0);
-  group_id_y = bid%get_num_groups(1);
-  group_id_x = ((bid/get_num_groups(1)) + group_id_y)%get_num_groups(0);
+  uint bid = gidI + gidII * NB_GROUPS_I;
+  group_id_y = bid%NB_GROUPS_II;
+  group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__;
-  uint yIndex = group_id_y * TILE_DIM_XY + lid_y;
-  uint zIndex = get_global_id(2);
-  uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
+  xIndex = group_id_x * TILE_DIM_XY + lid_x;
+  yIndex = group_id_y * TILE_DIM_XY + lid_y;
+  index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
 
   /* Global output index */
   xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__;
   yIndex = group_id_x * TILE_DIM_XY + lid_y;
-  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
-
-  __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
+  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
   for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
     {
@@ -79,5 +91,9 @@ __kernel void transpose_xy(__global const float* in,
 			  );
       vstore__N__(temp, (index_out + i*NB_I)/__N__, out);
     }
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
+}
 }
-
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
index 6c0f64489..dde7ba936 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
@@ -38,42 +38,57 @@ __kernel void transpose_xy(__global const float* in,
   uint group_id_y;			/* Work-group coordinate in global space index Y */
   uint lid_x = get_local_id(0);
   uint lid_y = get_local_id(1);
+
+  uint xIndex, yIndex, zIndex;
+  uint index_in, index_out;
+  uint gidI, gidII, i;
+
+  __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
+
+#ifdef NB_Z
+  for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) {
+#else
+  zIndex=get_global_id(2); {
+#endif
+  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+  for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
+
   /* Use of diagonal coordinates */
 #if NB_II == NB_I
-  group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
-  group_id_y = get_group_id(0);
+  group_id_x = (gidI + gidII) % NB_GROUPS_I;
+  group_id_y = gidI;
 #else
-  uint bid = get_group_id(0) + get_group_id(1) * get_num_groups(0);
-  group_id_y = bid%get_num_groups(1);
-  group_id_x = ((bid/get_num_groups(1)) + group_id_y)%get_num_groups(0);
+  uint bid = gidI + gidII * NB_GROUPS_I;
+  group_id_y = bid%NB_GROUPS_II;
+  group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XY + lid_x;
-  uint yIndex = group_id_y * TILE_DIM_XY + lid_y;
-  uint zIndex = get_global_id(2);
-  uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
+  xIndex = group_id_x * TILE_DIM_XY + lid_x;
+  yIndex = group_id_y * TILE_DIM_XY + lid_y;
+  index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
 
   /* Global output index */
   xIndex = group_id_y * TILE_DIM_XY + lid_x;
   yIndex = group_id_x * TILE_DIM_XY + lid_y;
-  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
-
-  __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
+  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
-    {
-      /* Fill the tile */
-      tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
-    }
+  for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
+    /* Fill the tile */
+    tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
+  }
 
   /* Synchronize work-group */
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
-    {
-      /* Write transposed data */
-      out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
-    }
-}
+  for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
+    /* Write transposed data */
+    out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
+  }
 
+  /* Synchronize work-group */
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
+}
+}
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
index 8adc81f43..294925000 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
@@ -32,28 +32,36 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_y = get_local_id(1);
   uint lid_z = get_local_id(2);
 
+  /* Global input index for work-item */
+  uint xIndex, yIndex, zIndex;
+  uint index_in, index_out;
+  uint gidI, gidIII, j;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
+  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+
   /* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
-  group_id_z = get_group_id(0);
+  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+  group_id_z = gidI;
 #else
-  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
-  group_id_z = bid%get_num_groups(2);
-  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+  uint bid = gidI + gidIII * NB_GROUPS_I;
+  group_id_z = bid%NB_GROUPS_III;
+  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
-  uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y;
-  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
   /* Global output index */
   xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
   zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
-
-  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
   for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
     {
@@ -78,4 +86,8 @@ __kernel void transpose_xz(__global const float* in,
 	  vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out);
 	}
     }
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
+}
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
index 96b771b20..37c811cf2 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
@@ -31,28 +31,36 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_y = get_local_id(1);
   uint lid_z = get_local_id(2);
 
+  /* Global input index for work-item */
+  uint xIndex, yIndex, zIndex;
+  uint index_in, index_out;
+  uint gidI, gidIII, j;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
+  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+
   /* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
-  group_id_z = get_group_id(0);
+  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+  group_id_z = gidI;
 #else
-  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
-  group_id_z = bid%get_num_groups(2);
-  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+  uint bid = gidI + gidIII * NB_GROUPS_I;
+  group_id_z = bid%NB_GROUPS_III;
+  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x;
-  uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y;
-  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
   /* Global output index */
   xIndex = group_id_z * TILE_DIM_XZ + lid_x;
   zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
-
-  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
   for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
     {
@@ -74,4 +82,8 @@ __kernel void transpose_xz(__global const float* in,
 	  out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j];
 	}
     }
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
+}
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
index b64960b0d..91475e45d 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
@@ -31,28 +31,36 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_x = get_local_id(0);
   uint lid_z = get_local_id(2);
 
+  /* Global input index for work-item */
+  uint xIndex, yIndex, zIndex;
+  uint index_in, index_out;
+  uint gidI, gidIII, j;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
+  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+
   /* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
-  group_id_z = get_group_id(0);
+  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+  group_id_z = gidI;
 #else
-  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
-  group_id_z = bid%get_num_groups(2);
-  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+  uint bid = gidI + gidIII * NB_GROUPS_I;
+  group_id_z = bid%NB_GROUPS_III;
+  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
   /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
-  uint yIndex = get_global_id(1);
-  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
   /* Global output index */
   xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
   zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
-
-  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
   for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
     {
@@ -71,4 +79,8 @@ __kernel void transpose_xz(__global const float* in,
 			  );
       vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out);
     }
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
+}
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
index 7b06e13fb..9325ccfff 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
@@ -30,41 +30,50 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_x = get_local_id(0);
   uint lid_z = get_local_id(2);
 
+  /* Global input index for work-item */
+  uint xIndex, yIndex, zIndex;
+  uint index_in, index_out;
+  uint gidI, gidIII, j;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+
+  for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
+  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+
   /* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0);
-  group_id_z = get_group_id(0);
+  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+  group_id_z = gidI;
 #else
-  uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0);
-  group_id_z = bid%get_num_groups(2);
-  group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0);
+  uint bid = gidI + gidIII * NB_GROUPS_I;
+  group_id_z = bid%NB_GROUPS_III;
+  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
-  /* Global input index for work-item */
-  uint xIndex = group_id_x * TILE_DIM_XZ + lid_x;
-  uint yIndex = get_global_id(1);
-  uint zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
   /* Global output index */
   xIndex = group_id_z * TILE_DIM_XZ + lid_x;
   zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
-
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      /* Fill the tile */
-      tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
-
-    }
+  for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+    /* Fill the tile */
+    tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
+  }
   /* Synchronize work-group */
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      /* Write transposed data */
-      out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
-    }
+  for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+    /* Write transposed data */
+    out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
+    tile[lid_x][lid_z + j] = 0.0;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
+}
+}
 }
diff --git a/HySoP/hysop/gpu/config_default.py b/HySoP/hysop/gpu/config_default.py
index ee9ffc465..2f87511db 100644
--- a/HySoP/hysop/gpu/config_default.py
+++ b/HySoP/hysop/gpu/config_default.py
@@ -5,43 +5,62 @@ OpenCL kernels default configurations.
 """
 from hysop.constants import np
 FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64
+MAX_GWI = (256, 256, 256)
 
 #build empty dictionaries
 kernels_config = {}
 kernels_config[2] = {FLOAT_GPU: {}, DOUBLE_GPU: {}}
 kernels_config[3] = {FLOAT_GPU: {}, DOUBLE_GPU: {}}
 
-# Copy kernel:
-def copy_space_index_2d(size, t_dim, b_rows, vec):
-    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1)
-    lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
-def copy_space_index_3d(size, t_dim, b_rows, vec):
-    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2]))
-    lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
-# Configs : sources, tile size, block rows, vector size, index space function
-kernels_config[3][FLOAT_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d)
-kernels_config[3][DOUBLE_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d)
-kernels_config[2][FLOAT_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d)
-kernels_config[2][DOUBLE_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d)
+def _clamp_max(w, m):
+    while w > m:
+        w /= 2
+    return int(w)
+
+
+def check_max(t_gwi):
+    return tuple([_clamp_max(w, m) for w, m in zip(t_gwi, MAX_GWI)])
+
+## Copy kernel is replaced by copy function from OpenCL API
+# # Copy kernel:
+# def copy_space_index_2d(size, t_dim, b_rows, vec):
+#     gwi = check_max((size[0] / vec, b_rows * size[1] / t_dim, 1))
+#     lwi = (t_dim / vec, b_rows, 1)
+#     blocks_nb = ((size[0] / vec) / lwi[0],
+#                  (b_rows * size[1] / t_dim) / lwi[1], None)
+#     return gwi, lwi, blocks_nb
+# def copy_space_index_3d(size, t_dim, b_rows, vec):
+#     gwi = check_max((size[0] / vec, b_rows * size[1] / t_dim, size[2]))
+#     lwi = (t_dim / vec, b_rows, 1)
+#     blocks_nb = ((size[0] / vec) / lwi[0],
+#                  (b_rows * size[1] / t_dim) / lwi[1], None)
+#     return gwi, lwi, blocks_nb
+# # Configs : sources, tile size, block rows, vector size, index space function
+# kernels_config[3][FLOAT_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d)
+# kernels_config[3][DOUBLE_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d)
+# kernels_config[2][FLOAT_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d)
+# kernels_config[2][DOUBLE_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d)
 
 # Transpositions kernels:
 # XY transposition
 # Settings are taken from destination layout as current layout.
 # gwi is computed form input layout (appears as transposed layout)
 def xy_space_index_2d(size, t_dim, b_rows, vec):
-    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1)
+    gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, 1))
     lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
+    blocs_nb = ((size[1] / vec) / lwi[0],
+                (b_rows * size[0] / t_dim) / lwi[1], None)
+    return gwi, lwi, blocs_nb
 def xy_space_index_3d(size, t_dim, b_rows, vec):
-    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2]))
+    gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, size[2]))
     lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
+    blocs_nb = ((size[1] / vec) / lwi[0],
+                (b_rows * size[0] / t_dim) / lwi[1], None)
+    return gwi, lwi, blocs_nb
 # Configs : sources, tile size, block rows, is padding, vector size,
 #              index space function
 kernels_config[3][FLOAT_GPU]['transpose_xy'] = \
@@ -57,9 +76,11 @@ kernels_config[2][DOUBLE_GPU]['transpose_xy'] = \
 # Settings are taken from destination layout as current layout.
 # gwi is computed form input layout (appears as transposed layout)
 def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec):
-    gwi = (int(size[2] / vec), int(size[1]), int(b_deph * size[0] / t_dim))
+    gwi = check_max((size[2] / vec, size[1], b_deph * size[0] / t_dim))
     lwi = (t_dim / vec, 1, b_deph)
-    return gwi, lwi
+    blocs_nb = (((size[2]) / vec) / lwi[0], None,
+                (b_deph * (size[0]) / t_dim) / lwi[2])
+    return gwi, lwi, blocs_nb
 # Configs : sources, tile size, block rows, is padding, vector size,
 #              index space function
 kernels_config[3][FLOAT_GPU]['transpose_xz'] = \
@@ -77,11 +98,13 @@ def computational_kernels_index_space(wi, size, vec):
         wi = size[0] // vec
 
     if len(size) == 3:
-        gwi = (int(wi), int(size[1]), int(size[2]))
+        gwi = (int(wi),
+               _clamp_max(size[1], MAX_GWI[1]),
+               _clamp_max(size[2], MAX_GWI[2]))
         lwi = (int(wi), 1, 1)
     else:
-        gwi = (int(wi), int(size[1]))
-        lwi = (int(wi), 1)
+        gwi = (int(wi), _clamp_max(size[1], MAX_GWI[1]), 1)
+        lwi = (int(wi), 1, 1)
     return gwi, lwi
 
 def advection_index_space_3d(size, vec):
@@ -102,7 +125,7 @@ def remeshing_index_space_2d(size, vec):
     return computational_kernels_index_space(wi, size, vec)
 
 def advection_and_remeshing_index_space(size, vec):
-    wi = min(max(32, size[0] / 2), 128)
+    wi = min(size[0] / 2, 128)
     return computational_kernels_index_space(wi, size, vec)
 
 
@@ -172,11 +195,17 @@ kernels_config[2][DOUBLE_GPU]['advec_and_remesh'] = \
       "kernels/advection_and_remeshing_noVec.cl"],
      False, 1, advection_and_remeshing_index_space)
 
+
+def diffusion_space_index_3d(size, nb_part, tile):
+    gwi = check_max((size[0], size[1] / nb_part))
+    lwi = (tile, tile / nb_part)
+    blocs_nb = (size[0] / tile, size[1] / tile)
+    return gwi, lwi, blocs_nb
+
+
 kernels_config[3][FLOAT_GPU]['diffusion'] = \
     (["common.cl", "kernels/diffusion.cl"],
-     16, 1, 1,
-     lambda size, nb_part, tile: ((size[0], size[1] / nb_part),
-                                  (tile, tile / nb_part)))
+     16, 1, 1, diffusion_space_index_3d)
 
 
 kernels_config[3][DOUBLE_GPU]['advec_comm'] = \
diff --git a/HySoP/hysop/gpu/config_k20m.py b/HySoP/hysop/gpu/config_k20m.py
index e9a23eb69..cb39f7d2e 100644
--- a/HySoP/hysop/gpu/config_k20m.py
+++ b/HySoP/hysop/gpu/config_k20m.py
@@ -5,43 +5,60 @@ OpenCL kernels configurations.
 """
 from hysop.constants import np
 FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64
+MAX_GWI = (1024, 1024, 1024)
+
+
+def _clamp_max(w, m):
+    while w > m:
+        w /= 2
+    return int(w)
+
+
+def check_max(t_gwi):
+    return tuple([_clamp_max(w, m) for w, m in zip(t_gwi, MAX_GWI)])
+
 
 #build empty dictionaries
 kernels_config = {}
 kernels_config[2] = {FLOAT_GPU: {}, DOUBLE_GPU: {}}
 kernels_config[3] = {FLOAT_GPU: {}, DOUBLE_GPU: {}}
 
-# Copy kernel:
-def copy_space_index_2d(size, t_dim, b_rows, vec):
-    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1)
-    lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
-def copy_space_index_3d(size, t_dim, b_rows, vec):
-    gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2]))
-    lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
-# Configs : sources, tile size, block rows, vector size, index space function
-kernels_config[3][FLOAT_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_3d)
-kernels_config[3][DOUBLE_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_3d)
-kernels_config[2][FLOAT_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_2d)
-kernels_config[2][DOUBLE_GPU]['copy'] = \
-    ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_2d)
+## Copy kernel is replaced by copy function from OpenCL API
+# # Copy kernel:
+# def copy_space_index_2d(size, t_dim, b_rows, vec):
+#     gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1)
+#     lwi = (t_dim / vec, b_rows, 1)
+#     return gwi, lwi
+# def copy_space_index_3d(size, t_dim, b_rows, vec):
+#     gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2]))
+#     lwi = (t_dim / vec, b_rows, 1)
+#     return gwi, lwi
+# # Configs : sources, tile size, block rows, vector size, index space function
+# kernels_config[3][FLOAT_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_3d)
+# kernels_config[3][DOUBLE_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_3d)
+# kernels_config[2][FLOAT_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_2d)
+# kernels_config[2][DOUBLE_GPU]['copy'] = \
+#     ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_2d)
 
 # Transpositions kernels:
 # XY transposition
 # Settings are taken from destination layout as current layout.
 # gwi is computed form input layout (appears as transposed layout)
 def xy_space_index_2d(size, t_dim, b_rows, vec):
-    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1)
+    gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, 1))
     lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
+    blocs_nb = ((size[1] / vec) / lwi[0],
+                (b_rows * size[0] / t_dim) / lwi[1], None)
+    return gwi, lwi, blocs_nb
 def xy_space_index_3d(size, t_dim, b_rows, vec):
-    gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2]))
+    gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, size[2]))
     lwi = (t_dim / vec, b_rows, 1)
-    return gwi, lwi
+    block_nb = ((size[1] / vec) / lwi[0],
+                (b_rows * size[0] / t_dim) / lwi[1], None)
+    return gwi, lwi, block_nb
 # Configs : sources, tile size, block rows, is padding, vector size,
 #              index space function
 kernels_config[3][FLOAT_GPU]['transpose_xy'] = \
@@ -57,9 +74,11 @@ kernels_config[2][DOUBLE_GPU]['transpose_xy'] = \
 # Settings are taken from destination layout as current layout.
 # gwi is computed form input layout (appears as transposed layout)
 def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec):
-    gwi = (int(size[2] / vec), int(size[1]), int(b_deph * size[0] / t_dim))
+    gwi = check_max((size[2] / vec, size[1], b_deph * size[0] / t_dim))
     lwi = (t_dim / vec, 1, b_deph)
-    return gwi, lwi
+    blocs_nb = ((size[2] / vec) / lwi[0], None,
+                (b_deph * size[0] / t_dim) / lwi[2])
+    return gwi, lwi, blocs_nb
 # Configs : sources, tile size, block rows, is padding, vector size,
 #              index space function
 kernels_config[3][FLOAT_GPU]['transpose_xz'] = \
@@ -77,15 +96,17 @@ def computational_kernels_index_space(wi, size, vec):
         wi = size[0] // vec
 
     if len(size) == 3:
-        gwi = (int(wi), int(size[1]), int(size[2]))
+        gwi = (int(wi),
+               _clamp_max(size[1], MAX_GWI[1]),
+               _clamp_max(size[2], MAX_GWI[2]))
         lwi = (int(wi), 1, 1)
     else:
-        gwi = (int(wi), int(size[1]))
+        gwi = (int(wi), _clamp_max(size[1], MAX_GWI[1]))
         lwi = (int(wi), 1)
     return gwi, lwi
 
 def advection_index_space_3d(size, vec):
-    wi = min(max(64, size[0] / 4), 1024)
+    wi = min(size[0] / 4, 1024)
     return computational_kernels_index_space(wi, size, vec)
 def advection_index_space_2d_SP(size, vec):
     wi = min(size[0] / 8, 1024)
@@ -180,11 +201,17 @@ kernels_config[2][DOUBLE_GPU]['advec_and_remesh'] = \
       "kernels/advection_and_remeshing_noVec.cl"],
      False, 1, advection_and_remeshing_index_space)
 
+
+def diffusion_space_index_3d(size, nb_part, tile):
+    gwi = check_max((size[0], size[1] / nb_part))
+    lwi = (tile, tile / nb_part)
+    blocs_nb = (size[0] / tile, size[1] / tile)
+    return gwi, lwi, blocs_nb
+
+
 kernels_config[3][DOUBLE_GPU]['diffusion'] = \
     (["common.cl", "kernels/diffusion.cl"],
-     16, 4, 1,
-     lambda size, nb_part, tile: ((size[0], size[1] / nb_part),
-                                  (tile, tile / nb_part)))
+     16, 4, 1, diffusion_space_index_3d)
 
 
 kernels_config[3][DOUBLE_GPU]['advec_comm'] = \
diff --git a/HySoP/hysop/gpu/gpu_diffusion.py b/HySoP/hysop/gpu/gpu_diffusion.py
index 0758a4581..c1ba26ec4 100644
--- a/HySoP/hysop/gpu/gpu_diffusion.py
+++ b/HySoP/hysop/gpu/gpu_diffusion.py
@@ -154,10 +154,11 @@ class GPUDiffusion(DiscreteOperator, GPUOperator):
             build_options += " -D CUT_DIR" + S_DIR[d] + "="
             build_options += str(1 if topo.shape[d] > 1 else 0)
 
+        gwi, lwi, blocs_nb = f_space(self.field.data[0].shape,
+                                     nb_part_per_wi, tile_size)
+        build_options += " -D NB_GROUPS_I={0}".format(blocs_nb[0])
+        build_options += " -D NB_GROUPS_II={0}".format(blocs_nb[1])
         prg = self.cl_env.build_src(src, build_options, vec)
-
-        gwi, lwi = f_space(self.field.data[0].shape,
-                           nb_part_per_wi, tile_size)
         self.num_diffusion = KernelLauncher(
             prg.diffusion, self.cl_env.queue, gwi, lwi)
         self.copy = KernelLauncher(cl.enqueue_copy,
diff --git a/HySoP/hysop/gpu/gpu_kernel.py b/HySoP/hysop/gpu/gpu_kernel.py
index abc0e9fbd..411e141ce 100644
--- a/HySoP/hysop/gpu/gpu_kernel.py
+++ b/HySoP/hysop/gpu/gpu_kernel.py
@@ -2,10 +2,10 @@
 @file gpu_kernel.py
 """
 from hysop.constants import debug, S_DIR
-from hysop import __VERBOSE__
+#from hysop import __VERBOSE__
 from hysop.gpu import cl, CL_PROFILE
 from hysop.tools.profiler import FProfiler
-
+__VERBOSE__ = True
 
 class KernelListLauncher(object):
     """
@@ -73,8 +73,11 @@ class KernelListLauncher(object):
         @return OpenCL Event.
         """
         if __VERBOSE__:
-            print "OpenCL kernel:", self.kernel[d].function_name
-            print d, args[0], args[1], args, kwargs
+            try:
+                print "OpenCL kernel:", self.kernel[d].function_name, d, args[0], args[1]
+                #print d, args[0], args[1], args, kwargs
+            except AttributeError:
+                print "OpenCL kernel:", self.kernel[d].__name__
         evt = self.kernel[d](self.queue, *args, **kwargs)
         if CL_PROFILE:
             evt.wait()
diff --git a/HySoP/hysop/gpu/gpu_particle_advection.py b/HySoP/hysop/gpu/gpu_particle_advection.py
index 18a7ed70a..0504675eb 100644
--- a/HySoP/hysop/gpu/gpu_particle_advection.py
+++ b/HySoP/hysop/gpu/gpu_particle_advection.py
@@ -393,7 +393,7 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator):
         while t_dim > resol_tmp[0] or t_dim > resol_tmp[1] or \
                 (resol_tmp[0] % t_dim) > 0 or (resol_tmp[1] % t_dim) > 0:
             t_dim /= 2
-        gwi, lwi = f_space(resol_tmp, t_dim, b_rows, vec)
+        gwi, lwi, blocs_nb = f_space(resol_tmp, t_dim, b_rows, vec)
 
         if is_padding:
             build_options += " -D PADDING_XY=1"
@@ -401,6 +401,8 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator):
             build_options += " -D PADDING_XY=0"
         build_options += " -D TILE_DIM_XY={0}".format(t_dim)
         build_options += " -D BLOCK_ROWS_XY={0}".format(b_rows)
+        build_options += " -D NB_GROUPS_I={0}".format(blocs_nb[0])
+        build_options += " -D NB_GROUPS_II={0}".format(blocs_nb[1])
         build_options += ocl_cte
         prg = self.cl_env.build_src(src, build_options, vec)
         return KernelLauncher(prg.transpose_xy, self.cl_env.queue, gwi, lwi)
@@ -447,7 +449,7 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator):
         while t_dim > resol_tmp[0] or t_dim > resol_tmp[2] or \
                 (resol_tmp[0] % t_dim) > 0 or (resol_tmp[2] % t_dim) > 0:
             t_dim /= 2
-        gwi, lwi = f_space(resol_tmp, t_dim, b_rows, b_deph, vec)
+        gwi, lwi, blocs_nb = f_space(resol_tmp, t_dim, b_rows, b_deph, vec)
         if is_padding:
             build_options += " -D PADDING_XZ=1"
         else:
@@ -455,6 +457,8 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator):
         build_options += " -D TILE_DIM_XZ={0}".format(t_dim)
         build_options += " -D BLOCK_ROWS_XZ={0}".format(b_rows)
         build_options += " -D BLOCK_DEPH_XZ={0}".format(b_deph)
+        build_options += " -D NB_GROUPS_I={0}".format(blocs_nb[0])
+        build_options += " -D NB_GROUPS_III={0}".format(blocs_nb[2])
         build_options += ocl_cte
         prg = self.cl_env.build_src(
             src,
diff --git a/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py b/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py
index 2e4263578..c151b8f3f 100644
--- a/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py
+++ b/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py
@@ -54,7 +54,8 @@ def assertion_2D_withPython(scal, velo, advec, advec_py):
     scal_d.toHost()
 
     advec.finalize()
-    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07)
+    err_m = np.max(np.abs(py_res - scal_d.data[0]))
+    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m)
 
 
 def assertion_3D_withPython(scal, velo, advec, advec_py):
@@ -83,7 +84,8 @@ def assertion_3D_withPython(scal, velo, advec, advec_py):
     scal_d.toHost()
 
     advec.finalize()
-    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07)
+    err_m = np.max(np.abs(py_res - scal_d.data[0]))
+    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m)
 
 d2d = Discretization([33, 33])
 
@@ -707,7 +709,8 @@ def test_rectangular_domain2D():
     py_res = scal_d.data[0].copy()
     scal_d.toHost()
 
-    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07)
+    err_m = np.max(np.abs(py_res - scal_d.data[0]))
+    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m)
     advec.finalize()
 
 
@@ -753,7 +756,9 @@ def test_rectangular_domain3D():
     py_res = scal_d.data[0].copy()
     scal_d.toHost()
 
-    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07)
+    err_m = np.max(np.abs(py_res - scal_d.data[0]))
+    assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) + " " + \
+        str(np.where(np.abs(py_res - scal_d.data[0]) > 1e-07+1e-04*np.abs(scal_d.data[0])))
     advec.finalize()
 
 
@@ -792,7 +797,6 @@ def test_vector_2D():
     scal_d.toDevice()
     velo_d.toDevice()
 
-    print np.max(scal_d.data[0] - scal_d.data[1])
     advec_py.apply(Simulation(tinit=0., tend=0.1, nbIter=1))
     advec.apply(Simulation(tinit=0., tend=0.1, nbIter=1))
 
@@ -800,8 +804,12 @@ def test_vector_2D():
     py_res_Y = scal_d.data[1].copy()
     scal_d.toHost()
 
-    assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07)
-    assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07)
+    err_m = np.max(np.abs(py_res_X - scal_d.data[0]))
+    assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) + " " + \
+        str(np.where(np.abs(py_res_X - scal_d.data[0]) > 1e-07+1e-04*np.abs(scal_d.data[0])))
+    err_m = np.max(np.abs(py_res_Y - scal_d.data[1]))
+    assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07), str(err_m) + " "+ \
+        str(np.where(np.abs(py_res_Y - scal_d.data[1]) > 1e-07+1e-04*np.abs(scal_d.data[1])))
     advec.finalize()
 
 
@@ -854,7 +862,13 @@ def test_vector_3D():
     py_res_Z = scal_d.data[2].copy()
     scal_d.toHost()
 
-    assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07)
-    assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07)
-    assert np.allclose(py_res_Z, scal_d.data[2], rtol=1e-04, atol=1e-07)
+    err_m = np.max(np.abs(py_res_X - scal_d.data[0]))
+    assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) + " " + \
+        str(np.where(np.abs(py_res_X_- scal_d.data[0]) > 1e-07+1e-04*np.abs(scal_d.data[0])))
+    err_m = np.max(np.abs(py_res_Y - scal_d.data[0]))
+    assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07), str(err_m) + " " + \
+        str(np.where(np.abs(py_res_Y_- scal_d.data[1]) > 1e-07+1e-04*np.abs(scal_d.data[1])))
+    err_m = np.max(np.abs(py_res_Z - scal_d.data[0]))
+    assert np.allclose(py_res_Z, scal_d.data[2], rtol=1e-04, atol=1e-07), str(err_m) + " " + \
+        str(np.where(np.abs(py_res_Z_- scal_d.data[2]) > 1e-07+1e-04*np.abs(scal_d.data[2])))
     advec.finalize()
diff --git a/HySoP/hysop/gpu/tests/test_transposition.py b/HySoP/hysop/gpu/tests/test_transposition.py
index c6da15f27..ddb81e299 100644
--- a/HySoP/hysop/gpu/tests/test_transposition.py
+++ b/HySoP/hysop/gpu/tests/test_transposition.py
@@ -372,6 +372,9 @@ def test_transposition_xz3Dslice():
            int(resolution[1]),
            int(resolution[2] / 4))
     lwi = (16, 1, 4)
+    build_options += " -D NB_GROUPS_I=" + str(resolution[0] / lwi[0])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
+
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
@@ -425,6 +428,8 @@ def test_transposition_xz3Dslice_rect():
            int(resolution[1]),
            int(resolution[2] / 4))
     lwi = (16, 1, 4)
+    build_options += " -D NB_GROUPS_I=" + str(resolution[0] / lwi[0])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz_x = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
@@ -437,6 +442,8 @@ def test_transposition_xz3Dslice_rect():
            int(resolution[1]),
            int(resolution[0] / 4))
     lwi = (16, 1, 4)
+    build_options += " -D NB_GROUPS_I=" + str(resolution[2] / lwi[0])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2])
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz_z = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
@@ -463,6 +470,7 @@ def test_transposition_xz3Dslice_rect():
     cl_env.queue.finish()
     cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
     cl_env.queue.finish()
+    print len(np.where(np.abs(data_out-data_res)>0.0001)[0])
     assert np.allclose(data_out, data_res)
 
     init_transpose_xz_z(data_gpu_out, data_gpu_out2)
-- 
GitLab