From 980929391a82b2c0bc53a56a9cfe78e34d79c3f3 Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr>
Date: Fri, 17 Apr 2015 08:32:38 +0200
Subject: [PATCH] Fix transpositions. Improve gpu transpositions test

---
 .../hysop/gpu/cl_src/kernels/transpose_xy.cl  |  78 +--
 .../gpu/cl_src/kernels/transpose_xy_noVec.cl  |  67 ++-
 .../hysop/gpu/cl_src/kernels/transpose_xz.cl  |  83 ++-
 .../gpu/cl_src/kernels/transpose_xz_noVec.cl  |  75 ++-
 .../gpu/cl_src/kernels/transpose_xz_slice.cl  |  66 +-
 .../kernels/transpose_xz_slice_noVec.cl       |  58 +-
 HySoP/hysop/gpu/tests/test_transposition.py   | 562 ++++++++++--------
 7 files changed, 521 insertions(+), 468 deletions(-)

diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
index c430da22e..bdba54042 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl
@@ -40,60 +40,60 @@ __kernel void transpose_xy(__global const float* in,
   uint lid_x = get_local_id(0);
   uint lid_y = get_local_id(1);
 
-  unint xIndex, yIndex, zIndex;
+  uint xIndex, yIndex, zIndex;
   uint index_in, index_out;
   uint gidI, gidII, i;
 
   __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
 
-#ifdef NB_Z
-  for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) {
+#ifdef NB_III
+  for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2))
 #else
-  zIndex=get_global_id(2); {
+    zIndex=get_global_id(2);
 #endif
-  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
-  for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
+  {
+    for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+      for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
 
-  /* Use of diagonal coordinates */
+	/* Use of diagonal coordinates */
 #if NB_II == NB_I
-  group_id_x = (gidI + gidII) % NB_GROUPS_I;
-  group_id_y = gidI;
+	group_id_x = (gidI + gidII) % NB_GROUPS_I;
+	group_id_y = gidI;
 #else
-  uint bid = gidI + gidII * NB_GROUPS_I;
-  group_id_y = bid%NB_GROUPS_II;
-  group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
+	uint bid = gidI + gidII * NB_GROUPS_I;
+	group_id_y = bid%NB_GROUPS_II;
+	group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
 #endif
 
-  /* Global input index for work-item */
-  xIndex = group_id_x * TILE_DIM_XY + lid_x;
-  yIndex = group_id_y * TILE_DIM_XY + lid_y;
-  index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
+	/* Global input index for work-item */
+	xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__;
+	yIndex = group_id_y * TILE_DIM_XY + lid_y;
+	//zIndex = get_global_id(2);
+	index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
 
-  /* Global output index */
-  xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__;
-  yIndex = group_id_x * TILE_DIM_XY + lid_y;
-  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+	/* Global output index */
+	xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__;
+	yIndex = group_id_x * TILE_DIM_XY + lid_y;
+	index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
-    {
-      /* Fill the tile */
-      temp = vload__N__((index_in + i * NB_II)/__N__, in);
-      tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
-    }
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+	for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
+	  /* Fill the tile */
+	  temp = vload__N__((index_in + i * NB_II)/__N__, in);
+	  tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
+	}
+
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
-    {
-      /* Write transposed data */
-      temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i],
-			  );
-      vstore__N__(temp, (index_out + i*NB_I)/__N__, out);
+	for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
+	  /* Write transposed data */
+	  temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i],
+			      );
+	  vstore__N__(temp, (index_out + i*NB_I)/__N__, out);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+      }
     }
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
-}
-}
-}
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
index dde7ba936..083d86eb2 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl
@@ -46,49 +46,50 @@ __kernel void transpose_xy(__global const float* in,
   __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
 
 #ifdef NB_Z
-  for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) {
+  for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2))
 #else
-  zIndex=get_global_id(2); {
+  zIndex=get_global_id(2);
 #endif
-  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
-  for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
+  {
+    for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+      for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
 
-  /* Use of diagonal coordinates */
+	/* Use of diagonal coordinates */
 #if NB_II == NB_I
-  group_id_x = (gidI + gidII) % NB_GROUPS_I;
-  group_id_y = gidI;
+	group_id_x = (gidI + gidII) % NB_GROUPS_I;
+	group_id_y = gidI;
 #else
-  uint bid = gidI + gidII * NB_GROUPS_I;
-  group_id_y = bid%NB_GROUPS_II;
-  group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
+	uint bid = gidI + gidII * NB_GROUPS_I;
+	group_id_y = bid%NB_GROUPS_II;
+	group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
 #endif
 
-  /* Global input index for work-item */
-  xIndex = group_id_x * TILE_DIM_XY + lid_x;
-  yIndex = group_id_y * TILE_DIM_XY + lid_y;
-  index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
+	/* Global input index for work-item */
+	xIndex = group_id_x * TILE_DIM_XY + lid_x;
+	yIndex = group_id_y * TILE_DIM_XY + lid_y;
+	index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
 
-  /* Global output index */
-  xIndex = group_id_y * TILE_DIM_XY + lid_x;
-  yIndex = group_id_x * TILE_DIM_XY + lid_y;
-  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+	/* Global output index */
+	xIndex = group_id_y * TILE_DIM_XY + lid_x;
+	yIndex = group_id_x * TILE_DIM_XY + lid_y;
+	index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
-    /* Fill the tile */
-    tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
-  }
+	for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
+	  /* Fill the tile */
+	  tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
+	}
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
-    /* Write transposed data */
-    out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
-  }
+	for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
+	  /* Write transposed data */
+	  out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
+	}
 
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
-}
-}
-}
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
+      }
+    }
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
index 294925000..b2197fbb1 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl
@@ -32,62 +32,59 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_y = get_local_id(1);
   uint lid_z = get_local_id(2);
 
-  /* Global input index for work-item */
   uint xIndex, yIndex, zIndex;
-  uint index_in, index_out;
-  uint gidI, gidIII, j;
+  uint index_in, index_out, i, j;
+  uint gidI, gidII, gidIII;
+
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
 
-  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
 
-  for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
   for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
-  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+    for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
+      for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
 
-  /* Use of diagonal coordinates */
+	/* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
-  group_id_z = gidI;
+	group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+	group_id_z = gidI;
 #else
-  uint bid = gidI + gidIII * NB_GROUPS_I;
-  group_id_z = bid%NB_GROUPS_III;
-  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
+	uint bid = gidI + gidIII * NB_GROUPS_I;
+	group_id_z = bid%NB_GROUPS_III;
+	group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
-  /* Global input index for work-item */
-  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
-  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+	/* Global input index for work-item */
+	xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
+	yIndex = gidII * TILE_DIM_XZ + lid_y;
+	zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+	index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
-  /* Global output index */
-  xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
-  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+	/* Global output index */
+	xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
+	zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+	index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
-	{
-	  /* Fill the tile */
-	  temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
-	  tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
-	}
 
-    }
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
+	    /* Fill the tile */
+	    temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
+	    tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
+	  }
+
+	}
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
-	{
-	  /* Write transposed data */
-	  temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j],
-			      );
-	  vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out);
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
+	    /* Write transposed data */
+	    temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j],
+				);
+	    vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out);
+	  }
 	}
+      }
     }
-  barrier(CLK_LOCAL_MEM_FENCE);
-}
-}
-}
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
index 37c811cf2..475bc9aae 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl
@@ -31,59 +31,54 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_y = get_local_id(1);
   uint lid_z = get_local_id(2);
 
-  /* Global input index for work-item */
   uint xIndex, yIndex, zIndex;
-  uint index_in, index_out;
-  uint gidI, gidIII, j;
+  uint index_in, index_out, i, j;
+  uint gidI, gidII, gidIII;
 
-  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
+  __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
 
-  for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
   for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
-  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+    for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
+      for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
 
-  /* Use of diagonal coordinates */
+	/* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
-  group_id_z = gidI;
+	group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+	group_id_z = gidI;
 #else
-  uint bid = gidI + gidIII * NB_GROUPS_I;
-  group_id_z = bid%NB_GROUPS_III;
-  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
+	uint bid = gidI + gidIII * NB_GROUPS_I;
+	group_id_z = bid%NB_GROUPS_III;
+	group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
-  /* Global input index for work-item */
-  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
-  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+	/* Global input index for work-item */
+	xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+	yIndex = gidII * TILE_DIM_XZ + lid_y;
+	zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+	index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
-  /* Global output index */
-  xIndex = group_id_z * TILE_DIM_XZ + lid_x;
-  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+	/* Global output index */
+	xIndex = group_id_z * TILE_DIM_XZ + lid_x;
+	zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+	index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
-	{
-	  /* Fill the tile */
-	  tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II];
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
+	    /* Fill the tile */
+	    tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II];
+	  }
 	}
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
 
-    }
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
-	{
-	  /* Write transposed data */
-	  out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j];
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
+	    /* Write transposed data */
+	    out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j];
+	  }
 	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+      }
     }
-  barrier(CLK_LOCAL_MEM_FENCE);
-}
-}
-}
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
index 91475e45d..ec394f6cb 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl
@@ -31,7 +31,6 @@ __kernel void transpose_xz(__global const float* in,
   uint lid_x = get_local_id(0);
   uint lid_z = get_local_id(2);
 
-  /* Global input index for work-item */
   uint xIndex, yIndex, zIndex;
   uint index_in, index_out;
   uint gidI, gidIII, j;
@@ -39,48 +38,45 @@ __kernel void transpose_xz(__global const float* in,
   __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
 
   for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
-  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
-  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+    for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+      for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
 
-  /* Use of diagonal coordinates */
+	/* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
-  group_id_z = gidI;
+	group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+	group_id_z = gidI;
 #else
-  uint bid = gidI + gidIII * NB_GROUPS_I;
-  group_id_z = bid%NB_GROUPS_III;
-  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
+	uint bid = gidI + gidIII * NB_GROUPS_I;
+	group_id_z = bid%NB_GROUPS_III;
+	group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
-  /* Global input index for work-item */
-  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
-  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+	/* Global input index for work-item */
+	xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
+	zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+	index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
-  /* Global output index */
-  xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
-  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+	/* Global output index */
+	xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
+	zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+	index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      /* Fill the tile */
-      temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in);
-      tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__;
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  /* Fill the tile */
+	  temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in);
+	  tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__;
 
-    }
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
-    {
-      /* Write transposed data */
-      temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j],
-			  );
-      vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out);
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  /* Write transposed data */
+	  temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j],
+			      );
+	  vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out);
+	}
+      }
     }
-  barrier(CLK_LOCAL_MEM_FENCE);
-}
-}
-}
+  }
 }
diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
index 9325ccfff..d97cb925e 100644
--- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
+++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl
@@ -38,42 +38,42 @@ __kernel void transpose_xz(__global const float* in,
   __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
 
   for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
-  for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
-  for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
+    for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
+      for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
 
-  /* Use of diagonal coordinates */
+	/* Use of diagonal coordinates */
 #if NB_III == NB_I
-  group_id_x = (gidI + gidIII) % NB_GROUPS_I;
-  group_id_z = gidI;
+	group_id_x = (gidI + gidIII) % NB_GROUPS_I;
+	group_id_z = gidI;
 #else
-  uint bid = gidI + gidIII * NB_GROUPS_I;
-  group_id_z = bid%NB_GROUPS_III;
-  group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
+	uint bid = gidI + gidIII * NB_GROUPS_I;
+	group_id_z = bid%NB_GROUPS_III;
+	group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
 #endif
 
-  xIndex = group_id_x * TILE_DIM_XZ + lid_x;
-  zIndex = group_id_z * TILE_DIM_XZ + lid_z;
-  index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
+	xIndex = group_id_x * TILE_DIM_XZ + lid_x;
+	zIndex = group_id_z * TILE_DIM_XZ + lid_z;
+	index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
 
-  /* Global output index */
-  xIndex = group_id_z * TILE_DIM_XZ + lid_x;
-  zIndex = group_id_x * TILE_DIM_XZ + lid_z;
-  index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
+	/* Global output index */
+	xIndex = group_id_z * TILE_DIM_XZ + lid_x;
+	zIndex = group_id_x * TILE_DIM_XZ + lid_z;
+	index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
 
-  for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
-    /* Fill the tile */
-    tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
-  }
-  /* Synchronize work-group */
-  barrier(CLK_LOCAL_MEM_FENCE);
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  /* Fill the tile */
+	  tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
+	}
+	/* Synchronize work-group */
+	barrier(CLK_LOCAL_MEM_FENCE);
 
-  for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
-    /* Write transposed data */
-    out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
-    tile[lid_x][lid_z + j] = 0.0;
+	for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
+	  /* Write transposed data */
+	  out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
+	  tile[lid_x][lid_z + j] = 0.0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+      }
+    }
   }
-  barrier(CLK_LOCAL_MEM_FENCE);
-}
-}
-}
 }
diff --git a/HySoP/hysop/gpu/tests/test_transposition.py b/HySoP/hysop/gpu/tests/test_transposition.py
index ddb81e299..39862afed 100644
--- a/HySoP/hysop/gpu/tests/test_transposition.py
+++ b/HySoP/hysop/gpu/tests/test_transposition.py
@@ -8,27 +8,15 @@ from hysop.gpu.tools import get_opencl_environment
 from hysop.gpu.gpu_kernel import KernelLauncher
 import hysop.tools.numpywrappers as npw
 
+cl_env = get_opencl_environment()
 
-def test_transposition_xy2D():
-    resolution = (256, 256)
-    cl_env = get_opencl_environment()
-    vec = 4
-    src_transpose_xy = 'kernels/transpose_xy.cl'
-    build_options = ""
-    build_options += " -D NB_I=256 -D NB_II=256"
-    build_options += " -D PADDING_XY=1"
-    build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
-    gwi = (int(resolution[0] / 4), int(resolution[1]) / 4)
-    lwi = (8, 8)
-
-    # Build code
-    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
-    init_transpose_xy = KernelLauncher(
-        prg.transpose_xy, cl_env.queue, gwi, lwi)
+def _comparison(resolution, resolutionT,
+                transpose_f, transpose_b,
+                gwi, lwi, axe=1):
 
     data_in = npw.asrealarray(np.random.random(resolution))
-    data_out = np.empty_like(data_in)
-    data_out2 = np.empty_like(data_in)
+    data_out = npw.realempty(resolutionT)
+    data_out2 = npw.realempty(resolution)
     data_gpu_in = cl.Buffer(cl_env.ctx,
                             cl.mem_flags.READ_WRITE,
                             size=data_in.nbytes)
@@ -37,19 +25,21 @@ def test_transposition_xy2D():
                              size=data_out.nbytes)
     data_gpu_out2 = cl.Buffer(cl_env.ctx,
                               cl.mem_flags.READ_WRITE,
-                              size=data_out.nbytes)
+                              size=data_out2.nbytes)
     cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
     cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
     cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
     cl_env.queue.finish()
 
-    init_transpose_xy(data_gpu_in, data_gpu_out)
+    # gpu_out <- gpu_in.T
+    transpose_f(data_gpu_in, data_gpu_out)
     cl_env.queue.finish()
     cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
     cl_env.queue.finish()
-    assert np.allclose(data_out, data_in.T)
+    assert np.allclose(data_out, data_in.swapaxes(0, axe))
 
-    init_transpose_xy(data_gpu_out, data_gpu_out2)
+    # gpu_in <- gpu_out.T ( = gpu_in.T.T = gpu_in)
+    transpose_b(data_gpu_out, data_gpu_out2)
     cl_env.queue.finish()
     cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
     cl_env.queue.finish()
@@ -60,10 +50,54 @@ def test_transposition_xy2D():
     data_gpu_out2.release()
 
 
+def test_transposition_xy2D():
+    resolution = (256, 256)
+    cl_env = get_opencl_environment()
+    vec = 4
+    src_transpose_xy = 'kernels/transpose_xy.cl'
+    build_options = ""
+    build_options += " -D NB_I=256 -D NB_II=256"
+    build_options += " -D PADDING_XY=1"
+    build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
+    gwi = (int(resolution[0] / 4), int(resolution[1]) / 4)
+    lwi = (8, 8)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 4) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+
+    # Build code
+    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
+    init_transpose_xy = KernelLauncher(
+        prg.transpose_xy, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xy, init_transpose_xy,
+                gwi, lwi)
+
+
+def test_transposition_xy2D_noVec():
+    resolution = (256, 256)
+    cl_env = get_opencl_environment()
+    src_transpose_xy = 'kernels/transpose_xy_noVec.cl'
+    build_options = ""
+    build_options += " -D NB_I=256 -D NB_II=256"
+    build_options += " -D PADDING_XY=1"
+    build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
+    gwi = (int(resolution[0]), int(resolution[1]) / 4)
+    lwi = (32, 8)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+
+    # Build code
+    prg = cl_env.build_src(src_transpose_xy, build_options)
+    init_transpose_xy = KernelLauncher(
+        prg.transpose_xy, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xy, init_transpose_xy,
+                gwi, lwi)
+
+
 def test_transposition_xy2D_rect():
     resolution = (512, 256)
     resolutionT = (256, 512)
-    cl_env = get_opencl_environment()
     vec = 4
     src_transpose_xy = 'kernels/transpose_xy.cl'
     build_options = ""
@@ -74,6 +108,8 @@ def test_transposition_xy2D_rect():
     gwi = (int(resolution[0] / 4),
            int(resolution[1]) / 4)
     lwi = (8, 8)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 4) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
     prg = cl_env.build_src(src_transpose_xy, build_options, vec)
     init_transpose_xy_x = KernelLauncher(prg.transpose_xy,
                                          cl_env.queue,
@@ -85,46 +121,53 @@ def test_transposition_xy2D_rect():
     gwi = (int(resolution[1] / 4),
            int(resolution[0]) / 4)
     lwi = (8, 8)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[1] / 4) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 4) / lwi[1])
     prg = cl_env.build_src(src_transpose_xy, build_options, vec)
     init_transpose_xy_y = KernelLauncher(prg.transpose_xy,
                                          cl_env.queue,
                                          gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xy_x, init_transpose_xy_y,
+                gwi, lwi)
 
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_out = npw.realempty(resolutionT)
-    data_out2 = npw.realempty(resolution)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
 
-    # gpu_out <- gpu_in.T
-    init_transpose_xy_x(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-   # data_in_t = data_in.swapaxes(0, 1)
-    assert np.allclose(data_out, data_in.T)
-
-    # gpu_in <- gpu_out.T ( = gpu_in.T.T = gpu_in)
-    init_transpose_xy_y(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
+def test_transposition_xy2D_noVec_rect():
+    resolution = (512, 256)
+    resolutionT = (256, 512)
+    cl_env = get_opencl_environment()
+    vec = 4
+    src_transpose_xy = 'kernels/transpose_xy_noVec.cl'
+    build_options = ""
+    # Settings are taken from destination layout as current layout.
+    # gwi is computed form input layout (appears as transposed layout)
+    build_options += " -D NB_I=256 -D NB_II=512"
+    build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1"
+    gwi = (int(resolution[0]),
+           int(resolution[1]) / 4)
+    lwi = (32, 8)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
+    init_transpose_xy_x = KernelLauncher(prg.transpose_xy,
+                                         cl_env.queue,
+                                         gwi, lwi)
 
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+    build_options = ""
+    build_options += " -D NB_I=512 -D NB_II=256"
+    build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1"
+    gwi = (int(resolution[1]),
+           int(resolution[0]) / 4)
+    lwi = (32, 8)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[1]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 4) / lwi[1])
+    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
+    init_transpose_xy_y = KernelLauncher(prg.transpose_xy,
+                                         cl_env.queue,
+                                         gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xy_x, init_transpose_xy_y,
+                gwi, lwi)
 
 
 def test_transposition_xy3D():
@@ -139,42 +182,36 @@ def test_transposition_xy3D():
            int(resolution[1] / 2),
            int(resolution[2]))
     lwi = (8, 8, 1)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1])
     prg = cl_env.build_src(src_transpose_xy, build_options, vec)
     init_transpose_xy = KernelLauncher(
         prg.transpose_xy, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xy, init_transpose_xy,
+                gwi, lwi)
 
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_out = npw.empty_like(data_in)
-    data_out2 = npw.empty_like(data_in)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
-
-    init_transpose_xy(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-    assert np.allclose(data_out, data_in.swapaxes(0, 1))
 
-    init_transpose_xy(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
-
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+def test_transposition_xy3D_noVec():
+    resolution = (32, 32, 32)
+    cl_env = get_opencl_environment()
+    vec = 2
+    src_transpose_xy = 'kernels/transpose_xy_noVec.cl'
+    build_options = ""
+    build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=32"
+    build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1"
+    gwi = (int(resolution[0]),
+           int(resolution[1] / 2),
+           int(resolution[2]))
+    lwi = (16, 8, 1)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1])
+    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
+    init_transpose_xy = KernelLauncher(
+        prg.transpose_xy, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xy, init_transpose_xy,
+                gwi, lwi)
 
 
 def test_transposition_xy3D_rect():
@@ -192,6 +229,8 @@ def test_transposition_xy3D_rect():
            int(resolution[1] / 2),
            int(resolution[2]))
     lwi = (8, 8, 1)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1])
     prg = cl_env.build_src(src_transpose_xy, build_options, vec)
     init_transpose_xy_x = KernelLauncher(
         prg.transpose_xy, cl_env.queue, gwi, lwi)
@@ -203,46 +242,79 @@ def test_transposition_xy3D_rect():
            int(resolution[0] / 2),
            int(resolution[2]))
     lwi = (8, 8, 1)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[1] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 2) / lwi[1])
     prg = cl_env.build_src(src_transpose_xy, build_options, vec)
     init_transpose_xy_y = KernelLauncher(
         prg.transpose_xy, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xy_x, init_transpose_xy_y,
+                gwi, lwi)
 
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_out = npw.realempty(resolutionT)
-    data_out2 = npw.realempty(resolution)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
-
-    init_transpose_xy_x(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-    assert np.allclose(data_out, data_in.swapaxes(0, 1))
 
-    data_out = np.empty_like(data_in)
-    init_transpose_xy_y(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
+def test_transposition_xy3D_noVec_rect():
+    resolution = (32, 64, 32)
+    resolutionT = (64, 32, 32)
+    cl_env = get_opencl_environment()
+    vec = 2
+    src_transpose_xy = 'kernels/transpose_xy_noVec.cl'
+    build_options = ""
+    # Settings are taken from destination layout as current layout.
+    # gwi is computed form input layout (appears as transposed layout)
+    build_options += " -D NB_I=64 -D NB_II=32 -D NB_III=32"
+    build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1"
+    gwi = (int(resolution[0]),
+           int(resolution[1] / 2),
+           int(resolution[2]))
+    lwi = (16, 8, 1)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1])
+    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
+    init_transpose_xy_x = KernelLauncher(
+        prg.transpose_xy, cl_env.queue, gwi, lwi)
 
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+    build_options = ""
+    build_options += " -D NB_I=32 -D NB_II=64 -D NB_III=32"
+    build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1"
+    gwi = (int(resolution[1]),
+           int(resolution[0] / 2),
+           int(resolution[2]))
+    lwi = (16, 8, 1)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[1]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 2) / lwi[1])
+    prg = cl_env.build_src(src_transpose_xy, build_options, vec)
+    init_transpose_xy_y = KernelLauncher(
+        prg.transpose_xy, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xy_x, init_transpose_xy_y,
+                gwi, lwi)
 
 
 def test_transposition_xz3D():
+    resolution = (32, 32, 32)
+    cl_env = get_opencl_environment()
+    vec = 2
+    src_transpose_xz = 'kernels/transpose_xz.cl'
+    build_options = ""
+    build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=32"
+    build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
+    build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1"
+    gwi = (int((resolution[0] / 2)),
+           int(resolution[1] / 4),
+           int(resolution[2] / 4))
+    lwi = (8, 4, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
+    prg = cl_env.build_src(src_transpose_xz, build_options, vec)
+    init_transpose_xz = KernelLauncher(
+        prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xz, init_transpose_xz,
+                gwi, lwi, axe=2)
+
+
+def test_transposition_xz3D_noVec():
     resolution = (32, 32, 32)
     cl_env = get_opencl_environment()
     vec = 1
@@ -255,46 +327,62 @@ def test_transposition_xz3D():
            int(resolution[1] / 4),
            int(resolution[2] / 4))
     lwi = (16, 4, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xz, init_transpose_xz,
+                gwi, lwi, axe=2)
 
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_out = np.empty_like(data_in)
-    data_out2 = np.empty_like(data_in)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
-
-    init_transpose_xz(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-    assert np.allclose(data_out, data_in.swapaxes(0, 2))
 
-    init_transpose_xz(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
+def test_transposition_xz3D_rect():
+    resolution = (32, 32, 64)
+    resolutionT = (64, 32, 32)
+    cl_env = get_opencl_environment()
+    vec = 2
+    src_transpose_xz = 'kernels/transpose_xz.cl'
+    build_options = ""
+    # Settings are taken from destination layout as current layout.
+    # gwi is computed form input layout (appears as transposed layout)
+    build_options += " -D NB_I=64 -D NB_II=32 -D NB_III=32"
+    build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
+    build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1"
+    gwi = (int((resolution[0] / 2)),
+           int(resolution[1] / 4),
+           int(resolution[2] / 4))
+    lwi = (8, 4, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
+    prg = cl_env.build_src(src_transpose_xz, build_options, vec)
+    init_transpose_xz_x = KernelLauncher(
+        prg.transpose_xz, cl_env.queue, gwi, lwi)
 
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+    build_options = ""
+    build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=64"
+    build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
+    build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1"
+    gwi = (int(resolution[2] / 2),
+           int(resolution[1] / 4),
+           int(resolution[0] / 4))
+    lwi = (8, 4, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[2] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2])
+    prg = cl_env.build_src(src_transpose_xz, build_options, vec)
+    init_transpose_xz_z = KernelLauncher(
+        prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xz_x, init_transpose_xz_z,
+                gwi, lwi, axe=2)
 
 
-def test_transposition_xz3D_rect():
+def test_transposition_xz3D_noVec_rect():
     resolution = (32, 32, 64)
+    resolutionT = (64, 32, 32)
     cl_env = get_opencl_environment()
     vec = 1
     src_transpose_xz = 'kernels/transpose_xz_noVec.cl'
@@ -308,6 +396,9 @@ def test_transposition_xz3D_rect():
            int(resolution[1] / 4),
            int(resolution[2] / 4))
     lwi = (16, 4, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz_x = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
@@ -320,46 +411,41 @@ def test_transposition_xz3D_rect():
            int(resolution[1] / 4),
            int(resolution[0] / 4))
     lwi = (16, 4, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[2]) / lwi[0])
+    build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2])
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz_z = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xz_x, init_transpose_xz_z,
+                gwi, lwi, axe=2)
 
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_res = data_in.copy().swapaxes(0, 2)
-    data_out = np.empty_like(data_res)
-    data_out2 = np.empty_like(data_in)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
-
-    init_transpose_xz_x(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-    assert np.allclose(data_out, data_res)
-
-    init_transpose_xz_z(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
 
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+def test_transposition_xz3Dslice():
+    resolution = (32, 32, 32)
+    cl_env = get_opencl_environment()
+    vec = 2
+    src_transpose_xz = 'kernels/transpose_xz_slice.cl'
+    build_options = ""
+    build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=32"
+    build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=1"
+    build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1"
+    gwi = (int(resolution[0] / 2),
+           int(resolution[1]),
+           int(resolution[2] / 4))
+    lwi = (8, 1, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
 
+    prg = cl_env.build_src(src_transpose_xz, build_options, vec)
+    init_transpose_xz = KernelLauncher(
+        prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xz, init_transpose_xz,
+                gwi, lwi, axe=2)
 
-def test_transposition_xz3Dslice():
+def test_transposition_xz3Dslice_noVec():
     resolution = (32, 32, 32)
     cl_env = get_opencl_environment()
     vec = 1
@@ -378,43 +464,53 @@ def test_transposition_xz3Dslice():
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolution,
+                init_transpose_xz, init_transpose_xz,
+                gwi, lwi, axe=2)
 
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_out = npw.empty_like(data_in)
-    data_out2 = npw.empty_like(data_in)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
-
-    init_transpose_xz(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-    assert np.allclose(data_out, data_in.swapaxes(0, 2))
 
-    init_transpose_xz(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
-
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+def test_transposition_xz3Dslice_rect():
+    resolution = (32, 32, 64)
+    resolutionT = (64, 32, 32)
+    cl_env = get_opencl_environment()
+    vec = 2
+    src_transpose_xz = 'kernels/transpose_xz_slice.cl'
+    build_options = ""
+    # Settings are taken from destination layout as current layout.
+    # gwi is computed form input layout (appears as transposed layout)
+    build_options += " -D NB_I=64 -D NB_II=32 -D NB_III=32"
+    build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=1"
+    build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1"
+    gwi = (int(resolution[0] / 2),
+           int(resolution[1]),
+           int(resolution[2] / 4))
+    lwi = (8, 1, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2])
+    prg = cl_env.build_src(src_transpose_xz, build_options, vec)
+    init_transpose_xz_x = KernelLauncher(
+        prg.transpose_xz, cl_env.queue, gwi, lwi)
 
+    build_options = ""
+    build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=64"
+    build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=1"
+    build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1"
+    gwi = (int(resolution[2] / 2),
+           int(resolution[1]),
+           int(resolution[0] / 4))
+    lwi = (8, 1, 4)
+    build_options += " -D NB_GROUPS_I=" + str((resolution[2] / 2) / lwi[0])
+    build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2])
+    prg = cl_env.build_src(src_transpose_xz, build_options, vec)
+    init_transpose_xz_z = KernelLauncher(
+        prg.transpose_xz, cl_env.queue, gwi, lwi)
+    _comparison(resolution, resolutionT,
+                init_transpose_xz_x, init_transpose_xz_z,
+                gwi, lwi, axe=2)
 
-def test_transposition_xz3Dslice_rect():
+def test_transposition_xz3Dslice_noVec_rect():
     resolution = (32, 32, 64)
+    resolutionT = (64, 32, 32)
     cl_env = get_opencl_environment()
     vec = 1
     src_transpose_xz = 'kernels/transpose_xz_slice_noVec.cl'
@@ -447,38 +543,6 @@ def test_transposition_xz3Dslice_rect():
     prg = cl_env.build_src(src_transpose_xz, build_options, vec)
     init_transpose_xz_z = KernelLauncher(
         prg.transpose_xz, cl_env.queue, gwi, lwi)
-
-    data_in = npw.asrealarray(np.random.random(resolution))
-    data_res = data_in.copy().swapaxes(0, 2)
-    data_out = np.empty_like(data_res)
-    data_out2 = np.empty_like(data_in)
-    data_gpu_in = cl.Buffer(cl_env.ctx,
-                            cl.mem_flags.READ_WRITE,
-                            size=data_in.nbytes)
-    data_gpu_out = cl.Buffer(cl_env.ctx,
-                             cl.mem_flags.READ_WRITE,
-                             size=data_out.nbytes)
-    data_gpu_out2 = cl.Buffer(cl_env.ctx,
-                              cl.mem_flags.READ_WRITE,
-                              size=data_out2.nbytes)
-    cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out)
-    cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2)
-    cl_env.queue.finish()
-
-    init_transpose_xz_x(data_gpu_in, data_gpu_out)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out)
-    cl_env.queue.finish()
-    print len(np.where(np.abs(data_out-data_res)>0.0001)[0])
-    assert np.allclose(data_out, data_res)
-
-    init_transpose_xz_z(data_gpu_out, data_gpu_out2)
-    cl_env.queue.finish()
-    cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2)
-    cl_env.queue.finish()
-    assert np.allclose(data_out2, data_in)
-
-    data_gpu_in.release()
-    data_gpu_out.release()
-    data_gpu_out2.release()
+    _comparison(resolution, resolutionT,
+                init_transpose_xz_x, init_transpose_xz_z,
+                gwi, lwi, axe=2)
-- 
GitLab