From 980929391a82b2c0bc53a56a9cfe78e34d79c3f3 Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr> Date: Fri, 17 Apr 2015 08:32:38 +0200 Subject: [PATCH] Fix transpositions. Improve gpu transpositions test --- .../hysop/gpu/cl_src/kernels/transpose_xy.cl | 78 +-- .../gpu/cl_src/kernels/transpose_xy_noVec.cl | 67 ++- .../hysop/gpu/cl_src/kernels/transpose_xz.cl | 83 ++- .../gpu/cl_src/kernels/transpose_xz_noVec.cl | 75 ++- .../gpu/cl_src/kernels/transpose_xz_slice.cl | 66 +- .../kernels/transpose_xz_slice_noVec.cl | 58 +- HySoP/hysop/gpu/tests/test_transposition.py | 562 ++++++++++-------- 7 files changed, 521 insertions(+), 468 deletions(-) diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl index c430da22e..bdba54042 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl @@ -40,60 +40,60 @@ __kernel void transpose_xy(__global const float* in, uint lid_x = get_local_id(0); uint lid_y = get_local_id(1); - unint xIndex, yIndex, zIndex; + uint xIndex, yIndex, zIndex; uint index_in, index_out; uint gidI, gidII, i; __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ -#ifdef NB_Z - for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) { +#ifdef NB_III + for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) #else - zIndex=get_global_id(2); { + zIndex=get_global_id(2); #endif - for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { - for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { + { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { - /* Use of diagonal coordinates */ + /* Use of diagonal coordinates */ #if NB_II == NB_I - group_id_x = (gidI + gidII) % NB_GROUPS_I; - group_id_y = gidI; + group_id_x = (gidI + gidII) % NB_GROUPS_I; + group_id_y = gidI; #else - uint bid = gidI + gidII * NB_GROUPS_I; - group_id_y = bid%NB_GROUPS_II; - group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I; + uint bid = gidI + gidII * NB_GROUPS_I; + group_id_y = bid%NB_GROUPS_II; + group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I; #endif - /* Global input index for work-item */ - xIndex = group_id_x * TILE_DIM_XY + lid_x; - yIndex = group_id_y * TILE_DIM_XY + lid_y; - index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; + /* Global input index for work-item */ + xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__; + yIndex = group_id_y * TILE_DIM_XY + lid_y; + //zIndex = get_global_id(2); + index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; - /* Global output index */ - xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__; - yIndex = group_id_x * TILE_DIM_XY + lid_y; - index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + /* Global output index */ + xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__; + yIndex = group_id_x * TILE_DIM_XY + lid_y; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) - { - /* Fill the tile */ - temp = vload__N__((index_in + i * NB_II)/__N__, in); - tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__; - } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { + /* Fill the tile */ + temp = vload__N__((index_in + i * NB_II)/__N__, in); + tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) - { - /* Write transposed data */ - temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i], - ); - vstore__N__(temp, (index_out + i*NB_I)/__N__, out); + for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { + /* Write transposed data */ + temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i], + ); + vstore__N__(temp, (index_out + i*NB_I)/__N__, out); + } + barrier(CLK_LOCAL_MEM_FENCE); + } } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); -} -} -} + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl index dde7ba936..083d86eb2 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl @@ -46,49 +46,50 @@ __kernel void transpose_xy(__global const float* in, __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ #ifdef NB_Z - for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) { + for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) #else - zIndex=get_global_id(2); { + zIndex=get_global_id(2); #endif - for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { - for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { + { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { - /* Use of diagonal coordinates */ + /* Use of diagonal coordinates */ #if NB_II == NB_I - group_id_x = (gidI + gidII) % NB_GROUPS_I; - group_id_y = gidI; + group_id_x = (gidI + gidII) % NB_GROUPS_I; + group_id_y = gidI; #else - uint bid = gidI + gidII * NB_GROUPS_I; - group_id_y = bid%NB_GROUPS_II; - group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I; + uint bid = gidI + gidII * NB_GROUPS_I; + group_id_y = bid%NB_GROUPS_II; + group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I; #endif - /* Global input index for work-item */ - xIndex = group_id_x * TILE_DIM_XY + lid_x; - yIndex = group_id_y * TILE_DIM_XY + lid_y; - index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; + /* Global input index for work-item */ + xIndex = group_id_x * TILE_DIM_XY + lid_x; + yIndex = group_id_y * TILE_DIM_XY + lid_y; + index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; - /* Global output index */ - xIndex = group_id_y * TILE_DIM_XY + lid_x; - yIndex = group_id_x * TILE_DIM_XY + lid_y; - index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + /* Global output index */ + xIndex = group_id_y * TILE_DIM_XY + lid_x; + yIndex = group_id_x * TILE_DIM_XY + lid_y; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { - /* Fill the tile */ - tile[lid_y + i][lid_x] = in[index_in + i * NB_II]; - } + for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { + /* Fill the tile */ + tile[lid_y + i][lid_x] = in[index_in + i * NB_II]; + } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { - /* Write transposed data */ - out[index_out + i*NB_I] = tile[lid_x][lid_y + i]; - } + for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { + /* Write transposed data */ + out[index_out + i*NB_I] = tile[lid_x][lid_y + i]; + } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); -} -} -} + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + } + } + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl index 294925000..b2197fbb1 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl @@ -32,62 +32,59 @@ __kernel void transpose_xz(__global const float* in, uint lid_y = get_local_id(1); uint lid_z = get_local_id(2); - /* Global input index for work-item */ uint xIndex, yIndex, zIndex; - uint index_in, index_out; - uint gidI, gidIII, j; + uint index_in, index_out, i, j; + uint gidI, gidII, gidIII; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ - __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ - for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { - for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { - /* Use of diagonal coordinates */ + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (gidI + gidIII) % NB_GROUPS_I; - group_id_z = gidI; + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = gidI + gidIII * NB_GROUPS_I; - group_id_z = bid%NB_GROUPS_III; - group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif - /* Global input index for work-item */ - xIndex = group_id_x * TILE_DIM_XZ + lid_x; - zIndex = group_id_z * TILE_DIM_XZ + lid_z; - index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + /* Global input index for work-item */ + xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__; + yIndex = gidII * TILE_DIM_XZ + lid_y; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; - /* Global output index */ - xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; - zIndex = group_id_x * TILE_DIM_XZ + lid_z; - index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) - { - /* Fill the tile */ - temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in); - tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__; - } - } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) { + /* Fill the tile */ + temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in); + tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__; + } + + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) - { - /* Write transposed data */ - temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j], - ); - vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out); + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) { + /* Write transposed data */ + temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j], + ); + vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out); + } } + } } - barrier(CLK_LOCAL_MEM_FENCE); -} -} -} + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl index 37c811cf2..475bc9aae 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl @@ -31,59 +31,54 @@ __kernel void transpose_xz(__global const float* in, uint lid_y = get_local_id(1); uint lid_z = get_local_id(2); - /* Global input index for work-item */ uint xIndex, yIndex, zIndex; - uint index_in, index_out; - uint gidI, gidIII, j; + uint index_in, index_out, i, j; + uint gidI, gidII, gidIII; - __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ - for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { - for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { - /* Use of diagonal coordinates */ + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (gidI + gidIII) % NB_GROUPS_I; - group_id_z = gidI; + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = gidI + gidIII * NB_GROUPS_I; - group_id_z = bid%NB_GROUPS_III; - group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif - /* Global input index for work-item */ - xIndex = group_id_x * TILE_DIM_XZ + lid_x; - zIndex = group_id_z * TILE_DIM_XZ + lid_z; - index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + /* Global input index for work-item */ + xIndex = group_id_x * TILE_DIM_XZ + lid_x; + yIndex = gidII * TILE_DIM_XZ + lid_y; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; - /* Global output index */ - xIndex = group_id_z * TILE_DIM_XZ + lid_x; - zIndex = group_id_x * TILE_DIM_XZ + lid_z; - index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) - { - /* Fill the tile */ - tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II]; + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) { + /* Fill the tile */ + tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II]; + } } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) - { - /* Write transposed data */ - out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j]; + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) { + /* Write transposed data */ + out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j]; + } } + barrier(CLK_LOCAL_MEM_FENCE); + } } - barrier(CLK_LOCAL_MEM_FENCE); -} -} -} + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl index 91475e45d..ec394f6cb 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl @@ -31,7 +31,6 @@ __kernel void transpose_xz(__global const float* in, uint lid_x = get_local_id(0); uint lid_z = get_local_id(2); - /* Global input index for work-item */ uint xIndex, yIndex, zIndex; uint index_in, index_out; uint gidI, gidIII, j; @@ -39,48 +38,45 @@ __kernel void transpose_xz(__global const float* in, __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { - for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { - for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { - /* Use of diagonal coordinates */ + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (gidI + gidIII) % NB_GROUPS_I; - group_id_z = gidI; + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = gidI + gidIII * NB_GROUPS_I; - group_id_z = bid%NB_GROUPS_III; - group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif - /* Global input index for work-item */ - xIndex = group_id_x * TILE_DIM_XZ + lid_x; - zIndex = group_id_z * TILE_DIM_XZ + lid_z; - index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + /* Global input index for work-item */ + xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; - /* Global output index */ - xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; - zIndex = group_id_x * TILE_DIM_XZ + lid_z; - index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - /* Fill the tile */ - temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in); - tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__; + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + /* Fill the tile */ + temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in); + tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__; - } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - /* Write transposed data */ - temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j], - ); - vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out); + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + /* Write transposed data */ + temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j], + ); + vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out); + } + } } - barrier(CLK_LOCAL_MEM_FENCE); -} -} -} + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl index 9325ccfff..d97cb925e 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl @@ -38,42 +38,42 @@ __kernel void transpose_xz(__global const float* in, __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { - for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { - for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { - /* Use of diagonal coordinates */ + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (gidI + gidIII) % NB_GROUPS_I; - group_id_z = gidI; + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = gidI + gidIII * NB_GROUPS_I; - group_id_z = bid%NB_GROUPS_III; - group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif - xIndex = group_id_x * TILE_DIM_XZ + lid_x; - zIndex = group_id_z * TILE_DIM_XZ + lid_z; - index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + xIndex = group_id_x * TILE_DIM_XZ + lid_x; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; - /* Global output index */ - xIndex = group_id_z * TILE_DIM_XZ + lid_x; - zIndex = group_id_x * TILE_DIM_XZ + lid_z; - index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { - /* Fill the tile */ - tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II]; - } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + /* Fill the tile */ + tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II]; + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { - /* Write transposed data */ - out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j]; - tile[lid_x][lid_z + j] = 0.0; + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + /* Write transposed data */ + out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j]; + tile[lid_x][lid_z + j] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } } - barrier(CLK_LOCAL_MEM_FENCE); -} -} -} } diff --git a/HySoP/hysop/gpu/tests/test_transposition.py b/HySoP/hysop/gpu/tests/test_transposition.py index ddb81e299..39862afed 100644 --- a/HySoP/hysop/gpu/tests/test_transposition.py +++ b/HySoP/hysop/gpu/tests/test_transposition.py @@ -8,27 +8,15 @@ from hysop.gpu.tools import get_opencl_environment from hysop.gpu.gpu_kernel import KernelLauncher import hysop.tools.numpywrappers as npw +cl_env = get_opencl_environment() -def test_transposition_xy2D(): - resolution = (256, 256) - cl_env = get_opencl_environment() - vec = 4 - src_transpose_xy = 'kernels/transpose_xy.cl' - build_options = "" - build_options += " -D NB_I=256 -D NB_II=256" - build_options += " -D PADDING_XY=1" - build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" - gwi = (int(resolution[0] / 4), int(resolution[1]) / 4) - lwi = (8, 8) - - # Build code - prg = cl_env.build_src(src_transpose_xy, build_options, vec) - init_transpose_xy = KernelLauncher( - prg.transpose_xy, cl_env.queue, gwi, lwi) +def _comparison(resolution, resolutionT, + transpose_f, transpose_b, + gwi, lwi, axe=1): data_in = npw.asrealarray(np.random.random(resolution)) - data_out = np.empty_like(data_in) - data_out2 = np.empty_like(data_in) + data_out = npw.realempty(resolutionT) + data_out2 = npw.realempty(resolution) data_gpu_in = cl.Buffer(cl_env.ctx, cl.mem_flags.READ_WRITE, size=data_in.nbytes) @@ -37,19 +25,21 @@ def test_transposition_xy2D(): size=data_out.nbytes) data_gpu_out2 = cl.Buffer(cl_env.ctx, cl.mem_flags.READ_WRITE, - size=data_out.nbytes) + size=data_out2.nbytes) cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) cl_env.queue.finish() - init_transpose_xy(data_gpu_in, data_gpu_out) + # gpu_out <- gpu_in.T + transpose_f(data_gpu_in, data_gpu_out) cl_env.queue.finish() cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) cl_env.queue.finish() - assert np.allclose(data_out, data_in.T) + assert np.allclose(data_out, data_in.swapaxes(0, axe)) - init_transpose_xy(data_gpu_out, data_gpu_out2) + # gpu_in <- gpu_out.T ( = gpu_in.T.T = gpu_in) + transpose_b(data_gpu_out, data_gpu_out2) cl_env.queue.finish() cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) cl_env.queue.finish() @@ -60,10 +50,54 @@ def test_transposition_xy2D(): data_gpu_out2.release() +def test_transposition_xy2D(): + resolution = (256, 256) + cl_env = get_opencl_environment() + vec = 4 + src_transpose_xy = 'kernels/transpose_xy.cl' + build_options = "" + build_options += " -D NB_I=256 -D NB_II=256" + build_options += " -D PADDING_XY=1" + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" + gwi = (int(resolution[0] / 4), int(resolution[1]) / 4) + lwi = (8, 8) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 4) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + + # Build code + prg = cl_env.build_src(src_transpose_xy, build_options, vec) + init_transpose_xy = KernelLauncher( + prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xy, init_transpose_xy, + gwi, lwi) + + +def test_transposition_xy2D_noVec(): + resolution = (256, 256) + cl_env = get_opencl_environment() + src_transpose_xy = 'kernels/transpose_xy_noVec.cl' + build_options = "" + build_options += " -D NB_I=256 -D NB_II=256" + build_options += " -D PADDING_XY=1" + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" + gwi = (int(resolution[0]), int(resolution[1]) / 4) + lwi = (32, 8) + build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + + # Build code + prg = cl_env.build_src(src_transpose_xy, build_options) + init_transpose_xy = KernelLauncher( + prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xy, init_transpose_xy, + gwi, lwi) + + def test_transposition_xy2D_rect(): resolution = (512, 256) resolutionT = (256, 512) - cl_env = get_opencl_environment() vec = 4 src_transpose_xy = 'kernels/transpose_xy.cl' build_options = "" @@ -74,6 +108,8 @@ def test_transposition_xy2D_rect(): gwi = (int(resolution[0] / 4), int(resolution[1]) / 4) lwi = (8, 8) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 4) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) prg = cl_env.build_src(src_transpose_xy, build_options, vec) init_transpose_xy_x = KernelLauncher(prg.transpose_xy, cl_env.queue, @@ -85,46 +121,53 @@ def test_transposition_xy2D_rect(): gwi = (int(resolution[1] / 4), int(resolution[0]) / 4) lwi = (8, 8) + build_options += " -D NB_GROUPS_I=" + str((resolution[1] / 4) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 4) / lwi[1]) prg = cl_env.build_src(src_transpose_xy, build_options, vec) init_transpose_xy_y = KernelLauncher(prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xy_x, init_transpose_xy_y, + gwi, lwi) - data_in = npw.asrealarray(np.random.random(resolution)) - data_out = npw.realempty(resolutionT) - data_out2 = npw.realempty(resolution) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - # gpu_out <- gpu_in.T - init_transpose_xy_x(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - # data_in_t = data_in.swapaxes(0, 1) - assert np.allclose(data_out, data_in.T) - - # gpu_in <- gpu_out.T ( = gpu_in.T.T = gpu_in) - init_transpose_xy_y(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) +def test_transposition_xy2D_noVec_rect(): + resolution = (512, 256) + resolutionT = (256, 512) + cl_env = get_opencl_environment() + vec = 4 + src_transpose_xy = 'kernels/transpose_xy_noVec.cl' + build_options = "" + # Settings are taken from destination layout as current layout. + # gwi is computed form input layout (appears as transposed layout) + build_options += " -D NB_I=256 -D NB_II=512" + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1" + gwi = (int(resolution[0]), + int(resolution[1]) / 4) + lwi = (32, 8) + build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + prg = cl_env.build_src(src_transpose_xy, build_options, vec) + init_transpose_xy_x = KernelLauncher(prg.transpose_xy, + cl_env.queue, + gwi, lwi) - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() + build_options = "" + build_options += " -D NB_I=512 -D NB_II=256" + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1" + gwi = (int(resolution[1]), + int(resolution[0]) / 4) + lwi = (32, 8) + build_options += " -D NB_GROUPS_I=" + str((resolution[1]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 4) / lwi[1]) + prg = cl_env.build_src(src_transpose_xy, build_options, vec) + init_transpose_xy_y = KernelLauncher(prg.transpose_xy, + cl_env.queue, + gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xy_x, init_transpose_xy_y, + gwi, lwi) def test_transposition_xy3D(): @@ -139,42 +182,36 @@ def test_transposition_xy3D(): int(resolution[1] / 2), int(resolution[2])) lwi = (8, 8, 1) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1]) prg = cl_env.build_src(src_transpose_xy, build_options, vec) init_transpose_xy = KernelLauncher( prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xy, init_transpose_xy, + gwi, lwi) - data_in = npw.asrealarray(np.random.random(resolution)) - data_out = npw.empty_like(data_in) - data_out2 = npw.empty_like(data_in) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - - init_transpose_xy(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - assert np.allclose(data_out, data_in.swapaxes(0, 1)) - init_transpose_xy(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) - - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() +def test_transposition_xy3D_noVec(): + resolution = (32, 32, 32) + cl_env = get_opencl_environment() + vec = 2 + src_transpose_xy = 'kernels/transpose_xy_noVec.cl' + build_options = "" + build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=32" + build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1" + gwi = (int(resolution[0]), + int(resolution[1] / 2), + int(resolution[2])) + lwi = (16, 8, 1) + build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1]) + prg = cl_env.build_src(src_transpose_xy, build_options, vec) + init_transpose_xy = KernelLauncher( + prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xy, init_transpose_xy, + gwi, lwi) def test_transposition_xy3D_rect(): @@ -192,6 +229,8 @@ def test_transposition_xy3D_rect(): int(resolution[1] / 2), int(resolution[2])) lwi = (8, 8, 1) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1]) prg = cl_env.build_src(src_transpose_xy, build_options, vec) init_transpose_xy_x = KernelLauncher( prg.transpose_xy, cl_env.queue, gwi, lwi) @@ -203,46 +242,79 @@ def test_transposition_xy3D_rect(): int(resolution[0] / 2), int(resolution[2])) lwi = (8, 8, 1) + build_options += " -D NB_GROUPS_I=" + str((resolution[1] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 2) / lwi[1]) prg = cl_env.build_src(src_transpose_xy, build_options, vec) init_transpose_xy_y = KernelLauncher( prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xy_x, init_transpose_xy_y, + gwi, lwi) - data_in = npw.asrealarray(np.random.random(resolution)) - data_out = npw.realempty(resolutionT) - data_out2 = npw.realempty(resolution) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - - init_transpose_xy_x(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - assert np.allclose(data_out, data_in.swapaxes(0, 1)) - data_out = np.empty_like(data_in) - init_transpose_xy_y(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) +def test_transposition_xy3D_noVec_rect(): + resolution = (32, 64, 32) + resolutionT = (64, 32, 32) + cl_env = get_opencl_environment() + vec = 2 + src_transpose_xy = 'kernels/transpose_xy_noVec.cl' + build_options = "" + # Settings are taken from destination layout as current layout. + # gwi is computed form input layout (appears as transposed layout) + build_options += " -D NB_I=64 -D NB_II=32 -D NB_III=32" + build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1" + gwi = (int(resolution[0]), + int(resolution[1] / 2), + int(resolution[2])) + lwi = (16, 8, 1) + build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 2) / lwi[1]) + prg = cl_env.build_src(src_transpose_xy, build_options, vec) + init_transpose_xy_x = KernelLauncher( + prg.transpose_xy, cl_env.queue, gwi, lwi) - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() + build_options = "" + build_options += " -D NB_I=32 -D NB_II=64 -D NB_III=32" + build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8 -D PADDING_XY=1" + gwi = (int(resolution[1]), + int(resolution[0] / 2), + int(resolution[2])) + lwi = (16, 8, 1) + build_options += " -D NB_GROUPS_I=" + str((resolution[1]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[0] / 2) / lwi[1]) + prg = cl_env.build_src(src_transpose_xy, build_options, vec) + init_transpose_xy_y = KernelLauncher( + prg.transpose_xy, cl_env.queue, gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xy_x, init_transpose_xy_y, + gwi, lwi) def test_transposition_xz3D(): + resolution = (32, 32, 32) + cl_env = get_opencl_environment() + vec = 2 + src_transpose_xz = 'kernels/transpose_xz.cl' + build_options = "" + build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=32" + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" + build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1" + gwi = (int((resolution[0] / 2)), + int(resolution[1] / 4), + int(resolution[2] / 4)) + lwi = (8, 4, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) + init_transpose_xz = KernelLauncher( + prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xz, init_transpose_xz, + gwi, lwi, axe=2) + + +def test_transposition_xz3D_noVec(): resolution = (32, 32, 32) cl_env = get_opencl_environment() vec = 1 @@ -255,46 +327,62 @@ def test_transposition_xz3D(): int(resolution[1] / 4), int(resolution[2] / 4)) lwi = (16, 4, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xz, init_transpose_xz, + gwi, lwi, axe=2) - data_in = npw.asrealarray(np.random.random(resolution)) - data_out = np.empty_like(data_in) - data_out2 = np.empty_like(data_in) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - - init_transpose_xz(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - assert np.allclose(data_out, data_in.swapaxes(0, 2)) - init_transpose_xz(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) +def test_transposition_xz3D_rect(): + resolution = (32, 32, 64) + resolutionT = (64, 32, 32) + cl_env = get_opencl_environment() + vec = 2 + src_transpose_xz = 'kernels/transpose_xz.cl' + build_options = "" + # Settings are taken from destination layout as current layout. + # gwi is computed form input layout (appears as transposed layout) + build_options += " -D NB_I=64 -D NB_II=32 -D NB_III=32" + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" + build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1" + gwi = (int((resolution[0] / 2)), + int(resolution[1] / 4), + int(resolution[2] / 4)) + lwi = (8, 4, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) + init_transpose_xz_x = KernelLauncher( + prg.transpose_xz, cl_env.queue, gwi, lwi) - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() + build_options = "" + build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=64" + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" + build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1" + gwi = (int(resolution[2] / 2), + int(resolution[1] / 4), + int(resolution[0] / 4)) + lwi = (8, 4, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[2] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) + init_transpose_xz_z = KernelLauncher( + prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xz_x, init_transpose_xz_z, + gwi, lwi, axe=2) -def test_transposition_xz3D_rect(): +def test_transposition_xz3D_noVec_rect(): resolution = (32, 32, 64) + resolutionT = (64, 32, 32) cl_env = get_opencl_environment() vec = 1 src_transpose_xz = 'kernels/transpose_xz_noVec.cl' @@ -308,6 +396,9 @@ def test_transposition_xz3D_rect(): int(resolution[1] / 4), int(resolution[2] / 4)) lwi = (16, 4, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[0]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz_x = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) @@ -320,46 +411,41 @@ def test_transposition_xz3D_rect(): int(resolution[1] / 4), int(resolution[0] / 4)) lwi = (16, 4, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[2]) / lwi[0]) + build_options += " -D NB_GROUPS_II=" + str((resolution[1] / 4) / lwi[1]) + build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2]) prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz_z = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xz_x, init_transpose_xz_z, + gwi, lwi, axe=2) - data_in = npw.asrealarray(np.random.random(resolution)) - data_res = data_in.copy().swapaxes(0, 2) - data_out = np.empty_like(data_res) - data_out2 = np.empty_like(data_in) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - - init_transpose_xz_x(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - assert np.allclose(data_out, data_res) - - init_transpose_xz_z(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() +def test_transposition_xz3Dslice(): + resolution = (32, 32, 32) + cl_env = get_opencl_environment() + vec = 2 + src_transpose_xz = 'kernels/transpose_xz_slice.cl' + build_options = "" + build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=32" + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=1" + build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1" + gwi = (int(resolution[0] / 2), + int(resolution[1]), + int(resolution[2] / 4)) + lwi = (8, 1, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) + init_transpose_xz = KernelLauncher( + prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xz, init_transpose_xz, + gwi, lwi, axe=2) -def test_transposition_xz3Dslice(): +def test_transposition_xz3Dslice_noVec(): resolution = (32, 32, 32) cl_env = get_opencl_environment() vec = 1 @@ -378,43 +464,53 @@ def test_transposition_xz3Dslice(): prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolution, + init_transpose_xz, init_transpose_xz, + gwi, lwi, axe=2) - data_in = npw.asrealarray(np.random.random(resolution)) - data_out = npw.empty_like(data_in) - data_out2 = npw.empty_like(data_in) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - - init_transpose_xz(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - assert np.allclose(data_out, data_in.swapaxes(0, 2)) - init_transpose_xz(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) - - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() +def test_transposition_xz3Dslice_rect(): + resolution = (32, 32, 64) + resolutionT = (64, 32, 32) + cl_env = get_opencl_environment() + vec = 2 + src_transpose_xz = 'kernels/transpose_xz_slice.cl' + build_options = "" + # Settings are taken from destination layout as current layout. + # gwi is computed form input layout (appears as transposed layout) + build_options += " -D NB_I=64 -D NB_II=32 -D NB_III=32" + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=1" + build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1" + gwi = (int(resolution[0] / 2), + int(resolution[1]), + int(resolution[2] / 4)) + lwi = (8, 1, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[0] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) + init_transpose_xz_x = KernelLauncher( + prg.transpose_xz, cl_env.queue, gwi, lwi) + build_options = "" + build_options += " -D NB_I=32 -D NB_II=32 -D NB_III=64" + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=1" + build_options += " -D BLOCK_DEPH_XZ=4 -D PADDING_XZ=1" + gwi = (int(resolution[2] / 2), + int(resolution[1]), + int(resolution[0] / 4)) + lwi = (8, 1, 4) + build_options += " -D NB_GROUPS_I=" + str((resolution[2] / 2) / lwi[0]) + build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) + init_transpose_xz_z = KernelLauncher( + prg.transpose_xz, cl_env.queue, gwi, lwi) + _comparison(resolution, resolutionT, + init_transpose_xz_x, init_transpose_xz_z, + gwi, lwi, axe=2) -def test_transposition_xz3Dslice_rect(): +def test_transposition_xz3Dslice_noVec_rect(): resolution = (32, 32, 64) + resolutionT = (64, 32, 32) cl_env = get_opencl_environment() vec = 1 src_transpose_xz = 'kernels/transpose_xz_slice_noVec.cl' @@ -447,38 +543,6 @@ def test_transposition_xz3Dslice_rect(): prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz_z = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) - - data_in = npw.asrealarray(np.random.random(resolution)) - data_res = data_in.copy().swapaxes(0, 2) - data_out = np.empty_like(data_res) - data_out2 = np.empty_like(data_in) - data_gpu_in = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_in.nbytes) - data_gpu_out = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out.nbytes) - data_gpu_out2 = cl.Buffer(cl_env.ctx, - cl.mem_flags.READ_WRITE, - size=data_out2.nbytes) - cl.enqueue_copy(cl_env.queue, data_gpu_in, data_in) - cl.enqueue_copy(cl_env.queue, data_gpu_out, data_out) - cl.enqueue_copy(cl_env.queue, data_gpu_out2, data_out2) - cl_env.queue.finish() - - init_transpose_xz_x(data_gpu_in, data_gpu_out) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) - cl_env.queue.finish() - print len(np.where(np.abs(data_out-data_res)>0.0001)[0]) - assert np.allclose(data_out, data_res) - - init_transpose_xz_z(data_gpu_out, data_gpu_out2) - cl_env.queue.finish() - cl.enqueue_copy(cl_env.queue, data_out2, data_gpu_out2) - cl_env.queue.finish() - assert np.allclose(data_out2, data_in) - - data_gpu_in.release() - data_gpu_out.release() - data_gpu_out2.release() + _comparison(resolution, resolutionT, + init_transpose_xz_x, init_transpose_xz_z, + gwi, lwi, axe=2) -- GitLab