Skip to content
Snippets Groups Projects
Commit 98092939 authored by Jean-Matthieu Etancelin's avatar Jean-Matthieu Etancelin
Browse files

Fix transpositions. Improve gpu transpositions test

parent ef2fa761
No related branches found
No related tags found
No related merge requests found
......@@ -40,60 +40,60 @@ __kernel void transpose_xy(__global const float* in,
uint lid_x = get_local_id(0);
uint lid_y = get_local_id(1);
unint xIndex, yIndex, zIndex;
uint xIndex, yIndex, zIndex;
uint index_in, index_out;
uint gidI, gidII, i;
__local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
#ifdef NB_Z
for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) {
#ifdef NB_III
for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2))
#else
zIndex=get_global_id(2); {
zIndex=get_global_id(2);
#endif
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
{
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
/* Use of diagonal coordinates */
/* Use of diagonal coordinates */
#if NB_II == NB_I
group_id_x = (gidI + gidII) % NB_GROUPS_I;
group_id_y = gidI;
group_id_x = (gidI + gidII) % NB_GROUPS_I;
group_id_y = gidI;
#else
uint bid = gidI + gidII * NB_GROUPS_I;
group_id_y = bid%NB_GROUPS_II;
group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
uint bid = gidI + gidII * NB_GROUPS_I;
group_id_y = bid%NB_GROUPS_II;
group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
#endif
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XY + lid_x;
yIndex = group_id_y * TILE_DIM_XY + lid_y;
index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__;
yIndex = group_id_y * TILE_DIM_XY + lid_y;
//zIndex = get_global_id(2);
index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
/* Global output index */
xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__;
yIndex = group_id_x * TILE_DIM_XY + lid_y;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
/* Global output index */
xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__;
yIndex = group_id_x * TILE_DIM_XY + lid_y;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
{
/* Fill the tile */
temp = vload__N__((index_in + i * NB_II)/__N__, in);
tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
/* Fill the tile */
temp = vload__N__((index_in + i * NB_II)/__N__, in);
tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY)
{
/* Write transposed data */
temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i],
);
vstore__N__(temp, (index_out + i*NB_I)/__N__, out);
for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
/* Write transposed data */
temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i],
);
vstore__N__(temp, (index_out + i*NB_I)/__N__, out);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
}
......@@ -46,49 +46,50 @@ __kernel void transpose_xy(__global const float* in,
__local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */
#ifdef NB_Z
for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) {
for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2))
#else
zIndex=get_global_id(2); {
zIndex=get_global_id(2);
#endif
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
{
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
/* Use of diagonal coordinates */
/* Use of diagonal coordinates */
#if NB_II == NB_I
group_id_x = (gidI + gidII) % NB_GROUPS_I;
group_id_y = gidI;
group_id_x = (gidI + gidII) % NB_GROUPS_I;
group_id_y = gidI;
#else
uint bid = gidI + gidII * NB_GROUPS_I;
group_id_y = bid%NB_GROUPS_II;
group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
uint bid = gidI + gidII * NB_GROUPS_I;
group_id_y = bid%NB_GROUPS_II;
group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I;
#endif
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XY + lid_x;
yIndex = group_id_y * TILE_DIM_XY + lid_y;
index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XY + lid_x;
yIndex = group_id_y * TILE_DIM_XY + lid_y;
index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I;
/* Global output index */
xIndex = group_id_y * TILE_DIM_XY + lid_x;
yIndex = group_id_x * TILE_DIM_XY + lid_y;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
/* Global output index */
xIndex = group_id_y * TILE_DIM_XY + lid_x;
yIndex = group_id_x * TILE_DIM_XY + lid_y;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
/* Fill the tile */
tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
}
for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
/* Fill the tile */
tile[lid_y + i][lid_x] = in[index_in + i * NB_II];
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
/* Write transposed data */
out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
}
for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) {
/* Write transposed data */
out[index_out + i*NB_I] = tile[lid_x][lid_y + i];
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
......@@ -32,62 +32,59 @@ __kernel void transpose_xz(__global const float* in,
uint lid_y = get_local_id(1);
uint lid_z = get_local_id(2);
/* Global input index for work-item */
uint xIndex, yIndex, zIndex;
uint index_in, index_out;
uint gidI, gidIII, j;
uint index_in, index_out, i, j;
uint gidI, gidII, gidIII;
__local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
__local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
/* Use of diagonal coordinates */
/* Use of diagonal coordinates */
#if NB_III == NB_I
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
#else
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
#endif
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XZ + lid_x;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
yIndex = gidII * TILE_DIM_XZ + lid_y;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
{
for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
{
/* Fill the tile */
temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
}
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
/* Fill the tile */
temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
}
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
{
for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
{
/* Write transposed data */
temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j],
);
vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out);
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
/* Write transposed data */
temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j],
);
vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out);
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
}
......@@ -31,59 +31,54 @@ __kernel void transpose_xz(__global const float* in,
uint lid_y = get_local_id(1);
uint lid_z = get_local_id(2);
/* Global input index for work-item */
uint xIndex, yIndex, zIndex;
uint index_in, index_out;
uint gidI, gidIII, j;
uint index_in, index_out, i, j;
uint gidI, gidII, gidIII;
__local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
__local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
/* Use of diagonal coordinates */
/* Use of diagonal coordinates */
#if NB_III == NB_I
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
#else
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
#endif
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XZ + lid_x;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XZ + lid_x;
yIndex = gidII * TILE_DIM_XZ + lid_y;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
{
for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
{
/* Fill the tile */
tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II];
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
/* Fill the tile */
tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II];
}
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
{
for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ)
{
/* Write transposed data */
out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j];
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
for(i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) {
/* Write transposed data */
out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
}
......@@ -31,7 +31,6 @@ __kernel void transpose_xz(__global const float* in,
uint lid_x = get_local_id(0);
uint lid_z = get_local_id(2);
/* Global input index for work-item */
uint xIndex, yIndex, zIndex;
uint index_in, index_out;
uint gidI, gidIII, j;
......@@ -39,48 +38,45 @@ __kernel void transpose_xz(__global const float* in,
__local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
/* Use of diagonal coordinates */
/* Use of diagonal coordinates */
#if NB_III == NB_I
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
#else
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
#endif
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XZ + lid_x;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global input index for work-item */
xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
{
/* Fill the tile */
temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in);
tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__;
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
/* Fill the tile */
temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in);
tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__;
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ)
{
/* Write transposed data */
temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j],
);
vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out);
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
/* Write transposed data */
temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j],
);
vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
}
......@@ -38,42 +38,42 @@ __kernel void transpose_xz(__global const float* in,
__local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */
for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) {
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) {
for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) {
/* Use of diagonal coordinates */
/* Use of diagonal coordinates */
#if NB_III == NB_I
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
group_id_x = (gidI + gidIII) % NB_GROUPS_I;
group_id_z = gidI;
#else
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
uint bid = gidI + gidIII * NB_GROUPS_I;
group_id_z = bid%NB_GROUPS_III;
group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I;
#endif
xIndex = group_id_x * TILE_DIM_XZ + lid_x;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
xIndex = group_id_x * TILE_DIM_XZ + lid_x;
zIndex = group_id_z * TILE_DIM_XZ + lid_z;
index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
/* Global output index */
xIndex = group_id_z * TILE_DIM_XZ + lid_x;
zIndex = group_id_x * TILE_DIM_XZ + lid_z;
index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II;
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
/* Fill the tile */
tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
/* Fill the tile */
tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II];
}
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
/* Write transposed data */
out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
tile[lid_x][lid_z + j] = 0.0;
for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) {
/* Write transposed data */
out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j];
tile[lid_x][lid_z + j] = 0.0;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment