From 5134bc319b746c3e4b5e7eccd57443b1069330f9 Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr> Date: Thu, 16 Apr 2015 16:06:10 +0200 Subject: [PATCH] GPU code improvement for large problems --- HySoP/hysop/gpu/cl_src/kernels/advection.cl | 54 +++-- .../cl_src/kernels/advection_and_remeshing.cl | 84 +++++--- .../kernels/advection_and_remeshing_noVec.cl | 17 +- .../advection_euler_and_remeshing_noVec.cl | 70 +++--- .../cl_src/kernels/advection_euler_noVec.cl | 28 ++- .../gpu/cl_src/kernels/advection_noVec.cl | 35 ++- HySoP/hysop/gpu/cl_src/kernels/diffusion.cl | 203 ++++++++++-------- HySoP/hysop/gpu/cl_src/kernels/remeshing.cl | 17 +- .../gpu/cl_src/kernels/remeshing_noVec.cl | 17 +- .../hysop/gpu/cl_src/kernels/transpose_xy.cl | 42 ++-- .../gpu/cl_src/kernels/transpose_xy_noVec.cl | 61 ++++-- .../hysop/gpu/cl_src/kernels/transpose_xz.cl | 36 ++-- .../gpu/cl_src/kernels/transpose_xz_noVec.cl | 36 ++-- .../gpu/cl_src/kernels/transpose_xz_slice.cl | 36 ++-- .../kernels/transpose_xz_slice_noVec.cl | 57 ++--- HySoP/hysop/gpu/config_default.py | 91 +++++--- HySoP/hysop/gpu/config_k20m.py | 87 +++++--- HySoP/hysop/gpu/gpu_diffusion.py | 7 +- HySoP/hysop/gpu/gpu_kernel.py | 11 +- HySoP/hysop/gpu/gpu_particle_advection.py | 8 +- .../tests/test_advection_randomVelocity.py | 34 ++- HySoP/hysop/gpu/tests/test_transposition.py | 8 + 22 files changed, 668 insertions(+), 371 deletions(-) diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection.cl b/HySoP/hysop/gpu/cl_src/kernels/advection.cl index 29890ba9c..2e9e341e3 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection.cl @@ -5,19 +5,23 @@ /** * Computes particles positions from the velocity field. - * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. - * Each work-item computes <code>NB_I/WI_NB</code> particles positions. - * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. + * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. Computations of 1D problems are placed in loops over gidY and gidZ to adjust local workload and handle the work-item maximum size. + * Each work-item computes <code>NB_I/WI_NB</code> particles positions in each 1D problem. + * Particle are computed through OpenCL vector types of length 2, 4 or 8. * Velocity data are copied to a local buffer as a cache. * * @param gvelo Velocity. * @param ppos Particle position. * @param dt Time step. - * @param min_position Domain lower coordinate. - * @param dx Space step. + * @param mesh Mesh description. + * @param inv_v_dx_y velocity grid 1/dy + * @param inv_v_dx_z velocity grid 1/dz * * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>NB_X</code>, <code>NB_Y</code>, <code>NB_Z</code> : points number in physical space directions. * @remark <code>WI_NB</code> corresponds to the work-item number. + * @remark <code>ADVEC_IS_MULTISCALE</code> is a flag for multiscale. + * @remark <code>V_NB_I</code>, <code>V_NB_II</code>, <code>V_NB_III</code> : points number for velocity grid in directions from 1st varying index to last. * @remark <code>__N__</code> is expanded at compilation time by vector width. * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. * @see hysop.gpu.tools.parse_file @@ -31,29 +35,43 @@ __kernel void advection_kernel(__global const float* gvelo, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ float__N__ p; /* Particle position */ - uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __local float velocity_cache[V_NB_I]; /* Velocity cache */ + for(gidZ=get_global_id(2); +#ifdef NB_Z + gidZ<NB_III; +#else + gidZ<=get_global_id(2); // Single element loop +#endif + gidZ+=get_global_size(2)) { + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + + // 1D problem computations + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; #if ADVEC_IS_MULTISCALE - fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); + fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); #else - fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh); + fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh); #endif - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*__N__; i<NB_I; i+=WI_NB*__N__) { + /* Compute position */ + p = advection(i, dt, velocity_cache, mesh); + /* Store result */ + vstore__N__(p, (i+line_index)/__N__, ppos); + } - for(i=gidX*__N__; i<NB_I; i+=WI_NB*__N__) - { - /* Compute position */ - p = advection(i, dt, velocity_cache, mesh); - /* Store result */ - vstore__N__(p, (i+line_index)/__N__, ppos); + barrier(CLK_LOCAL_MEM_FENCE); } + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl index b1b998913..c9fb52ebd 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl @@ -1,11 +1,11 @@ /** * @file advection_and_remeshing.cl - * Advection and remeshing kernel. + * Advection and remeshing kernel, vectorized version. */ /** - * Performs advection and then remeshing of the particles' scalar. - * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. + * Performs advection and then remeshing of the particles scalar. + * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. Computations of 1D problems are placed in loops over gidY and gidZ to adjust local workload and handle the work-item maximum size. * Each work-item computes NB_I/WI_NB particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles. * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer. @@ -14,11 +14,15 @@ * @param pscal Particle scalar * @param gscal Grid scalar * @param dt Time step - * @param min_position Domain lower coordinate - * @param dx Space step + * @param mesh Mesh description. + * @param inv_v_dx_y velocity grid 1/dy + * @param inv_v_dx_z velocity grid 1/dz * * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>NB_X</code>, <code>NB_Y</code>, <code>NB_Z</code> : points number in physical space directions. * @remark <code>WI_NB</code> corresponds to the work-item number. + * @remark <code>ADVEC_IS_MULTISCALE</code> is a flag for multiscale. + * @remark <code>V_NB_I</code>, <code>V_NB_II</code>, <code>V_NB_III</code> : points number for velocity grid in directions from 1st varying index to last. * @remark <code>__N__</code> is expanded at compilation time by vector width. * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. * @remark <code>__RCOMP_I</code> flag is for instruction expansion for the different remeshed components. @@ -36,48 +40,62 @@ __kernel void advection_and_remeshing(__global const float* gvelo, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ float__N__ p; /* Particle position */ __RCOMP_I float__N__ s__ID__; /* Particle scalar */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */ __local float velocity_cache[V_NB_I]; /* Velocity cache */ + for(gidZ=get_global_id(2); +#ifdef NB_Z + gidZ<NB_III; +#else + gidZ<=get_global_id(2); // Single element loop +#endif + gidZ+=get_global_size(2)) { + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + + // 1D problem computations + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; + #if ADVEC_IS_MULTISCALE - fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); + fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); #else - fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh); + fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh); #endif - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) - { - /* Initialize result buffer */ - __RCOMP_Igscal_loc__ID__[noBC_id(i+__NN__)] = 0.0; - } + for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) { + /* Initialize result buffer */ + __RCOMP_Igscal_loc__ID__[noBC_id(i+__NN__)] = 0.0; + } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__) - { - /* Read Particle scalar */ - __RCOMP_Is__ID__ = vload__N__((i + line_index)/__N__, pscal__ID__); - /* Compute particle position */ - p = advection(i, dt, velocity_cache, mesh); - /* Remesh particle */ - remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh); - } + for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__) { + /* Read Particle scalar */ + __RCOMP_Is__ID__ = vload__N__((i + line_index)/__N__, pscal__ID__); + /* Compute particle position */ + p = advection(i, dt, velocity_cache, mesh); + /* Remesh particle */ + remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh); + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) { + /* Store result */ + __RCOMP_Ivstore__N__((float__N__)(gscal_loc__ID__[noBC_id(i+__NN__)], + ), (i + line_index)/__N__, gscal__ID__); + } - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) - { - /* Store result */ - __RCOMP_Ivstore__N__((float__N__)(gscal_loc__ID__[noBC_id(i+__NN__)], - ), (i + line_index)/__N__, gscal__ID__); + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); } + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl index 19dc24f54..5759dc6f5 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl @@ -36,16 +36,23 @@ __kernel void advection_and_remeshing(__global const float* gvelo, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ float p; /* Particle position */ __RCOMP_I float s__ID__; /* Particle scalar */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */ __local float velocity_cache[V_NB_I]; /* Velocity cache */ +#ifdef NB_Z + for(gidZ=get_global_id(2); gidZ<NB_III; gidZ+=get_global_size(2)) { +#else + gidZ=get_global_id(2); { +#endif + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; #if ADVEC_IS_MULTISCALE fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); @@ -80,4 +87,8 @@ __kernel void advection_and_remeshing(__global const float* gvelo, /* Store result */ __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)]; } + + barrier(CLK_LOCAL_MEM_FENCE); +} +} } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl index 47fda2027..99565be69 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_and_remeshing_noVec.cl @@ -1,11 +1,11 @@ /** * @file advection_and_remeshing.cl - * Advection and remeshing kernel. + * Euler advection and remeshing kernel. */ /** - * Performs advection and then remeshing of the particles' scalar. - * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. + * Performs advection and then remeshing of the particles scalar. + * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. Computations of 1D problems are placed in loops over gidY and gidZ to adjust local workload and handle the work-item maximum size. * Each work-item computes NB_I/WI_NB particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles. * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer. @@ -33,41 +33,53 @@ __kernel void advection_and_remeshing(__global const float* gvelo, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ float p,c; /* Particle position */ __RCOMP_I float s__ID__; /* Particle scalar */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */ - for(i=gidX; i<NB_I; i+=(WI_NB)) - { - /* Initialize result buffer */ - __RCOMP_Igscal_loc__ID__[noBC_id(i)] = 0.0; - } + for(gidZ=get_global_id(2); +#ifdef NB_Z + gidZ<NB_III; +#else + gidZ<=get_global_id(2); // Single element loop +#endif + gidZ+=get_global_size(2)) { + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + // 1D computations + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; - for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1) - { - /* Read Particle scalar */ - __RCOMP_Is__ID__ = pscal__ID__[i + line_index]; - /* Compute particle position */ - c = fma(i, mesh->dx.x, mesh->min_position); - p = fma(dt, gvelo[i+line_index], c); - /* Remesh particle */ - remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh); - } + for(i=gidX; i<NB_I; i+=(WI_NB)) { + /* Initialize result buffer */ + __RCOMP_Igscal_loc__ID__[noBC_id(i)] = 0.0; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1) { + /* Read Particle scalar */ + __RCOMP_Is__ID__ = pscal__ID__[i + line_index]; + /* Compute particle position */ + c = fma(i, mesh->dx.x, mesh->min_position); + p = fma(dt, gvelo[i+line_index], c); + /* Remesh particle */ + remesh(i, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__, mesh); + } - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX; i<NB_I; i+=(WI_NB)) - { - /* Store result */ - __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)]; + for(i=gidX; i<NB_I; i+=(WI_NB)) { + /* Store result */ + __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl index 091f57060..df90575a2 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_euler_noVec.cl @@ -23,15 +23,29 @@ __kernel void advection_kernel(__global const float* gvelo, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ - uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ float c; - for(i=gidX; i<NB_I; i+=WI_NB) - { - c = fma(i, mesh->dx.x, mesh->min_position); - ppos[i+line_index] = fma(dt, gvelo[i+line_index], c); + for(gidZ=get_global_id(2); +#ifdef NB_Z + gidZ<NB_III; +#else + gidZ<=get_global_id(2); // Single element loop +#endif + gidZ+=get_global_size(2)) { + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + + //1D computations + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; + + for(i=gidX; i<NB_I; i+=WI_NB) { + c = fma(i, mesh->dx.x, mesh->min_position); + ppos[i+line_index] = fma(dt, gvelo[i+line_index], c); + } + barrier(CLK_LOCAL_MEM_FENCE); } + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl index d116c3cce..78ca64d61 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl @@ -26,25 +26,38 @@ __kernel void advection_kernel(__global const float* gvelo, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ - uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __local float velocity_cache[V_NB_I]; /* Velocity cache */ -#if ADVEC_IS_MULTISCALE - fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); + for(gidZ=get_global_id(2); +#ifdef NB_Z + gidZ<NB_III; #else - fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh); + gidZ<=get_global_id(2); // Single element loop #endif + gidZ+=get_global_size(2)) { + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + + // 1D computation + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; +#if ADVEC_IS_MULTISCALE + fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, inv_v_dx_y, inv_v_dx_z, mesh); +#else + fill_velocity_cache(gvelo, gidX, gidY, gidZ, velocity_cache, mesh); +#endif - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX; i<NB_I; i+=WI_NB) - { - ppos[i+line_index] = advection(i, dt, velocity_cache, mesh); + for(i=gidX; i<NB_I; i+=WI_NB) { + ppos[i+line_index] = advection(i, dt, velocity_cache, mesh); + } + barrier(CLK_LOCAL_MEM_FENCE); } + } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl index 7bd4ef31c..edbfa572a 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl @@ -1,4 +1,26 @@ - +/** + * @file diffusion.cl + * Diffusion kernel. + */ + +/** + * Computes diffusion operator with finite differences. + * Stencil computation is performed within a 2D index space of size <code>TILE_SIZE</code> by a work-group. The 3rd direction is traversed in a loop for data reuse. + * + * @param scal_in Input scalar field + * @param ghostsX Ghosts array if X is a communication direction + * @param ghostsY Ghosts array if Y is a communication direction + * @param ghostsZ Ghosts array if Z is a communication direction + * @param scal_out Output scalar field + * @param nudt Diffusion coefficient + * @param dx Mesh space step + * + * @remark <code>NB_X</code>, <code>NB_Y</code>, <code>NB_Z</code> : points number in physical space directions. + * @remark <code>NB_PART</code> Particles number per work-item in computing direction + * @remark <code>CUT_DIT_X</code>, <code>CUT_DIT_Y</code> and <code>CUT_DIT_Z</code> : flags for communication direction + * @remark <code>NB_GROUPS_I</code> and <code>NB_GROUPS_II</code> : tiles number in X and Y directions. + * @remark <code>L_WIDTH</code> : work-item number in tile. + */ __kernel void diffusion(__global const float* scal_in, #if CUT_DIR_X == 1 @@ -11,47 +33,104 @@ __kernel void diffusion(__global const float* scal_in, __global const float* ghostsZ, #endif __global float* scal_out, - float nudt, float4 dx) + float nudt, + float4 dx) { - int t_gidX = get_group_id(0); - int t_gidY = get_group_id(1); - int lidX = get_local_id(0); - int lidY = get_local_id(1); - int gidX = t_gidX*TILE_SIZE + lidX; /* OpenCL work-item global index (X) */ - int gidY = t_gidY*TILE_SIZE + lidY; /* OpenCL work-item global index (Y) */ - int gidZ; - float cx = nudt/(dx.x*dx.x); - float cy = nudt/(dx.y*dx.y); - float cz = nudt/(dx.z*dx.z); + int t_gidX, t_gidY; + int lidX, lidY; + int gidX, gidY, gidZ; + float cx, cy, cz; float scal_z_m[NB_PART]; float scal_z[NB_PART]; float scal_z_p[NB_PART]; float s; uint i; - for(i=0;i<NB_PART;i++){ + __local float tile_XY[TILE_SIZE+2][TILE_SIZE+2]; + + for (t_gidX=get_group_id(0); t_gidX<NB_GROUPS_I; t_gidX+=get_num_groups(0)) { + for (t_gidY=get_group_id(1); t_gidY<NB_GROUPS_II; t_gidY+=get_num_groups(1)) { + + // Tile computation + lidX = get_local_id(0); + lidY = get_local_id(1); + gidX = t_gidX*TILE_SIZE + lidX; /* OpenCL work-item global index (X) */ + gidY = t_gidY*TILE_SIZE + lidY; /* OpenCL work-item global index (Y) */ + cx = nudt/(dx.x*dx.x); + cy = nudt/(dx.y*dx.y); + cz = nudt/(dx.z*dx.z); + + for(i=0;i<NB_PART;i++) { #if CUT_DIR_Z == 1 - scal_z_m[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X + NB_X*NB_Y]; + scal_z_m[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X + NB_X*NB_Y]; #else - scal_z_m[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (NB_Z-1)*NB_X*NB_Y]; + scal_z_m[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (NB_Z-1)*NB_X*NB_Y]; #endif - scal_z[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; - } + scal_z[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; + } - /* scal_z[nb_parts] */ - /* for i in xrange(nb_parts) */ - /* lidY+i*L_WIDTH */ - /* gidY+i*L_WIDTH */ + lidX += 1; + lidY += 1; - __local float tile_XY[TILE_SIZE+2][TILE_SIZE+2]; + // loop over Z indices but last. + for (gidZ=0; gidZ<(NB_Z-1); gidZ++) { + for(i=0;i<NB_PART;i++) { + // fill the tile + tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - lidX += 1; - lidY += 1; + /* // fill tile edges */ +#if CUT_DIR_X == 1 + tile_XY[0][lidY+i*L_WIDTH] = (t_gidX*TILE_SIZE>=1) ? scal_in[t_gidX*TILE_SIZE-1 + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] : ghostsX[1 + (gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = ((t_gidX+1)*TILE_SIZE<NB_X) ? scal_in[(t_gidX+1)*TILE_SIZE + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]: ghostsX[(gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y]; +#else + tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; +#endif + } +#if CUT_DIR_Y == 1 + tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + (t_gidY*TILE_SIZE-1)*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + NB_X + gidZ*NB_X*2]; + tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE<NB_Y) ? scal_in[gidX + (t_gidY+1)*TILE_SIZE*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + gidZ*NB_X*2]; +#else + tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; +#endif + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++) { + /* get scalar value in Z direction */ + scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (gidZ+1)*NB_X*NB_Y]; + + // Compute stencil + // central point + s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); + + s += cz*(scal_z_m[i] + scal_z_p[i]); + + s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1]; + s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1]; + s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH]; + s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH]; - // loop over Z indices but last. - for (gidZ=0; gidZ<(NB_Z-1); gidZ++) - { - for(i=0;i<NB_PART;i++){ + // write result + scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++) { + // Shift Z values + scal_z_m[i] = scal_z[i]; + scal_z[i] = scal_z_p[i]; + } + } + + // Compute last point (from ghosts) + gidZ = NB_Z - 1; + + for(i=0;i<NB_PART;i++) { // fill the tile tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; @@ -75,12 +154,16 @@ __kernel void diffusion(__global const float* scal_in, /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); - for(i=0;i<NB_PART;i++){ - /* get scalar value in Z direction */ - scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (gidZ+1)*NB_X*NB_Y]; + for(i=0;i<NB_PART;i++) { + /* // get scalar value in Z direction */ +#if CUT_DIR_Z == 1 + scal_z_p[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X]; +#else + scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; +#endif // Compute stencil - // central point + /* // central point */ s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); s += cz*(scal_z_m[i] + scal_z_p[i]); @@ -93,64 +176,8 @@ __kernel void diffusion(__global const float* scal_in, // write result scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; } - /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); - - for(i=0;i<NB_PART;i++){ - // Shift Z values - scal_z_m[i] = scal_z[i]; - scal_z[i] = scal_z_p[i]; - } } - - // Compute last point (from ghosts) - gidZ = NB_Z - 1; - - for(i=0;i<NB_PART;i++){ - // fill the tile - tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - - /* // fill tile edges */ -#if CUT_DIR_X == 1 - tile_XY[0][lidY+i*L_WIDTH] = (t_gidX*TILE_SIZE>=1) ? scal_in[t_gidX*TILE_SIZE-1 + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] : ghostsX[1 + (gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y]; - tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = ((t_gidX+1)*TILE_SIZE<NB_X) ? scal_in[(t_gidX+1)*TILE_SIZE + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]: ghostsX[(gidY+i*L_WIDTH)*2 + gidZ*2*NB_Y]; -#else - tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; -#endif - } -#if CUT_DIR_Y == 1 - tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + (t_gidY*TILE_SIZE-1)*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + NB_X + gidZ*NB_X*2]; - tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE<NB_Y) ? scal_in[gidX + (t_gidY+1)*TILE_SIZE*NB_X + gidZ*NB_X*NB_Y] : ghostsY[gidX + gidZ*NB_X*2]; -#else - tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; - tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; -#endif - - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); - - for(i=0;i<NB_PART;i++){ - /* // get scalar value in Z direction */ -#if CUT_DIR_Z == 1 - scal_z_p[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X]; -#else - scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; -#endif - - // Compute stencil - /* // central point */ - s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); - - s += cz*(scal_z_m[i] + scal_z_p[i]); - - s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1]; - s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1]; - s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH]; - s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH]; - - // write result - scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl index 2e2774527..809c5ad32 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl @@ -31,16 +31,24 @@ __kernel void remeshing_kernel(__global const float* ppos, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ // float invdx = 1.0/dx; /* Space step inverse */ uint i; /* Particle index in 1D problem */ float__N__ p; /* Particle position */ __RCOMP_I float__N__ s__ID__; /* Particle scalar */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */ +#ifdef NB_Z + for(gidZ=get_global_id(2); gidZ<NB_III; gidZ+=get_global_size(2)) { +#else + gidZ=get_global_id(2); { +#endif + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; + for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) { /* Initialize result buffer */ @@ -69,4 +77,7 @@ __kernel void remeshing_kernel(__global const float* ppos, __RCOMP_Ivstore__N__((float__N__)(gscal_loc__ID__[noBC_id(i+__NN__)], ),(i + line_index)/__N__, gscal__ID__); } + barrier(CLK_LOCAL_MEM_FENCE); +} +} } diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl index c8dbd3a8a..15db5730c 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl @@ -31,15 +31,23 @@ __kernel void remeshing_kernel(__global const float* ppos, __constant struct AdvectionMeshInfo* mesh) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + uint gidY; /* OpenCL work-itme global index (Y) */ + uint gidZ; /* OpenCL work-itme global index (Z) */ uint i; /* Particle index in 1D problem */ float p; /* Particle position */ __RCOMP_I float s__ID__; /* Particle scalar */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + uint line_index; /* Current 1D problem index */ __RCOMP_I__local float gscal_loc__ID__[NB_I]; /* Local buffer for result */ +#ifdef NB_Z + for(gidZ=get_global_id(2); gidZ<NB_III; gidZ+=get_global_size(2)) { +#else + gidZ=get_global_id(2); { +#endif + for(gidY=get_global_id(1); gidY<NB_II; gidY+=get_global_size(1)) { + line_index = gidY*NB_I+ gidZ*NB_I*NB_II; + for(i=gidX; i<NB_I; i+=WI_NB) { /* Initialize result buffer */ @@ -67,4 +75,7 @@ __kernel void remeshing_kernel(__global const float* ppos, /* Store result */ __RCOMP_Igscal__ID__[i + line_index] = gscal_loc__ID__[noBC_id(i)]; } + barrier(CLK_LOCAL_MEM_FENCE); +} +} } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl index 8e1725737..c430da22e 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl @@ -39,28 +39,40 @@ __kernel void transpose_xy(__global const float* in, uint group_id_y; /* Work-group coordinate in global space index Y */ uint lid_x = get_local_id(0); uint lid_y = get_local_id(1); + + unint xIndex, yIndex, zIndex; + uint index_in, index_out; + uint gidI, gidII, i; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ + +#ifdef NB_Z + for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) { +#else + zIndex=get_global_id(2); { +#endif + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { + /* Use of diagonal coordinates */ #if NB_II == NB_I - group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); - group_id_y = get_group_id(0); + group_id_x = (gidI + gidII) % NB_GROUPS_I; + group_id_y = gidI; #else - uint bid = get_group_id(0) + get_group_id(1) * get_num_groups(0); - group_id_y = bid%get_num_groups(1); - group_id_x = ((bid/get_num_groups(1)) + group_id_y)%get_num_groups(0); + uint bid = gidI + gidII * NB_GROUPS_I; + group_id_y = bid%NB_GROUPS_II; + group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I; #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__; - uint yIndex = group_id_y * TILE_DIM_XY + lid_y; - uint zIndex = get_global_id(2); - uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; + xIndex = group_id_x * TILE_DIM_XY + lid_x; + yIndex = group_id_y * TILE_DIM_XY + lid_y; + index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; /* Global output index */ xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__; yIndex = group_id_x * TILE_DIM_XY + lid_y; - uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - - __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { @@ -79,5 +91,9 @@ __kernel void transpose_xy(__global const float* in, ); vstore__N__(temp, (index_out + i*NB_I)/__N__, out); } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); +} +} +} } - diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl index 6c0f64489..dde7ba936 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl @@ -38,42 +38,57 @@ __kernel void transpose_xy(__global const float* in, uint group_id_y; /* Work-group coordinate in global space index Y */ uint lid_x = get_local_id(0); uint lid_y = get_local_id(1); + + uint xIndex, yIndex, zIndex; + uint index_in, index_out; + uint gidI, gidII, i; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ + +#ifdef NB_Z + for(zIndex=get_global_id(2); zIndex<NB_III; zIndex+=get_global_size(2)) { +#else + zIndex=get_global_id(2); { +#endif + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidII=get_group_id(1); gidII<NB_GROUPS_II; gidII+=get_num_groups(1)) { + /* Use of diagonal coordinates */ #if NB_II == NB_I - group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); - group_id_y = get_group_id(0); + group_id_x = (gidI + gidII) % NB_GROUPS_I; + group_id_y = gidI; #else - uint bid = get_group_id(0) + get_group_id(1) * get_num_groups(0); - group_id_y = bid%get_num_groups(1); - group_id_x = ((bid/get_num_groups(1)) + group_id_y)%get_num_groups(0); + uint bid = gidI + gidII * NB_GROUPS_I; + group_id_y = bid%NB_GROUPS_II; + group_id_x = ((bid/NB_GROUPS_II) + group_id_y)%NB_GROUPS_I; #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XY + lid_x; - uint yIndex = group_id_y * TILE_DIM_XY + lid_y; - uint zIndex = get_global_id(2); - uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; + xIndex = group_id_x * TILE_DIM_XY + lid_x; + yIndex = group_id_y * TILE_DIM_XY + lid_y; + index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; /* Global output index */ xIndex = group_id_y * TILE_DIM_XY + lid_x; yIndex = group_id_x * TILE_DIM_XY + lid_y; - uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - - __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) - { - /* Fill the tile */ - tile[lid_y + i][lid_x] = in[index_in + i * NB_II]; - } + for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { + /* Fill the tile */ + tile[lid_y + i][lid_x] = in[index_in + i * NB_II]; + } /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); - for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) - { - /* Write transposed data */ - out[index_out + i*NB_I] = tile[lid_x][lid_y + i]; - } -} + for(i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { + /* Write transposed data */ + out[index_out + i*NB_I] = tile[lid_x][lid_y + i]; + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); +} +} +} +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl index 8adc81f43..294925000 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl @@ -32,28 +32,36 @@ __kernel void transpose_xz(__global const float* in, uint lid_y = get_local_id(1); uint lid_z = get_local_id(2); + /* Global input index for work-item */ + uint xIndex, yIndex, zIndex; + uint index_in, index_out; + uint gidI, gidIII, j; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); - group_id_z = get_group_id(0); + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); - group_id_z = bid%get_num_groups(2); - group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__; - uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y; - uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; - uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + xIndex = group_id_x * TILE_DIM_XZ + lid_x; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; /* Global output index */ xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; zIndex = group_id_x * TILE_DIM_XZ + lid_z; - uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - - __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { @@ -78,4 +86,8 @@ __kernel void transpose_xz(__global const float* in, vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out); } } + barrier(CLK_LOCAL_MEM_FENCE); +} +} +} } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl index 96b771b20..37c811cf2 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl @@ -31,28 +31,36 @@ __kernel void transpose_xz(__global const float* in, uint lid_y = get_local_id(1); uint lid_z = get_local_id(2); + /* Global input index for work-item */ + uint xIndex, yIndex, zIndex; + uint index_in, index_out; + uint gidI, gidIII, j; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); - group_id_z = get_group_id(0); + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); - group_id_z = bid%get_num_groups(2); - group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XZ + lid_x; - uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y; - uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; - uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + xIndex = group_id_x * TILE_DIM_XZ + lid_x; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; /* Global output index */ xIndex = group_id_z * TILE_DIM_XZ + lid_x; zIndex = group_id_x * TILE_DIM_XZ + lid_z; - uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - - __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { @@ -74,4 +82,8 @@ __kernel void transpose_xz(__global const float* in, out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j]; } } + barrier(CLK_LOCAL_MEM_FENCE); +} +} +} } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl index b64960b0d..91475e45d 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl @@ -31,28 +31,36 @@ __kernel void transpose_xz(__global const float* in, uint lid_x = get_local_id(0); uint lid_z = get_local_id(2); + /* Global input index for work-item */ + uint xIndex, yIndex, zIndex; + uint index_in, index_out; + uint gidI, gidIII, j; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); - group_id_z = get_group_id(0); + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); - group_id_z = bid%get_num_groups(2); - group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__; - uint yIndex = get_global_id(1); - uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; - uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + xIndex = group_id_x * TILE_DIM_XZ + lid_x; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; /* Global output index */ xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; zIndex = group_id_x * TILE_DIM_XZ + lid_z; - uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - - __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { @@ -71,4 +79,8 @@ __kernel void transpose_xz(__global const float* in, ); vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out); } + barrier(CLK_LOCAL_MEM_FENCE); +} +} +} } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl index 7b06e13fb..9325ccfff 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl @@ -30,41 +30,50 @@ __kernel void transpose_xz(__global const float* in, uint lid_x = get_local_id(0); uint lid_z = get_local_id(2); + /* Global input index for work-item */ + uint xIndex, yIndex, zIndex; + uint index_in, index_out; + uint gidI, gidIII, j; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(yIndex=get_global_id(1); yIndex<NB_II; yIndex+=get_global_size(1)) { + for(gidI=get_group_id(0); gidI<NB_GROUPS_I; gidI+=get_num_groups(0)) { + for(gidIII=get_group_id(2); gidIII<NB_GROUPS_III; gidIII+=get_num_groups(2)) { + /* Use of diagonal coordinates */ #if NB_III == NB_I - group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); - group_id_z = get_group_id(0); + group_id_x = (gidI + gidIII) % NB_GROUPS_I; + group_id_z = gidI; #else - uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); - group_id_z = bid%get_num_groups(2); - group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); + uint bid = gidI + gidIII * NB_GROUPS_I; + group_id_z = bid%NB_GROUPS_III; + group_id_x = ((bid/NB_GROUPS_III) + group_id_z)%NB_GROUPS_I; #endif - /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XZ + lid_x; - uint yIndex = get_global_id(1); - uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; - uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + xIndex = group_id_x * TILE_DIM_XZ + lid_x; + zIndex = group_id_z * TILE_DIM_XZ + lid_z; + index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; /* Global output index */ xIndex = group_id_z * TILE_DIM_XZ + lid_x; zIndex = group_id_x * TILE_DIM_XZ + lid_z; - uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; - __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ - - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - /* Fill the tile */ - tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II]; - - } + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + /* Fill the tile */ + tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II]; + } /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) - { - /* Write transposed data */ - out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j]; - } + for(j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { + /* Write transposed data */ + out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j]; + tile[lid_x][lid_z + j] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); +} +} +} } diff --git a/HySoP/hysop/gpu/config_default.py b/HySoP/hysop/gpu/config_default.py index ee9ffc465..2f87511db 100644 --- a/HySoP/hysop/gpu/config_default.py +++ b/HySoP/hysop/gpu/config_default.py @@ -5,43 +5,62 @@ OpenCL kernels default configurations. """ from hysop.constants import np FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64 +MAX_GWI = (256, 256, 256) #build empty dictionaries kernels_config = {} kernels_config[2] = {FLOAT_GPU: {}, DOUBLE_GPU: {}} kernels_config[3] = {FLOAT_GPU: {}, DOUBLE_GPU: {}} -# Copy kernel: -def copy_space_index_2d(size, t_dim, b_rows, vec): - gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1) - lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi -def copy_space_index_3d(size, t_dim, b_rows, vec): - gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2])) - lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi -# Configs : sources, tile size, block rows, vector size, index space function -kernels_config[3][FLOAT_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d) -kernels_config[3][DOUBLE_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d) -kernels_config[2][FLOAT_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d) -kernels_config[2][DOUBLE_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d) +def _clamp_max(w, m): + while w > m: + w /= 2 + return int(w) + + +def check_max(t_gwi): + return tuple([_clamp_max(w, m) for w, m in zip(t_gwi, MAX_GWI)]) + +## Copy kernel is replaced by copy function from OpenCL API +# # Copy kernel: +# def copy_space_index_2d(size, t_dim, b_rows, vec): +# gwi = check_max((size[0] / vec, b_rows * size[1] / t_dim, 1)) +# lwi = (t_dim / vec, b_rows, 1) +# blocks_nb = ((size[0] / vec) / lwi[0], +# (b_rows * size[1] / t_dim) / lwi[1], None) +# return gwi, lwi, blocks_nb +# def copy_space_index_3d(size, t_dim, b_rows, vec): +# gwi = check_max((size[0] / vec, b_rows * size[1] / t_dim, size[2])) +# lwi = (t_dim / vec, b_rows, 1) +# blocks_nb = ((size[0] / vec) / lwi[0], +# (b_rows * size[1] / t_dim) / lwi[1], None) +# return gwi, lwi, blocks_nb +# # Configs : sources, tile size, block rows, vector size, index space function +# kernels_config[3][FLOAT_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d) +# kernels_config[3][DOUBLE_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_3d) +# kernels_config[2][FLOAT_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d) +# kernels_config[2][DOUBLE_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 32, 2, 1, copy_space_index_2d) # Transpositions kernels: # XY transposition # Settings are taken from destination layout as current layout. # gwi is computed form input layout (appears as transposed layout) def xy_space_index_2d(size, t_dim, b_rows, vec): - gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1) + gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, 1)) lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi + blocs_nb = ((size[1] / vec) / lwi[0], + (b_rows * size[0] / t_dim) / lwi[1], None) + return gwi, lwi, blocs_nb def xy_space_index_3d(size, t_dim, b_rows, vec): - gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2])) + gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, size[2])) lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi + blocs_nb = ((size[1] / vec) / lwi[0], + (b_rows * size[0] / t_dim) / lwi[1], None) + return gwi, lwi, blocs_nb # Configs : sources, tile size, block rows, is padding, vector size, # index space function kernels_config[3][FLOAT_GPU]['transpose_xy'] = \ @@ -57,9 +76,11 @@ kernels_config[2][DOUBLE_GPU]['transpose_xy'] = \ # Settings are taken from destination layout as current layout. # gwi is computed form input layout (appears as transposed layout) def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec): - gwi = (int(size[2] / vec), int(size[1]), int(b_deph * size[0] / t_dim)) + gwi = check_max((size[2] / vec, size[1], b_deph * size[0] / t_dim)) lwi = (t_dim / vec, 1, b_deph) - return gwi, lwi + blocs_nb = (((size[2]) / vec) / lwi[0], None, + (b_deph * (size[0]) / t_dim) / lwi[2]) + return gwi, lwi, blocs_nb # Configs : sources, tile size, block rows, is padding, vector size, # index space function kernels_config[3][FLOAT_GPU]['transpose_xz'] = \ @@ -77,11 +98,13 @@ def computational_kernels_index_space(wi, size, vec): wi = size[0] // vec if len(size) == 3: - gwi = (int(wi), int(size[1]), int(size[2])) + gwi = (int(wi), + _clamp_max(size[1], MAX_GWI[1]), + _clamp_max(size[2], MAX_GWI[2])) lwi = (int(wi), 1, 1) else: - gwi = (int(wi), int(size[1])) - lwi = (int(wi), 1) + gwi = (int(wi), _clamp_max(size[1], MAX_GWI[1]), 1) + lwi = (int(wi), 1, 1) return gwi, lwi def advection_index_space_3d(size, vec): @@ -102,7 +125,7 @@ def remeshing_index_space_2d(size, vec): return computational_kernels_index_space(wi, size, vec) def advection_and_remeshing_index_space(size, vec): - wi = min(max(32, size[0] / 2), 128) + wi = min(size[0] / 2, 128) return computational_kernels_index_space(wi, size, vec) @@ -172,11 +195,17 @@ kernels_config[2][DOUBLE_GPU]['advec_and_remesh'] = \ "kernels/advection_and_remeshing_noVec.cl"], False, 1, advection_and_remeshing_index_space) + +def diffusion_space_index_3d(size, nb_part, tile): + gwi = check_max((size[0], size[1] / nb_part)) + lwi = (tile, tile / nb_part) + blocs_nb = (size[0] / tile, size[1] / tile) + return gwi, lwi, blocs_nb + + kernels_config[3][FLOAT_GPU]['diffusion'] = \ (["common.cl", "kernels/diffusion.cl"], - 16, 1, 1, - lambda size, nb_part, tile: ((size[0], size[1] / nb_part), - (tile, tile / nb_part))) + 16, 1, 1, diffusion_space_index_3d) kernels_config[3][DOUBLE_GPU]['advec_comm'] = \ diff --git a/HySoP/hysop/gpu/config_k20m.py b/HySoP/hysop/gpu/config_k20m.py index e9a23eb69..cb39f7d2e 100644 --- a/HySoP/hysop/gpu/config_k20m.py +++ b/HySoP/hysop/gpu/config_k20m.py @@ -5,43 +5,60 @@ OpenCL kernels configurations. """ from hysop.constants import np FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64 +MAX_GWI = (1024, 1024, 1024) + + +def _clamp_max(w, m): + while w > m: + w /= 2 + return int(w) + + +def check_max(t_gwi): + return tuple([_clamp_max(w, m) for w, m in zip(t_gwi, MAX_GWI)]) + #build empty dictionaries kernels_config = {} kernels_config[2] = {FLOAT_GPU: {}, DOUBLE_GPU: {}} kernels_config[3] = {FLOAT_GPU: {}, DOUBLE_GPU: {}} -# Copy kernel: -def copy_space_index_2d(size, t_dim, b_rows, vec): - gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1) - lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi -def copy_space_index_3d(size, t_dim, b_rows, vec): - gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2])) - lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi -# Configs : sources, tile size, block rows, vector size, index space function -kernels_config[3][FLOAT_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_3d) -kernels_config[3][DOUBLE_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_3d) -kernels_config[2][FLOAT_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_2d) -kernels_config[2][DOUBLE_GPU]['copy'] = \ - ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_2d) +## Copy kernel is replaced by copy function from OpenCL API +# # Copy kernel: +# def copy_space_index_2d(size, t_dim, b_rows, vec): +# gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1) +# lwi = (t_dim / vec, b_rows, 1) +# return gwi, lwi +# def copy_space_index_3d(size, t_dim, b_rows, vec): +# gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2])) +# lwi = (t_dim / vec, b_rows, 1) +# return gwi, lwi +# # Configs : sources, tile size, block rows, vector size, index space function +# kernels_config[3][FLOAT_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_3d) +# kernels_config[3][DOUBLE_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_3d) +# kernels_config[2][FLOAT_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_2d) +# kernels_config[2][DOUBLE_GPU]['copy'] = \ +# ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_2d) # Transpositions kernels: # XY transposition # Settings are taken from destination layout as current layout. # gwi is computed form input layout (appears as transposed layout) def xy_space_index_2d(size, t_dim, b_rows, vec): - gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1) + gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, 1)) lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi + blocs_nb = ((size[1] / vec) / lwi[0], + (b_rows * size[0] / t_dim) / lwi[1], None) + return gwi, lwi, blocs_nb def xy_space_index_3d(size, t_dim, b_rows, vec): - gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2])) + gwi = check_max((size[1] / vec, b_rows * size[0] / t_dim, size[2])) lwi = (t_dim / vec, b_rows, 1) - return gwi, lwi + block_nb = ((size[1] / vec) / lwi[0], + (b_rows * size[0] / t_dim) / lwi[1], None) + return gwi, lwi, block_nb # Configs : sources, tile size, block rows, is padding, vector size, # index space function kernels_config[3][FLOAT_GPU]['transpose_xy'] = \ @@ -57,9 +74,11 @@ kernels_config[2][DOUBLE_GPU]['transpose_xy'] = \ # Settings are taken from destination layout as current layout. # gwi is computed form input layout (appears as transposed layout) def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec): - gwi = (int(size[2] / vec), int(size[1]), int(b_deph * size[0] / t_dim)) + gwi = check_max((size[2] / vec, size[1], b_deph * size[0] / t_dim)) lwi = (t_dim / vec, 1, b_deph) - return gwi, lwi + blocs_nb = ((size[2] / vec) / lwi[0], None, + (b_deph * size[0] / t_dim) / lwi[2]) + return gwi, lwi, blocs_nb # Configs : sources, tile size, block rows, is padding, vector size, # index space function kernels_config[3][FLOAT_GPU]['transpose_xz'] = \ @@ -77,15 +96,17 @@ def computational_kernels_index_space(wi, size, vec): wi = size[0] // vec if len(size) == 3: - gwi = (int(wi), int(size[1]), int(size[2])) + gwi = (int(wi), + _clamp_max(size[1], MAX_GWI[1]), + _clamp_max(size[2], MAX_GWI[2])) lwi = (int(wi), 1, 1) else: - gwi = (int(wi), int(size[1])) + gwi = (int(wi), _clamp_max(size[1], MAX_GWI[1])) lwi = (int(wi), 1) return gwi, lwi def advection_index_space_3d(size, vec): - wi = min(max(64, size[0] / 4), 1024) + wi = min(size[0] / 4, 1024) return computational_kernels_index_space(wi, size, vec) def advection_index_space_2d_SP(size, vec): wi = min(size[0] / 8, 1024) @@ -180,11 +201,17 @@ kernels_config[2][DOUBLE_GPU]['advec_and_remesh'] = \ "kernels/advection_and_remeshing_noVec.cl"], False, 1, advection_and_remeshing_index_space) + +def diffusion_space_index_3d(size, nb_part, tile): + gwi = check_max((size[0], size[1] / nb_part)) + lwi = (tile, tile / nb_part) + blocs_nb = (size[0] / tile, size[1] / tile) + return gwi, lwi, blocs_nb + + kernels_config[3][DOUBLE_GPU]['diffusion'] = \ (["common.cl", "kernels/diffusion.cl"], - 16, 4, 1, - lambda size, nb_part, tile: ((size[0], size[1] / nb_part), - (tile, tile / nb_part))) + 16, 4, 1, diffusion_space_index_3d) kernels_config[3][DOUBLE_GPU]['advec_comm'] = \ diff --git a/HySoP/hysop/gpu/gpu_diffusion.py b/HySoP/hysop/gpu/gpu_diffusion.py index 0758a4581..c1ba26ec4 100644 --- a/HySoP/hysop/gpu/gpu_diffusion.py +++ b/HySoP/hysop/gpu/gpu_diffusion.py @@ -154,10 +154,11 @@ class GPUDiffusion(DiscreteOperator, GPUOperator): build_options += " -D CUT_DIR" + S_DIR[d] + "=" build_options += str(1 if topo.shape[d] > 1 else 0) + gwi, lwi, blocs_nb = f_space(self.field.data[0].shape, + nb_part_per_wi, tile_size) + build_options += " -D NB_GROUPS_I={0}".format(blocs_nb[0]) + build_options += " -D NB_GROUPS_II={0}".format(blocs_nb[1]) prg = self.cl_env.build_src(src, build_options, vec) - - gwi, lwi = f_space(self.field.data[0].shape, - nb_part_per_wi, tile_size) self.num_diffusion = KernelLauncher( prg.diffusion, self.cl_env.queue, gwi, lwi) self.copy = KernelLauncher(cl.enqueue_copy, diff --git a/HySoP/hysop/gpu/gpu_kernel.py b/HySoP/hysop/gpu/gpu_kernel.py index abc0e9fbd..411e141ce 100644 --- a/HySoP/hysop/gpu/gpu_kernel.py +++ b/HySoP/hysop/gpu/gpu_kernel.py @@ -2,10 +2,10 @@ @file gpu_kernel.py """ from hysop.constants import debug, S_DIR -from hysop import __VERBOSE__ +#from hysop import __VERBOSE__ from hysop.gpu import cl, CL_PROFILE from hysop.tools.profiler import FProfiler - +__VERBOSE__ = True class KernelListLauncher(object): """ @@ -73,8 +73,11 @@ class KernelListLauncher(object): @return OpenCL Event. """ if __VERBOSE__: - print "OpenCL kernel:", self.kernel[d].function_name - print d, args[0], args[1], args, kwargs + try: + print "OpenCL kernel:", self.kernel[d].function_name, d, args[0], args[1] + #print d, args[0], args[1], args, kwargs + except AttributeError: + print "OpenCL kernel:", self.kernel[d].__name__ evt = self.kernel[d](self.queue, *args, **kwargs) if CL_PROFILE: evt.wait() diff --git a/HySoP/hysop/gpu/gpu_particle_advection.py b/HySoP/hysop/gpu/gpu_particle_advection.py index 18a7ed70a..0504675eb 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection.py +++ b/HySoP/hysop/gpu/gpu_particle_advection.py @@ -393,7 +393,7 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator): while t_dim > resol_tmp[0] or t_dim > resol_tmp[1] or \ (resol_tmp[0] % t_dim) > 0 or (resol_tmp[1] % t_dim) > 0: t_dim /= 2 - gwi, lwi = f_space(resol_tmp, t_dim, b_rows, vec) + gwi, lwi, blocs_nb = f_space(resol_tmp, t_dim, b_rows, vec) if is_padding: build_options += " -D PADDING_XY=1" @@ -401,6 +401,8 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator): build_options += " -D PADDING_XY=0" build_options += " -D TILE_DIM_XY={0}".format(t_dim) build_options += " -D BLOCK_ROWS_XY={0}".format(b_rows) + build_options += " -D NB_GROUPS_I={0}".format(blocs_nb[0]) + build_options += " -D NB_GROUPS_II={0}".format(blocs_nb[1]) build_options += ocl_cte prg = self.cl_env.build_src(src, build_options, vec) return KernelLauncher(prg.transpose_xy, self.cl_env.queue, gwi, lwi) @@ -447,7 +449,7 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator): while t_dim > resol_tmp[0] or t_dim > resol_tmp[2] or \ (resol_tmp[0] % t_dim) > 0 or (resol_tmp[2] % t_dim) > 0: t_dim /= 2 - gwi, lwi = f_space(resol_tmp, t_dim, b_rows, b_deph, vec) + gwi, lwi, blocs_nb = f_space(resol_tmp, t_dim, b_rows, b_deph, vec) if is_padding: build_options += " -D PADDING_XZ=1" else: @@ -455,6 +457,8 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator): build_options += " -D TILE_DIM_XZ={0}".format(t_dim) build_options += " -D BLOCK_ROWS_XZ={0}".format(b_rows) build_options += " -D BLOCK_DEPH_XZ={0}".format(b_deph) + build_options += " -D NB_GROUPS_I={0}".format(blocs_nb[0]) + build_options += " -D NB_GROUPS_III={0}".format(blocs_nb[2]) build_options += ocl_cte prg = self.cl_env.build_src( src, diff --git a/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py b/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py index 2e4263578..c151b8f3f 100644 --- a/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py +++ b/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py @@ -54,7 +54,8 @@ def assertion_2D_withPython(scal, velo, advec, advec_py): scal_d.toHost() advec.finalize() - assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07) + err_m = np.max(np.abs(py_res - scal_d.data[0])) + assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) def assertion_3D_withPython(scal, velo, advec, advec_py): @@ -83,7 +84,8 @@ def assertion_3D_withPython(scal, velo, advec, advec_py): scal_d.toHost() advec.finalize() - assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07) + err_m = np.max(np.abs(py_res - scal_d.data[0])) + assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) d2d = Discretization([33, 33]) @@ -707,7 +709,8 @@ def test_rectangular_domain2D(): py_res = scal_d.data[0].copy() scal_d.toHost() - assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07) + err_m = np.max(np.abs(py_res - scal_d.data[0])) + assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) advec.finalize() @@ -753,7 +756,9 @@ def test_rectangular_domain3D(): py_res = scal_d.data[0].copy() scal_d.toHost() - assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07) + err_m = np.max(np.abs(py_res - scal_d.data[0])) + assert np.allclose(py_res, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) + " " + \ + str(np.where(np.abs(py_res - scal_d.data[0]) > 1e-07+1e-04*np.abs(scal_d.data[0]))) advec.finalize() @@ -792,7 +797,6 @@ def test_vector_2D(): scal_d.toDevice() velo_d.toDevice() - print np.max(scal_d.data[0] - scal_d.data[1]) advec_py.apply(Simulation(tinit=0., tend=0.1, nbIter=1)) advec.apply(Simulation(tinit=0., tend=0.1, nbIter=1)) @@ -800,8 +804,12 @@ def test_vector_2D(): py_res_Y = scal_d.data[1].copy() scal_d.toHost() - assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07) - assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07) + err_m = np.max(np.abs(py_res_X - scal_d.data[0])) + assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) + " " + \ + str(np.where(np.abs(py_res_X - scal_d.data[0]) > 1e-07+1e-04*np.abs(scal_d.data[0]))) + err_m = np.max(np.abs(py_res_Y - scal_d.data[1])) + assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07), str(err_m) + " "+ \ + str(np.where(np.abs(py_res_Y - scal_d.data[1]) > 1e-07+1e-04*np.abs(scal_d.data[1]))) advec.finalize() @@ -854,7 +862,13 @@ def test_vector_3D(): py_res_Z = scal_d.data[2].copy() scal_d.toHost() - assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07) - assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07) - assert np.allclose(py_res_Z, scal_d.data[2], rtol=1e-04, atol=1e-07) + err_m = np.max(np.abs(py_res_X - scal_d.data[0])) + assert np.allclose(py_res_X, scal_d.data[0], rtol=1e-04, atol=1e-07), str(err_m) + " " + \ + str(np.where(np.abs(py_res_X_- scal_d.data[0]) > 1e-07+1e-04*np.abs(scal_d.data[0]))) + err_m = np.max(np.abs(py_res_Y - scal_d.data[0])) + assert np.allclose(py_res_Y, scal_d.data[1], rtol=1e-04, atol=1e-07), str(err_m) + " " + \ + str(np.where(np.abs(py_res_Y_- scal_d.data[1]) > 1e-07+1e-04*np.abs(scal_d.data[1]))) + err_m = np.max(np.abs(py_res_Z - scal_d.data[0])) + assert np.allclose(py_res_Z, scal_d.data[2], rtol=1e-04, atol=1e-07), str(err_m) + " " + \ + str(np.where(np.abs(py_res_Z_- scal_d.data[2]) > 1e-07+1e-04*np.abs(scal_d.data[2]))) advec.finalize() diff --git a/HySoP/hysop/gpu/tests/test_transposition.py b/HySoP/hysop/gpu/tests/test_transposition.py index c6da15f27..ddb81e299 100644 --- a/HySoP/hysop/gpu/tests/test_transposition.py +++ b/HySoP/hysop/gpu/tests/test_transposition.py @@ -372,6 +372,9 @@ def test_transposition_xz3Dslice(): int(resolution[1]), int(resolution[2] / 4)) lwi = (16, 1, 4) + build_options += " -D NB_GROUPS_I=" + str(resolution[0] / lwi[0]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) + prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) @@ -425,6 +428,8 @@ def test_transposition_xz3Dslice_rect(): int(resolution[1]), int(resolution[2] / 4)) lwi = (16, 1, 4) + build_options += " -D NB_GROUPS_I=" + str(resolution[0] / lwi[0]) + build_options += " -D NB_GROUPS_III=" + str((resolution[2] / 4) / lwi[2]) prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz_x = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) @@ -437,6 +442,8 @@ def test_transposition_xz3Dslice_rect(): int(resolution[1]), int(resolution[0] / 4)) lwi = (16, 1, 4) + build_options += " -D NB_GROUPS_I=" + str(resolution[2] / lwi[0]) + build_options += " -D NB_GROUPS_III=" + str((resolution[0] / 4) / lwi[2]) prg = cl_env.build_src(src_transpose_xz, build_options, vec) init_transpose_xz_z = KernelLauncher( prg.transpose_xz, cl_env.queue, gwi, lwi) @@ -463,6 +470,7 @@ def test_transposition_xz3Dslice_rect(): cl_env.queue.finish() cl.enqueue_copy(cl_env.queue, data_out, data_gpu_out) cl_env.queue.finish() + print len(np.where(np.abs(data_out-data_res)>0.0001)[0]) assert np.allclose(data_out, data_res) init_transpose_xz_z(data_gpu_out, data_gpu_out2) -- GitLab