diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl index 008be61a924d04004e424cf66c0dadd509ac59c2..0ebb79ffd416783d76971cbf755e255c38463949 100644 --- a/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_2D_builtin.cl @@ -20,7 +20,7 @@ __kernel void advection(__global const float* gvelo, float invdx = 1.0/dx; uint i; float v, y, p; - for(i=gidX; i<WIDTH; i+=WGN) + for(i=gidX; i<WIDTH; i+=WGNADVEC) { v = gvelo[i+gidY*WIDTH]; p = fma((float)(0.5*dt),v,i*dx); diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl index d99fe53429fe274cca78682038c06150eda981b9..6910c47ad24b958222e61004dfb91ae5c78393d0 100644 --- a/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_2D_opt2_builtin.cl @@ -23,7 +23,7 @@ __kernel void advection(__global const float* gvelo, float2 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*2; i<WIDTH; i+=WGN*2) + for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2) { v = vload2((i+gidY*WIDTH)/2, gvelo); velocity_cache[i] = v.x; @@ -32,7 +32,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*2; i<WIDTH; i+=WGN*2) + for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2) { v = vload2(i/2, velocity_cache); coord = (float2)(i*dx, (i+1)*dx); diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl index c9d9637ad6728e8b5346532d706618100e647324..302cc4a0dee8bb152a438966d03f6e00dec22a3f 100644 --- a/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_2D_opt4_builtin.cl @@ -23,7 +23,7 @@ __kernel void advection(__global const float* gvelo, float4 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=WGN*4) + for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4) { v = vload4((i+gidY*WIDTH)/4, gvelo); velocity_cache[i] = v.x; @@ -34,7 +34,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=WGN*4) + for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4) { v = vload4(i/4, velocity_cache); coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); diff --git a/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl index cab7c1dd42d87d26fb21c864a80211d430d2a460..3d87393b5fc6b3bf329183c110276b327e3b742d 100644 --- a/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_2D_opt8_builtin.cl @@ -23,7 +23,7 @@ __kernel void advection(__global const float* gvelo, float8 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*8; i<WIDTH; i+=WGN*8) + for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8) { v = vload8((i+gidY*WIDTH)/8, gvelo); velocity_cache[i] = v.s0; @@ -38,7 +38,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*8; i<WIDTH; i+=WGN*8) + for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8) { v = vload8(i/8, velocity_cache); coord = (float8)(i*dx, diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl index c54dd72679192fcca254c170d8669d3cbe80742f..1e0e6883501a2164d96053e59d8dcd80083aa33a 100644 --- a/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_3D_builtin.cl @@ -21,7 +21,7 @@ __kernel void advection(__global const float* gvelo, float invdx = 1.0/dx; uint i; float v, y, p; - for(i=gidX; 
i<WIDTH; i+=WGN) + for(i=gidX; i<WIDTH; i+=WGNADVEC) { v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH]; p = fma((float)(0.5*dt),v,i*dx); diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl index 3891bb10dcdc5442cb14e4857f385eb4f4c7f4c8..777b9d3150184c9fab2e65217a57acbc46a971fe 100644 --- a/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_3D_opt2_builtin.cl @@ -24,7 +24,7 @@ __kernel void advection(__global const float* gvelo, float2 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*2; i<WIDTH; i+=WGN*2) + for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2) { v = vload2((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/2, gvelo); velocity_cache[i] = v.x; @@ -33,7 +33,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*2; i<WIDTH; i+=WGN*2) + for(i=gidX*2; i<WIDTH; i+=WGNADVEC*2) { v = vload2(i/2, velocity_cache); coord = (float2)(i*dx, (i+1)*dx); diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl index 49ae8e1d4bb4ab796e0b659eaeadc7e8177bb0eb..54fff87d16772990c20e1d8fc48563b6362b50f2 100644 --- a/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_3D_opt4_builtin.cl @@ -24,7 +24,7 @@ __kernel void advection(__global const float* gvelo, float4 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=WGN*4) + for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4) { v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); velocity_cache[i] = v.x; @@ -35,7 +35,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=WGN*4) + for(i=gidX*4; i<WIDTH; i+=WGNADVEC*4) { v = vload4(i/4, velocity_cache); coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); diff --git a/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl b/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl index db3fb09a765970c6a15b7c3f43a99fc3e92c46bb..f019290fb6378c63ae549771bde3f67ac8975407 100644 --- a/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection_3D_opt8_builtin.cl @@ -24,7 +24,7 @@ __kernel void advection(__global const float* gvelo, float8 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*8; i<WIDTH; i+=WGN*8) + for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8) { v = vload8((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/8, gvelo); velocity_cache[i] = v.s0; @@ -39,7 +39,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*8; i<WIDTH; i+=WGN*8) + for(i=gidX*8; i<WIDTH; i+=WGNADVEC*8) { v = vload8(i/8, velocity_cache); coord = (float8)(i*dx, diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl index 2eaec614a26d971b549d417d48d85a2ce4f5dd80..b782866e914d2bab073e9bc8f1d17ec2e4c8d334 100644 --- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt4_builtin_noBC.cl @@ -1,6 +1,6 @@ inline uint noBC_id(int id, int nb_part){ - return (id%nb_part)*WGN+(id/nb_part); + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); } __kernel void advection_and_remeshing(__global const float* gvelo, __global const float* pscal, @@ -11,14 +11,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo, uint gidY = get_global_id(1); float invdx = 1.0/dx; int4 ind, i_ind, i_ind_p; - 
uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; float4 p, s, y, gs, v, vp, coord; uint4 index4; __local float gscal_loc[WIDTH]; __local float gvelo_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) { v = vload4((i+gidY*WIDTH)/4, gvelo); gvelo_loc[noBC_id(i,nb_part)] = v.x; @@ -90,6 +90,6 @@ __kernel void advection_and_remeshing(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); } diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl index 289b9d4a6a0c4fc9bd2f84c09e96d294385485ed..0f0d12414b885bbeb75afbdd55a400f69bbe0a8b 100644 --- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_2D_opt8_builtin_noBC.cl @@ -1,6 +1,6 @@ inline uint noBC_id(int id, int nb_part){ - return (id%nb_part)*WGN+(id/nb_part); + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); } __kernel void advection_and_remeshing(__global const float* gvelo, __global const float* pscal, @@ -11,17 +11,17 @@ __kernel void advection_and_remeshing(__global const float* gvelo, uint gidY = get_global_id(1); float invdx = 1.0/dx; int8 ind, i_ind, i_ind_p; - uint i, nb_part = WIDTH/WGN; - float8 p, s, y, gs, v, vp, coord; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float8 p, s, y, v, vp, coord; uint8 index8; __local float gscal_loc[WIDTH]; __local float gvelo_loc[WIDTH]; - for(i=gidX*8; i<WIDTH; i+=(WGN*8)) + for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8)) { v = vload8((i+gidY*WIDTH)/8, gvelo); - gvelo_loc[noBC_id(i,nb_part)] = v.s0; + gvelo_loc[noBC_id(i+0,nb_part)] = v.s0; gvelo_loc[noBC_id(i+1,nb_part)] = v.s1; gvelo_loc[noBC_id(i+2,nb_part)] = v.s2; gvelo_loc[noBC_id(i+3,nb_part)] = v.s3; @@ -29,14 +29,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo, gvelo_loc[noBC_id(i+5,nb_part)] = v.s5; gvelo_loc[noBC_id(i+6,nb_part)] = v.s6; gvelo_loc[noBC_id(i+7,nb_part)] = v.s7; - gscal_loc[i] = 0.0; - gscal_loc[i+1] = 0.0; - gscal_loc[i+2] = 0.0; - gscal_loc[i+3] = 0.0; - gscal_loc[i+4] = 0.0; - gscal_loc[i+5] = 0.0; - gscal_loc[i+6] = 0.0; - gscal_loc[i+7] = 0.0; + gscal_loc[noBC_id(i+0,nb_part)] = 0.0; + gscal_loc[noBC_id(i+1,nb_part)] = 0.0; + gscal_loc[noBC_id(i+2,nb_part)] = 0.0; + gscal_loc[noBC_id(i+3,nb_part)] = 0.0; + gscal_loc[noBC_id(i+4,nb_part)] = 0.0; + gscal_loc[noBC_id(i+5,nb_part)] = 0.0; + gscal_loc[noBC_id(i+6,nb_part)] = 0.0; + gscal_loc[noBC_id(i+7,nb_part)] = 0.0; } barrier(CLK_LOCAL_MEM_FENCE); for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8) @@ -145,7 +145,7 @@ __kernel void advection_and_remeshing(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*8; i<WIDTH; i+=(WGN*8)) + for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8)) vstore8((float8)(gscal_loc[noBC_id(i,nb_part)], gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl index 46e9bc59b2ea793d24cd36c429a41bb0e448ff8e..5a380089032e434c0779a1ea1496df8c52cc2582 100644 --- 
a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4.cl @@ -9,14 +9,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo, uint gidZ = get_global_id(2); float invdx = 1.0/dx; int4 ind, i_ind, i_ind_p; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; float4 p, s, y, gs, v, vp, coord; uint4 index4; __local float gscal_loc[WIDTH]; __local float gvelo_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) { v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); gvelo_loc[i] = v.x; @@ -96,6 +96,6 @@ __kernel void advection_and_remeshing(__global const float* gvelo, gscal_loc[index4.w] += gs.w; } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); } diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl index 5d221f67393d24f1f7614f422ef0396ad22d8c20..c66338a804599ac37d49ea03b0808ea2dc824168 100644 --- a/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl @@ -1,6 +1,6 @@ inline uint noBC_id(int id, int nb_part){ - return (id%nb_part)*WGN+(id/nb_part); + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); } __kernel void advection_and_remeshing(__global const float* gvelo, __global const float* pscal, @@ -12,14 +12,14 @@ __kernel void advection_and_remeshing(__global const float* gvelo, uint gidZ = get_global_id(2); float invdx = 1.0/dx; int4 ind, i_ind, i_ind_p; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; float4 p, s, y, gs, v, vp, coord; uint4 index4; __local float gscal_loc[WIDTH]; __local float gvelo_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) { v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); gvelo_loc[noBC_id(i, nb_part)] = v.x; @@ -96,6 +96,6 @@ __kernel void advection_and_remeshing(__global const float* gvelo, gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); } diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt4_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt4_builtin_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..da486b5394795be6d42dae6cf35ab7daac782c58 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt4_builtin_noBC.cl @@ -0,0 +1,83 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float4 p, s, y, gs, v, vp, coord; + 
uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + { + v = vload4((i+gidY*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.x; + gvelo_loc[noBC_id(i+1,nb_part)] = v.y; + gvelo_loc[noBC_id(i+2,nb_part)] = v.z; + gvelo_loc[noBC_id(i+3,nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt8_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt8_builtin_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..371d03ef7a559c804076bda156e2648a806a2a82 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_2D_opt8_builtin_noBC.cl @@ -0,0 +1,137 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + 
float dt,float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int8 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float8 p, s, y, gs, v, vp, coord; + uint8 index8; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8)) + { + v = vload8((i+gidY*WIDTH)/8, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.s0; + gvelo_loc[noBC_id(i+1,nb_part)] = v.s1; + gvelo_loc[noBC_id(i+2,nb_part)] = v.s2; + gvelo_loc[noBC_id(i+3,nb_part)] = v.s3; + gvelo_loc[noBC_id(i+4,nb_part)] = v.s4; + gvelo_loc[noBC_id(i+5,nb_part)] = v.s5; + gvelo_loc[noBC_id(i+6,nb_part)] = v.s6; + gvelo_loc[noBC_id(i+7,nb_part)] = v.s7; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + gscal_loc[i+4] = 0.0; + gscal_loc[i+5] = 0.0; + gscal_loc[i+6] = 0.0; + gscal_loc[i+7] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8) + { + v = (float8)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)], + gvelo_loc[noBC_id(i+4,nb_part)], + gvelo_loc[noBC_id(i+5,nb_part)], + gvelo_loc[noBC_id(i+6,nb_part)], + gvelo_loc[noBC_id(i+7,nb_part)]); + coord = (float8)(i*dx, + (i+1)*dx, + (i+2)*dx, + (i+3)*dx, + (i+4)*dx, + (i+5)*dx, + (i+6)*dx, + (i+7)*dx); + p = fma((float8)(0.5*dt),v,coord); + ind = convert_int8_rtn(p * invdx); + y = (fma(- convert_float8(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)], + gvelo_loc[noBC_id(i_ind.s1,nb_part)], + gvelo_loc[noBC_id(i_ind.s2,nb_part)], + gvelo_loc[noBC_id(i_ind.s3,nb_part)], + gvelo_loc[noBC_id(i_ind.s4,nb_part)], + gvelo_loc[noBC_id(i_ind.s5,nb_part)], + gvelo_loc[noBC_id(i_ind.s6,nb_part)], + gvelo_loc[noBC_id(i_ind.s7,nb_part)]); + vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s1,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s2,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s3,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s4,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s5,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s6,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]); + p = fma(mix(v,vp,y),dt,coord); + + s = vload8((i + gidY*WIDTH)/8, pscal); + ind = convert_int8_rtn(p * invdx); + y = (p - convert_float8(ind) * dx) * invdx; + index8 = convert_uint8((ind - 1 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6)); + 
gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8)) + vstore8((float8)(gscal_loc[noBC_id(i,nb_part)], + gscal_loc[noBC_id(i+1,nb_part)], + gscal_loc[noBC_id(i+2,nb_part)], + gscal_loc[noBC_id(i+3,nb_part)], + gscal_loc[noBC_id(i+4,nb_part)], + gscal_loc[noBC_id(i+5,nb_part)], + gscal_loc[noBC_id(i+6,nb_part)], + gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..10600e47a45b654f02e61a56e1ecfea040fcf16d --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4.cl @@ -0,0 +1,86 @@ + +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[i] = v.x; + gvelo_loc[i+1] = v.y; + gvelo_loc[i+2] = v.z; + gvelo_loc[i+3] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = vload4(i/4, gvelo_loc); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y], + gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]); + vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y], + gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - 
convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH); + barrier(CLK_LOCAL_MEM_FENCE); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (beta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..b6e5fd1c0c7afb4bb513f66b8541260edb5c8174 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m4prime_3D_opt4_builtin_private4_noBC.cl @@ -0,0 +1,115 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i, nb_part)] = v.x; + gvelo_loc[noBC_id(i+1, nb_part)] = v.y; + gvelo_loc[noBC_id(i+2, nb_part)] = v.z; + gvelo_loc[noBC_id(i+3, nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind 
= convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (theta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (iota(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..0e506b6f454872d8ad13bfb0213fe1095ea0d44b --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl @@ -0,0 +1,107 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float 
gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + { + v = vload4((i+gidY*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.x; + gvelo_loc[noBC_id(i+1,nb_part)] = v.y; + gvelo_loc[noBC_id(i+2,nb_part)] = v.z; + gvelo_loc[noBC_id(i+3,nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + 
gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..8c5efa160d98d70d55772668214fd288618f8a1b --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl @@ -0,0 +1,177 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt,float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int8 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float8 p, s, y, gs, v, vp, coord; + uint8 index8; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8)) + { + v = vload8((i+gidY*WIDTH)/8, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.s0; + gvelo_loc[noBC_id(i+1,nb_part)] = v.s1; + gvelo_loc[noBC_id(i+2,nb_part)] = v.s2; + gvelo_loc[noBC_id(i+3,nb_part)] = v.s3; + gvelo_loc[noBC_id(i+4,nb_part)] = v.s4; + gvelo_loc[noBC_id(i+5,nb_part)] = v.s5; + gvelo_loc[noBC_id(i+6,nb_part)] = v.s6; + gvelo_loc[noBC_id(i+7,nb_part)] = v.s7; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + gscal_loc[i+4] = 0.0; + gscal_loc[i+5] = 0.0; + gscal_loc[i+6] = 0.0; + gscal_loc[i+7] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8) + { + v = (float8)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)], + gvelo_loc[noBC_id(i+4,nb_part)], + gvelo_loc[noBC_id(i+5,nb_part)], + gvelo_loc[noBC_id(i+6,nb_part)], + gvelo_loc[noBC_id(i+7,nb_part)]); + coord = (float8)(i*dx, + (i+1)*dx, + (i+2)*dx, + (i+3)*dx, + (i+4)*dx, + (i+5)*dx, + (i+6)*dx, + (i+7)*dx); + p = fma((float8)(0.5*dt),v,coord); + ind = convert_int8_rtn(p * invdx); + y = (fma(- convert_float8(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)], + gvelo_loc[noBC_id(i_ind.s1,nb_part)], + gvelo_loc[noBC_id(i_ind.s2,nb_part)], + gvelo_loc[noBC_id(i_ind.s3,nb_part)], + gvelo_loc[noBC_id(i_ind.s4,nb_part)], + gvelo_loc[noBC_id(i_ind.s5,nb_part)], + gvelo_loc[noBC_id(i_ind.s6,nb_part)], + gvelo_loc[noBC_id(i_ind.s7,nb_part)]); + vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s1,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s2,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s3,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s4,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s5,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s6,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]); + p = fma(mix(v,vp,y),dt,coord); + + s = vload8((i + gidY*WIDTH)/8, pscal); + ind = 
convert_int8_rtn(p * invdx); + y = (p - convert_float8(ind) * dx) * invdx; + index8 = convert_uint8((ind - 3 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (eta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (eta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (eta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (eta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (eta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (eta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (eta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (eta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (zeta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (zeta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (zeta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (zeta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (zeta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (zeta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (zeta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (zeta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += 
(theta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (theta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (theta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (theta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (theta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (theta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (theta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (theta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (iota(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (iota(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (iota(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (iota(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (iota(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (iota(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (iota(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (iota(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*8; i<WIDTH; i+=(WGNADVECANDREMESH*8)) + vstore8((float8)(gscal_loc[noBC_id(i,nb_part)], + gscal_loc[noBC_id(i+1,nb_part)], + gscal_loc[noBC_id(i+2,nb_part)], + gscal_loc[noBC_id(i+3,nb_part)], + gscal_loc[noBC_id(i+4,nb_part)], + gscal_loc[noBC_id(i+5,nb_part)], + gscal_loc[noBC_id(i+6,nb_part)], + gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..45731a83edb2e3b018ead2419b63379bac7aa900 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl @@ -0,0 +1,115 @@ + +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[i] = v.x; + gvelo_loc[i+1] = v.y; + gvelo_loc[i+2] = v.z; + gvelo_loc[i+3] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = vload4(i/4, gvelo_loc); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y], + gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]); + vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y], + gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + barrier(CLK_LOCAL_MEM_FENCE); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + 
gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (beta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (theta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (iota(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..dd942553dd0f23743da88ca7da5c40347bdb210d --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl @@ -0,0 +1,88 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNADVECANDREMESH+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGNADVECANDREMESH; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i, nb_part)] = v.x; + gvelo_loc[noBC_id(i+1, nb_part)] = v.y; + gvelo_loc[noBC_id(i+2, nb_part)] = v.z; + gvelo_loc[noBC_id(i+3, nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + 
gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNADVECANDREMESH*4)) + vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl index 895250723bdd1eaa04357817122fd5205eb10e1c..7347bb0c5ef9e4fa13ecda9272d6bf5e0e7699fc 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_noBC.cl @@ -7,7 +7,7 @@ * @return Index in a 2D space. 
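 * Each work-item handles nb_part = WIDTH/WGNREMESH consecutive particles; mapping index id to
 * (id%nb_part)*WGNREMESH + (id/nb_part) interleaves them so that the work-items of a work-group
 * address local memory WGNREMESH entries apart rather than contiguously.
 * (WGNREMESH, like WIDTH, is presumably supplied as a compile-time definition when the kernel is built.)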
*/ inline uint noBC_id(int id, int nb_part){ - return (id%nb_part)*WGN+(id/nb_part); + return (id%nb_part)*WGNREMESH+(id/nb_part); } /** @@ -27,13 +27,13 @@ __kernel void remeshing(__global const float* ppos, uint gidY = get_global_id(1); float invdx = 1.0/dx; int4 ind; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNREMESH; float4 p, s, y; uint4 index4; __local float gscal_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) { gscal_loc[i] = 0.0; gscal_loc[i+1] = 0.0; @@ -85,6 +85,6 @@ __kernel void remeshing(__global const float* ppos, barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); } diff --git a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl index bd831d5abf3d791b984c67d3e4dee9d3b447f004..6e63f6408e790973f3a0e1155f64f02a4c5a349f 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4.cl @@ -7,13 +7,13 @@ __kernel void remeshing(__global const float* ppos, uint gidY = get_global_id(1); float invdx = 1.0/dx; int4 ind; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNREMESH; float4 p, s, y, gs; uint4 index4; __local float gscal_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) { gscal_loc[i] = 0.0; gscal_loc[i+1] = 0.0; @@ -71,6 +71,6 @@ __kernel void remeshing(__global const float* ppos, barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH)/4, gscal); } diff --git a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl index 014edd31671f10731a58ba826b876bb5bf612fa1..9e9447359fdd45e8d5e411b35276c372f7816fc6 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing_2D_opt4_private4_noBC.cl @@ -1,6 +1,6 @@ inline uint noBC_id(int id, int nb_part){ - return (id%nb_part)*WGN+(id/nb_part); + return (id%nb_part)*WGNREMESH+(id/nb_part); } __kernel void remeshing(__global const float* ppos, __global const float* pscal, @@ -10,13 +10,13 @@ __kernel void remeshing(__global const float* ppos, uint gidY = get_global_id(1); float invdx = 1.0/dx; int4 ind; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNREMESH; float4 p, s, y, gs; uint4 index4; __local float gscal_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) { gscal_loc[i] = 0.0; gscal_loc[i+1] = 0.0; @@ -74,6 +74,6 @@ __kernel void remeshing(__global const float* ppos, barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH)/4, gscal); } diff --git a/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl index eb67791af74bdfb46e79c38c87fcc216d5fc6032..922a8fe28e19c354330364dc0e6b5fc05f6315cd 100644 --- 
a/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing_3D_opt4_private4.cl @@ -8,13 +8,13 @@ __kernel void remeshing(__global const float* ppos, uint gidZ = get_global_id(2); float invdx = 1.0/dx; int4 ind; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/WGNREMESH; float4 p, s, y, gs; uint4 index4; __local float gscal_loc[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) { gscal_loc[i] = 0.0; gscal_loc[i+1] = 0.0; @@ -72,7 +72,7 @@ __kernel void remeshing(__global const float* ppos, barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) vstore4((float4)(gscal_loc[i], gscal_loc[i+1], gscal_loc[i+2], diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m4prime_2D_opt4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_2D_opt4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..ed9edaeb09bec2acf955d41e7fe2f9f0efe0367f --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_2D_opt4_noBC.cl @@ -0,0 +1,78 @@ +/** + * Local memory as a 2D array to avoid bank conflicts. + * + * @param id Index 1D + * @param nb_part Particle number per work-item + * + * @return Index in a 2D space. + */ +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNREMESH+(id/nb_part); +} + +/** + * 2D Remeshing kernel. + * + * @param ppos Particle position + * @param pscal Particle scalar + * @param gscal Grid scalar + * @param min_position Domain lower point + * @param dx Space step + */ +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGNREMESH; + float4 p, s, y; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH); // i-1 + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+1 + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+2 + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + 
gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m4prime_3D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_3D_opt4_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..c30200bb883bfcb34181236bda74d87a44ab524a --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/remeshing_m4prime_3D_opt4_private4.cl @@ -0,0 +1,67 @@ + +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGNREMESH; + float4 p, s, y, gs; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 1 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[index4.x]+= gs.x; + gscal_loc[index4.y]+= gs.y; + gscal_loc[index4.z]+= gs.z; + gscal_loc[index4.w]+= gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + vstore4((float4)(gscal_loc[i], + gscal_loc[i+1], + gscal_loc[i+2], + gscal_loc[i+3]), + (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m8prime_2D_opt4_noBC.cl b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_2D_opt4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..65ef443d956024b80b433dbc6560c910828f27a6 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_2D_opt4_noBC.cl @@ -0,0 +1,102 @@ +/** + * Local memory as a 2D array to avoid bank conflicts. + * + * @param id Index 1D + * @param nb_part Particle number per work-item + * + * @return Index in a 2D space. + */ +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGNREMESH+(id/nb_part); +} + +/** + * 2D Remeshing kernel. 
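+ *
+ * Each work-item remeshes nb_part = WIDTH/WGNREMESH particles of one line:
+ * the local line gscal_loc is cleared, each particle scatters its scalar
+ * onto the 8 grid points i-3 .. i+4 with the M8prime weights (indices
+ * wrapped periodically, accumulations separated by local barriers), and
+ * the line is finally written back to gscal with vstore4. Local memory is
+ * indexed through noBC_id to avoid bank conflicts.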
+ * + * @param ppos Particle position + * @param pscal Particle scalar + * @param gscal Grid scalar + * @param min_position Domain lower point + * @param dx Space step + */ +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGNREMESH; + float4 p, s, y; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); // i-3 + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i-2 + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i-1 + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; //i+1 + gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+2 + gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+3 + gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+4 + gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + 
vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing_m8prime_3D_opt4_private4.cl b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_3D_opt4_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..b983724dadbb652ce735579feacc61456f3bdf1a --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/remeshing_m8prime_3D_opt4_private4.cl @@ -0,0 +1,95 @@ + +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGNREMESH; + float4 p, s, y, gs; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[index4.x]+= gs.x; + gscal_loc[index4.y]+= gs.y; + gscal_loc[index4.z]+= gs.z; + gscal_loc[index4.w]+= gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (theta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (iota(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGNREMESH*4)) + vstore4((float4)(gscal_loc[i], + gscal_loc[i+1], + gscal_loc[i+2], + gscal_loc[i+3]), + (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime.cl new file mode 100644 index 0000000000000000000000000000000000000000..7bac233787cd3e32f336b5569a86140d1d72a11c --- /dev/null +++ 
b/HySoP/hysop/gpu/cl_src/weights_m4prime.cl
@@ -0,0 +1,16 @@
+/**
+ * M4prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ */
+inline float alpha(float y, float s){
+  return (y * (y * (-0.5 * y + 1.0) - 0.5)) * s;}
+inline float beta(float y, float s){
+  return (y * y * (1.5 * y - 2.5) + 1.0) * s;}
+inline float gamma(float y, float s){
+  return (y * (y * (-1.5 * y + 2.0) + 0.5)) * s;}
+inline float delta(float y, float s){
+  return (y * y * (0.5 * y - 0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..9d1e63caf52ccacecb6567c8cd8be7025998748b
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime_builtin.cl
@@ -0,0 +1,9 @@
+
+inline float alpha(float y, float s){
+  return (y * fma(y, fma(y, -0.5, 1.0), -0.5)) * s;}
+inline float beta(float y, float s){
+  return fma(y * y, fma(1.5, y, -2.5), 1.0) * s;}
+inline float gamma(float y, float s){
+  return (y * fma(y, fma(-1.5, y, 2.0), 0.5)) * s;}
+inline float delta(float y, float s){
+  return (y * y * fma(0.5, y, -0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4.cl
new file mode 100644
index 0000000000000000000000000000000000000000..6ead89d937370b08319d14a6e17a942e2390cfe5
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4.cl
@@ -0,0 +1,9 @@
+
+inline float4 alpha(float4 y, float4 s){
+  return (y * (y * (-0.5 * y + 1.0) - 0.5)) * s;}
+inline float4 beta(float4 y, float4 s){
+  return (y * y * (1.5 * y - 2.5) + 1.0) * s;}
+inline float4 gamma(float4 y, float4 s){
+  return (y * (y * (-1.5 * y + 2.0) + 0.5)) * s;}
+inline float4 delta(float4 y, float4 s){
+  return (y * y * (0.5 * y - 0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..43d6a2806bfc21238d9f2e1aec1f809342905322
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m4prime_opt4_builtin.cl
@@ -0,0 +1,9 @@
+
+inline float4 alpha(float4 y, float4 s){
+  return (y * fma(y, fma(y, -0.5, 1.0), -0.5)) * s;}
+inline float4 beta(float4 y, float4 s){
+  return fma(y * y, fma(1.5, y, -2.5), 1.0) * s;}
+inline float4 gamma(float4 y, float4 s){
+  return (y * fma(y, fma(-1.5, y, 2.0), 0.5)) * s;}
+inline float4 delta(float4 y, float4 s){
+  return (y * y * fma(0.5, y, -0.5)) * s;}
diff --git a/HySoP/hysop/gpu/cl_src/weights_m8prime_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m8prime_builtin.cl
new file mode 100644
index 0000000000000000000000000000000000000000..94d393d333c91e7404f2c7e4ce8a58b2683a8189
--- /dev/null
+++ b/HySoP/hysop/gpu/cl_src/weights_m8prime_builtin.cl
@@ -0,0 +1,33 @@
+/**
+ * M8prime weights.
+ *
+ * @param y Distance between the particle and left-hand grid point
+ * @param s Scalar of the particle
+ *
+ * @return weights
+ *
+ * Use OpenCL fma builtin function.
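+ *
+ * The eight functions below are the pieces of the M8prime kernel on [0,1),
+ * each polynomial evaluated in Horner form through nested fma calls.
+ * Sanity check: for any y in [0,1) the weights alpha..iota should sum to s,
+ * so the remeshing step conserves the total amount of scalar.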
+ */ +inline float alpha(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;} + +inline float beta(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;} + +inline float gamma(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;} + +inline float delta(float y, float s){ + return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;} + +inline float eta(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;} + +inline float zeta(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;} + +inline float theta(float y, float s){ + return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;} + +inline float iota(float y, float s){ + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;} diff --git a/HySoP/hysop/gpu/cl_src/weights_m8prime_opt4_builtin.cl b/HySoP/hysop/gpu/cl_src/weights_m8prime_opt4_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..90f28e492a222e5cadf18f25227db59e3c8a51a4 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/weights_m8prime_opt4_builtin.cl @@ -0,0 +1,33 @@ +/** + * M8prime weights. + * + * @param y Distance between the particle and left-hand grid point + * @param s Scalar of the particle + * + * @return weights + * + * Use OpenCL fma builtin function, computed by float4. 
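+ *
+ * Same polynomials as weights_m8prime_builtin.cl, applied component-wise on
+ * float4 so a single call returns the weights of four particles at once.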
+ */ +inline float4 alpha(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;} + +inline float4 beta(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;} + +inline float4 gamma(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;} + +inline float4 delta(float4 y, float4 s){ + return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;} + +inline float4 eta(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;} + +inline float4 zeta(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;} + +inline float4 theta(float4 y, float4 s){ + return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;} + +inline float4 iota(float4 y, float4 s){ + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;} diff --git a/HySoP/hysop/gpu/tools.py b/HySoP/hysop/gpu/tools.py index d3a12e0f5fb8ab4a3894083dfec9d63f1bc7fbf4..5b5d8c115be72129b0829820337ba60b3ac3b270 100644 --- a/HySoP/hysop/gpu/tools.py +++ b/HySoP/hysop/gpu/tools.py @@ -100,7 +100,7 @@ def _check_double_precision_capability(device, prec): 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: try: - if eval('(sdevice.double_fp_config &' + + if eval('(device.double_fp_config &' + ' cl.device_fp_config.' + v + ') == cl.device_fp_config.' + v): print v diff --git a/HySoP/hysop/numerics/gpu_kernel.py b/HySoP/hysop/numerics/gpu_kernel.py index 144e520ec5fefd2830c889317020c3a77d6cb7b5..16459e4f3363894cc524912ae620ce2e60b1ef53 100644 --- a/HySoP/hysop/numerics/gpu_kernel.py +++ b/HySoP/hysop/numerics/gpu_kernel.py @@ -38,9 +38,11 @@ class KernelListLauncher: @param args : kernel arguments. @return OpenCL Event - OpenCL global size and local sizes are not given in args. Class member are used. + OpenCL global size and local sizes are not given in + args. Class member are used. """ - return KernelListLauncher.launch_sizes_in_args(self, d, self.global_size[d], self.local_size[d], *args) + return KernelListLauncher.launch_sizes_in_args( + self, d, self.global_size[d], self.local_size[d], *args) def launch_sizes_in_args(self, d, *args): """ @@ -65,14 +67,16 @@ class KernelListLauncher: if d is not None: return self.kernel[d].get_info(cl.kernel_info.FUNCTION_NAME) else: - return [k.get_info(cl.kernel_info.FUNCTION_NAME) for k in self.kernel] + return [k.get_info(cl.kernel_info.FUNCTION_NAME) + for k in self.kernel] class KernelLauncher(KernelListLauncher): """ OpenCL kernel launcher. - Manage launching of one OpenCL kernel as a KernelListLauncher with a list of one kernel. + Manage launching of one OpenCL kernel as a KernelListLauncher + with a list of one kernel. """ def __init__(self, kernel, queue, gsize, lsize): """ @@ -109,7 +113,8 @@ class KernelLauncher(KernelListLauncher): @param args : kernel arguments. @return OpenCL Event - OpenCL global size and local sizes are not given in args. Class member are used. 
+ OpenCL global size and local sizes are not given in args. + Class member are used. """ return KernelListLauncher.launch(self, 0, *args) diff --git a/HySoP/hysop/operator/advection.py b/HySoP/hysop/operator/advection.py index 6304e9421a28f9dd87146b820c0aa9fbe9520794..c695657c73bc51aab90be11cf08c3a1c939701e7 100644 --- a/HySoP/hysop/operator/advection.py +++ b/HySoP/hysop/operator/advection.py @@ -42,16 +42,16 @@ class Advection(Operator): if self.discreteOperator is None: if self.method.find('gpu_2k') >= 0: from parmepy.operator.gpu_particle_advection_2k \ - import GPUParticleAdvection + import GPUParticleAdvection2k self.discreteOperator = \ - GPUParticleAdvection(self, method=self.method, - **self.config) + GPUParticleAdvection2k(self, method=self.method, + **self.config) elif self.method.find('gpu_1k') >= 0: from parmepy.operator.gpu_particle_advection_1k \ - import GPUParticleAdvection + import GPUParticleAdvection1k self.discreteOperator = \ - GPUParticleAdvection(self, method=self.method, - **self.config) + GPUParticleAdvection1k(self, method=self.method, + **self.config) elif self.method.find('scales') >= 0: from parmepy.operator.scales_advection \ import ScalesAdvection diff --git a/HySoP/hysop/operator/continuous.py b/HySoP/hysop/operator/continuous.py index e59ccc948d99e4ecd58626ef0f234da7f314c7c0..67d814d8875db3a100f4dd73df7091dc977f5693 100644 --- a/HySoP/hysop/operator/continuous.py +++ b/HySoP/hysop/operator/continuous.py @@ -42,6 +42,13 @@ class Operator(object): for v in self.variables: v.discretize(self.resolutions[v]) + def finalize(self): + """ + Method called on simulation finalization. + Clean operator at time loop end. + """ + self.discreteOperator.finalize() + def apply(self, *args): """ Apply operator. diff --git a/HySoP/hysop/operator/discrete.py b/HySoP/hysop/operator/discrete.py index 26e94fe1214938ba8736d64d924e5c682ae68277..81931cf1604a567068fa4edc5733877a2e0a02f7 100644 --- a/HySoP/hysop/operator/discrete.py +++ b/HySoP/hysop/operator/discrete.py @@ -48,6 +48,13 @@ class DiscreteOperator(object): raise NotImplementedError("Need to override method in " + "a subclass of DiscreteOperator") + def finalize(self): + """ + Method called on simulation finalization. + Clean operator at time loop end. + """ + pass + def __str__(self): """ToString method""" s = "Applying on following fields : \n" diff --git a/HySoP/hysop/operator/gpu_particle_advection.py b/HySoP/hysop/operator/gpu_particle_advection.py new file mode 100644 index 0000000000000000000000000000000000000000..93076531e6d42313da50ca26a7eece11bdfba895 --- /dev/null +++ b/HySoP/hysop/operator/gpu_particle_advection.py @@ -0,0 +1,319 @@ +""" +@package parmepy.operator.gpu_particle_advection_1k + +Discrete advection representation +""" +from abc import ABCMeta, abstractmethod +from parmepy.constants import np +from parmepy.operator.particle_advection import ParticleAdvection +from parmepy.fields.continuous import Field +from parmepy.fields.gpu_discrete import GPUVectorField +from parmepy.fields.gpu_discrete import GPUScalarField +from parmepy.gpu import PARMES_REAL_GPU, PARMES_DOUBLE_GPU, \ + PARMES_INTEGER_GPU, GPU_SRC, cl +from parmepy.gpu.tools import get_opencl_environment, \ + _check_double_precision_capability +from parmepy.numerics.gpu_kernel import KernelLauncher +from parmepy.numerics.gpu_kernel import KernelListLauncher +from parmepy.numerics.splitting import Splitting + + +class GPUParticleAdvection(ParticleAdvection): + """ + Particle advection operator representation on GPU. 
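+
+    Abstract base class gathering what is common to the GPU advection
+    operators: OpenCL environment and work-item sizing, copy and
+    transposition kernel sources, program build, buffer initialisation and
+    memory release in finalize(). Subclasses (GPUParticleAdvection1k,
+    GPUParticleAdvection2k) supply their own advection/remeshing kernel
+    sources, buffer allocations and kernel affectations.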
+ + """ + __metaclass__ = ABCMeta + + @abstractmethod + def __init__(self, advec, + platform_id=0, device_id=0, + device_type='gpu', + method='', src=None, precision=PARMES_REAL_GPU): + """ + Create a Advection operator. + Work on a given scalar at a given velocity to produce scalar + distribution at new positions. + + @param advec : Transport operator + @param idVelocityD : Index of velocity discretisation to use. + @param idScalarD : Index of scalar discretisation to use. + @param method : the method to use. + """ + ParticleAdvection.__init__(self, advec) + self.name = "advection_gpu" + self.method = method + self.user_gpu_src = src + self.gpu_precision = precision + self.num_method = None + self.resolution = self.scalar.topology.mesh.resolution + self.workItemNumber = None + self.platform, self.device, self.ctx, self.queue = \ + get_opencl_environment(platform_id, device_id, device_type) + + def _set_WorkItems(self, vector_width=1): + ## Optimal work item number + if len(self.resolution) == 3: + workItemNumber = 64 if min(self.resolution) >= 64 \ + else min(self.resolution) + else: + workItemNumber = 256 if min(self.resolution) >= 256 \ + else min(self.resolution) + ## Change work-item regarding vector_width + if workItemNumber * vector_width > self.resolution[0]: + if self.resolution[0] % vector_width > 0: + raise ValueError( + "Resolution ({0}) must be a multiple of {1}".format( + self.resolution[0], vector_width)) + workItemNumber = self.resolution[0] // vector_width + if len(self.resolution) == 3: + gwi = (int(workItemNumber), + int(self.resolution[1]), int(self.resolution[2])) + lwi = (int(workItemNumber), 1, 1) + else: + gwi = (int(workItemNumber), + int(self.resolution[1])) + lwi = (int(workItemNumber), 1) + return workItemNumber, gwi, lwi + + def setUp(self): + self.workItemNumber, self.gwi, self.lwi = self._set_WorkItems() + print "Work-items basic config : ", + print self.workItemNumber, self.gwi, self.lwi + dim = self.scalar.topology.dim + self.coord_min = np.ones(4, dtype=self.gpu_precision) + self.mesh_size = np.ones(4, dtype=self.gpu_precision) + self.coord_min[0:dim] = np.asarray(self.scalar.topology.mesh.origin, + dtype=self.gpu_precision) + self.mesh_size[0:dim] = np.asarray( + self.scalar.topology.mesh.space_step, + dtype=self.gpu_precision) + print "=== Kernel sources ===" + self.build_options = "" + if _check_double_precision_capability(self.device, + self.gpu_precision): + self.build_options += " -cl-fp32-correctly-rounded-divide-sqrt " + if self.gpu_precision is PARMES_REAL_GPU: + self.build_options += " -cl-single-precision-constant" + self.build_options += " -D WIDTH=" + str(self.resolution[0]) + self.build_options += " -D WGN=" + str(self.workItemNumber) + self.gpu_src = self._collect_usr_cl_src(self.user_gpu_src) + self.gpu_src += self._collect_kernels_cl_src() + print "===\n" + print "=== Kernel sources compiling ===" + ## Build code + if self.gpu_precision is PARMES_REAL_GPU: + prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f')) + else: + prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double')) + ## OpenCL program + print self.build_options + self.prg = prg.build(self.build_options) + print "Compiler options : ", + print self.prg.get_build_info( + self.device, cl.program_build_info.OPTIONS) + print "Compiler status : ", + print self.prg.get_build_info( + self.device, cl.program_build_info.STATUS) + print "Compiler log : ", + print self.prg.get_build_info(self.device, cl.program_build_info.LOG) + print "===\n" + print "=== OpenCL Buffer 
allocations ===" + self._buffer_allocations() + print "===\n" + print "=== OpenCL Buffer initialisation ===" + self._buffer_initialisations() + print "===\n" + print "=== OpenCL Kernels affectation ===" + self._kernel_affectations() + print "===\n" + self.numMethod = Splitting(self._apply_in_dir, self.scalar.dimension) + print "Method used:", self.name + + def _buffer_initialisations(self): + data, transfer_time, compute_time = 0, 0., 0. + for gpudf in self.variables: + match = 'init' + gpudf.name + initKernel = None + # Looking for initKernel + for k in self.prg.all_kernels(): + k_name = k.get_info(cl.kernel_info.FUNCTION_NAME) + if match.find(k_name) >= 0: + initKernel = cl.Kernel(self.prg, k_name) + if initKernel is not None: + kl = KernelLauncher(initKernel, self.queue, + self.gwi, self.lwi) + if gpudf.vector: + args = [gpudf.gpu_data[d] + for d in xrange(len(self.resolution))] + args.append(self.coord_min) + args.append(self.mesh_size) + evt = kl.launch(*tuple(args)) + else: + args = [gpudf.gpu_data] + args.append(self.coord_min) + args.append(self.mesh_size) + evt = kl.launch(*tuple(args)) + self.queue.finish() + temp_time = (evt.profile.end - evt.profile.start) * 1e-9 + print "Done in ", temp_time, "sec" + compute_time += temp_time + gpudf.contains_data = False + gpudf.data_on_device = True + else: + if gpudf.contains_data: + print gpudf.name, ":", + temp_data, temp_time = gpudf.toDevice() + data += temp_data + transfer_time += temp_time + if data > 0: + print "Total Transfers : ", data, "Bytes transfered at ", + print "{0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time) + if compute_time > 0.: + print "Total Computing : ", compute_time, "sec" + + def _collect_kernels_cl_src_copy(self): + gpu_src = "" + ## copy + if len(self.resolution) == 3: + if self.gpu_precision is PARMES_REAL_GPU: + src_copy = 'copy_3D_opt4.cl' + self.build_options += " -D TILE_DIM_COPY=16" + self.build_options += " -D BLOCK_ROWS_COPY=8" + self.copy_gwi = (int(self.resolution[0] / 4), + int(self.resolution[1] / 2), + int(self.resolution[2])) + self.copy_lwi = (4, 8, 1) + else: + src_copy = 'copy_3D_locMem.cl' + self.build_options += " -D TILE_DIM_COPY=32" + self.build_options += " -D BLOCK_ROWS_COPY=8" + self.copy_gwi = (int(self.resolution[0]), + int(self.resolution[1] / 4), + int(self.resolution[2])) + self.copy_lwi = (32, 8, 1) + else: + if self.gpu_precision is PARMES_REAL_GPU: + src_copy = 'copy_2D_opt2.cl' + self.build_options += " -D TILE_DIM_COPY=16" + self.build_options += " -D BLOCK_ROWS_COPY=8" + self.copy_gwi = (int(self.resolution[0] / 2), + int(self.resolution[1] / 2)) + self.copy_lwi = (8, 8) + else: + src_copy = 'copy_2D_opt2.cl' + self.build_options += " -D TILE_DIM_COPY=32" + self.build_options += " -D BLOCK_ROWS_COPY=2" + self.copy_gwi = (int(self.resolution[0] / 2), + int(self.resolution[1] / 16)) + self.copy_lwi = (16, 2) + f = open(GPU_SRC + src_copy) + print " - copy:", f.name + gpu_src += "".join(f.readlines()) + f.close() + print " - config :", self.copy_gwi, self.copy_lwi + return gpu_src + + def _collect_kernels_cl_src_transpositions(self): + gpu_src = "" + ## transpose + if len(self.resolution) == 3: + if self.gpu_precision is PARMES_REAL_GPU: + src_transpose_xy = \ + 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl' + self.build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8" + self.transpose_xy_gwi = (int(self.resolution[0] / 2), + int(self.resolution[1] / 2), + int(self.resolution[2])) + self.transpose_xy_lwi = (8, 8, 1) + src_transpose_xz = \ + 
'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl' + self.build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" + self.transpose_xz_gwi = (int(self.resolution[0]), + int(self.resolution[1] / 4), + int(self.resolution[2] / 4)) + self.transpose_xz_lwi = (16, 4, 4) + else: + src_transpose_xy = \ + 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl' + self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4" + self.transpose_xy_gwi = (int(self.resolution[0] / 4), + int(self.resolution[1] / 8), + int(self.resolution[2])) + self.transpose_xy_lwi = (8, 4, 1) + src_transpose_xz = \ + 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl' + self.build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2" + self.transpose_xz_gwi = (int(self.resolution[0]), + int(self.resolution[1] / 4), + int(self.resolution[2] / 4)) + self.transpose_xz_lwi = (8, 2, 2) + else: + if self.gpu_precision is PARMES_REAL_GPU: + src_transpose_xy = \ + 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' + self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" + self.transpose_xy_gwi = (int(self.resolution[0] / 4), + int(self.resolution[1]) / 4) + self.transpose_xy_lwi = (8, 8) + else: + src_transpose_xy = \ + 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' + self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2" + self.transpose_xy_gwi = (int(self.resolution[0] / 4), + int(self.resolution[1]) / 16) + self.transpose_xy_lwi = (8, 2) + f = open(GPU_SRC + src_transpose_xy) + print " - transpose_xy:", f.name + gpu_src += "".join(f.readlines()) + f.close() + print " - config :", self.transpose_xy_gwi, self.transpose_xy_lwi + if len(self.resolution) == 3: + print " - transpose_xz:", f.name + f = open(GPU_SRC + src_transpose_xz) + gpu_src += "".join(f.readlines()) + f.close() + print " - config :", + print self.transpose_xz_gwi, self.transpose_xz_lwi + return gpu_src + + def _collect_usr_cl_src(self, usr_src): + print "usr src", usr_src + gpu_src = "" + if usr_src is not None: + if isinstance(usr_src, list): + for src in usr_src: + print "Sources files (user): ", src + f = open(src, 'r') + gpu_src += "".join(f.readlines()) + f.close() + else: + print "Sources files (user): ", usr_src + f = open(usr_src, 'r') + gpu_src += "".join(f.readlines()) + f.close() + return gpu_src + + def apply(self, t, dt, ite): + self.numMethod(t, dt) + + def finalize(self): + """ + Free OpenCL memory. + """ + for gpudf in self.variables: + print "deallocate :", gpudf.name + if gpudf.vector: + [gpudf.gpu_data[d].release() + for d in xrange(len(self.resolution))] + else: + gpudf.gpu_data.release() + + def __str__(self): + s = "Advection_P (DiscreteOperator). 
" + DiscreteOperator.__str__(self) + return s + +if __name__ == "__main__": + print __doc__ + print "- Provided class : Transport_d" + print AdvectionDOp.__doc__ diff --git a/HySoP/hysop/operator/gpu_particle_advection_1k.py b/HySoP/hysop/operator/gpu_particle_advection_1k.py index 3cc58a390ccd67e825447516971b863625e1d9d6..8a24071199d464c50fd43e1625f2dc595b630672 100644 --- a/HySoP/hysop/operator/gpu_particle_advection_1k.py +++ b/HySoP/hysop/operator/gpu_particle_advection_1k.py @@ -4,7 +4,7 @@ Discrete advection representation """ from parmepy.constants import np -from parmepy.operator.particle_advection import ParticleAdvection +from parmepy.operator.gpu_particle_advection import GPUParticleAdvection from parmepy.fields.continuous import Field from parmepy.fields.gpu_discrete import GPUVectorField from parmepy.fields.gpu_discrete import GPUScalarField @@ -17,11 +17,10 @@ from parmepy.numerics.gpu_kernel import KernelListLauncher from parmepy.numerics.splitting import Splitting -class GPUParticleAdvection(ParticleAdvection): +class GPUParticleAdvection1k(GPUParticleAdvection): """ Particle advection operator representation on GPU. - Advection and Remeshing are performed in a single kernel. """ def __init__(self, advec, @@ -38,83 +37,21 @@ class GPUParticleAdvection(ParticleAdvection): @param idScalarD : Index of scalar discretisation to use. @param method : the method to use. """ - ParticleAdvection.__init__(self, advec) - self.name = "advection_gpu" - self.method = method - self.user_gpu_src = src - self.gpu_precision = precision - self.num_method = None - self.resolution = self.scalar.topology.mesh.resolution - self.compute_time_remesh = [0., 0., 0.] - self.platform, self.device, self.ctx, self.queue = \ - get_opencl_environment(platform_id, device_id, device_type) + GPUParticleAdvection.__init__(self, advec, + platform_id, device_id, + device_type, + method, src, + precision) def setUp(self): - ## Optimal work item number - if len(self.resolution) == 3: - self.workItemNumber = 64 if min(self.resolution) >= 64 \ - else min(self.resolution) - else: - self.workItemNumber = 256 if min(self.resolution) >= 256 \ - else min(self.resolution) - print " Work-Item number: ", self.workItemNumber - dim = self.scalar.topology.dim - self.coord_min = np.ones(4, dtype=self.gpu_precision) - self.mesh_size = np.ones(4, dtype=self.gpu_precision) - self.coord_min[0:dim] = np.asarray(self.scalar.topology.mesh.origin, - dtype=self.gpu_precision) - self.mesh_size[0:dim] = np.asarray( - self.scalar.topology.mesh.space_step, - dtype=self.gpu_precision) - if len(self.resolution) == 3: - self.gwi = (int(self.workItemNumber), - int(self.resolution[1]), (self.resolution[2])) - self.lwi = (int(self.workItemNumber), 1, 1) - else: - self.gwi = (int(self.workItemNumber), - int(self.resolution[1])) - self.lwi = (int(self.workItemNumber), 1) - print "=== Kernel sources ===" - self.build_options = "" - if _check_double_precision_capability(self.device, - self.gpu_precision): - self.build_options += " -cl-fp32-correctly-rounded-divide-sqrt " - if self.gpu_precision is PARMES_REAL_GPU: - self.build_options += " -cl-single-precision-constant" - self.build_options += " -D WIDTH=" + str(self.resolution[0]) - self.build_options += " -D WGN=" + str(self.workItemNumber) - self.gpu_src = self._collect_usr_cl_src(self.user_gpu_src) - self.gpu_src += self._collect_kernels_cl_src() - print "===\n" - print "=== Kernel sources compiling ===" - ## Build code - if self.gpu_precision is PARMES_REAL_GPU: - prg = cl.Program(self.ctx, 
self.gpu_src.replace('.0', '.0f')) - else: - prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double')) - ## OpenCL program - self.prg = prg.build(self.build_options) - print "Compiler options : ", - print self.prg.get_build_info( - self.device, cl.program_build_info.OPTIONS) - print "Compiler status : ", - print self.prg.get_build_info( - self.device, cl.program_build_info.STATUS) - print "Compiler log : ", - print self.prg.get_build_info(self.device, cl.program_build_info.LOG) - print "===\n" - print "=== OpenCL Buffer allocations ===" - self._buffer_allocations() - print "===\n" - print "=== OpenCL Buffer initialisation ===" - self._buffer_initialisations() - print "===\n" - print "=== OpenCL Kernels affectation ===" + GPUParticleAdvection.setUp(self) + + def _kernel_affectations(self): self.num_advec_and_remesh = KernelLauncher( self.prg.advection_and_remeshing, self.queue, - self.gwi, - self.lwi) + self.gwi_advec_and_remesh, + self.lwi_advec_and_remesh) self.init_copy = KernelLauncher(self.prg.copy, self.queue, self.copy_gwi, @@ -134,51 +71,6 @@ class GPUParticleAdvection(ParticleAdvection): self.queue, self.transpose_xy_gwi, self.transpose_xy_lwi) - print "===\n" - self.numMethod = Splitting(self._apply_in_dir, self.scalar.dimension) - print "Method used:", self.name - - def _buffer_initialisations(self): - data, transfer_time, compute_time = 0, 0., 0. - for gpudf in self.variables: - match = 'init' + gpudf.name - initKernel = None - # Looking for initKernel - for k in self.prg.all_kernels(): - k_name = k.get_info(cl.kernel_info.FUNCTION_NAME) - if match.find(k_name) >= 0: - initKernel = cl.Kernel(self.prg, k_name) - if initKernel is not None: - kl = KernelLauncher(initKernel, self.queue, - self.gwi, self.lwi) - if gpudf.vector: - args = [gpudf.gpu_data[d] - for d in xrange(len(self.resolution))] - args.append(self.coord_min) - args.append(self.mesh_size) - evt = kl.launch(*tuple(args)) - else: - args = [gpudf.gpu_data] - args.append(self.coord_min) - args.append(self.mesh_size) - evt = kl.launch(*tuple(args)) - self.queue.finish() - temp_time = (evt.profile.end - evt.profile.start) * 1e-9 - print "Done in ", temp_time, "sec" - compute_time += temp_time - gpudf.contains_data = False - gpudf.data_on_device = True - else: - if gpudf.contains_data: - print gpudf.name, ":", - temp_data, temp_time = gpudf.toDevice() - data += temp_data - transfer_time += temp_time - if data > 0: - print "Total Transfers : ", data, "Bytes transfered at ", - print "{0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time) - if compute_time > 0.: - print "Total Computing : ", compute_time, "sec" def _buffer_allocations(self): ## Velocity. 
@@ -196,7 +88,8 @@ class GPUParticleAdvection(ParticleAdvection): self.queue, particle_scalar.discreteField[0], self.gpu_precision) self.part_scalar = particle_scalar.discreteField[0] - self.variables = [self.scalar, self.velocity, self.part_scalar] + self.variables = [self.scalar, self.velocity, + self.part_scalar] total_mem_used = 0 for gpuv in self.variables: total_mem_used += gpuv.mem_size @@ -207,128 +100,91 @@ class GPUParticleAdvection(ParticleAdvection): def _collect_kernels_cl_src(self): gpu_src = "" + ## advection and remeshing + advec_and_remesh_vector_width = 4 + src_advec_and_remesh = 'advection_and_remesh_' if len(self.resolution) == 3: if self.gpu_precision is PARMES_REAL_GPU: - if self.method.find('m6prime'): + if self.method.find('m4prime') >= 0: + src_advec_and_remesh += \ + 'm4prime_3D_opt4_builtin_private4.cl' + src_remesh_weights = 'weights_m4prime_opt4_builtin.cl' + elif self.method.find('m6prime') >= 0: + src_advec_and_remesh += '3D_opt4_builtin_private4.cl' src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl' - else: + elif self.method.find('m8prime') >= 0: + src_advec_and_remesh += \ + 'm8prime_3D_opt4_builtin_private4.cl' src_remesh_weights = 'weights_m8prime_opt4_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl' + else: + print 'No remeshing fromula specified, use default m6prime' + src_advec_and_remesh += '3D_opt4_builtin_private4.cl' + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' else: - if self.method.find('m6prime'): + if self.method.find('m4prime') >= 0: + src_advec_and_remesh += \ + 'm4prime_3D_opt4_builtin_private4_noBC.cl' + src_remesh_weights = 'weights_m4prime_opt4_builtin.cl' + elif self.method.find('m6prime') >= 0: + src_advec_and_remesh += '3D_opt4_builtin_private4_noBC.cl' src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl' - else: + elif self.method.find('m8prime') >= 0: + src_advec_and_remesh += \ + 'm8prime_3D_opt4_builtin_private4_noBC.cl' src_remesh_weights = 'weights_m8prime_opt4_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl' + else: + print 'No remeshing fromula specified, use default m6prime' + src_advec_and_remesh += '3D_opt4_builtin_private4_noBC.cl' + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' else: if self.gpu_precision is PARMES_REAL_GPU: - if self.method.find('m6prime'): + advec_and_remesh_vector_width = 8 + if self.method.find('m4prime') >= 0: + src_advec_and_remesh += 'm4prime_2D_opt8_builtin_noBC.cl' + src_remesh_weights = 'weights_m4prime_builtin.cl' + elif self.method.find('m6prime') >= 0: + src_advec_and_remesh += '2D_opt8_builtin_noBC.cl' src_remesh_weights = 'weights_m6prime_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl' - else: + elif self.method.find('m8prime') >= 0: + src_advec_and_remesh += 'm8prime_2D_opt8_builtin_noBC.cl' src_remesh_weights = 'weights_m8prime_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl' + else: + print 'No remeshing fromula specified, use default m6prime' + src_advec_and_remesh += '2D_opt8_builtin_noBC.cl' + src_remesh_weights = 'weights_m6prime_builtin.cl' else: - if self.method.find('m6prime'): + if self.method.find('m4prime') >= 0: + src_advec_and_remesh += 'm4prime_2D_opt4_builtin_noBC.cl' + src_remesh_weights = 'weights_m4prime_builtin.cl' 
+ elif self.method.find('m6prime') >= 0: + src_advec_and_remesh += '2D_opt4_builtin_noBC.cl' src_remesh_weights = 'weights_m6prime_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl' - else: + elif self.method.find('m8prime') >= 0: + src_advec_and_remesh += 'm8prime_2D_opt4_builtin_noBC.cl' src_remesh_weights = 'weights_m8prime_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl' + else: + print 'No remeshing fromula specified, use default m6prime' + src_advec_and_remesh += '2D_opt4_builtin_noBC.cl' + src_remesh_weights = 'weights_m6prime_builtin.cl' + self.WINb_advec_and_remesh, self.gwi_advec_and_remesh, \ + self.lwi_advec_and_remesh = self._set_WorkItems( + advec_and_remesh_vector_width) + self.build_options += " -D WGNADVECANDREMESH=" + + self.build_options += str(self.WINb_advec_and_remesh) f = open(GPU_SRC + src_remesh_weights) - print " - remeshing weights:", f.name gpu_src += "".join(f.readlines()) + print " - remeshing weights:", f.name f.close() f = open(GPU_SRC + src_advec_and_remesh) - print " - advection and remesh:", f.name + print " - advection and remeshing:", f.name gpu_src += "".join(f.readlines()) f.close() - ## copy - if len(self.resolution) == 3: - if self.gpu_precision is PARMES_REAL_GPU: - src_copy = 'copy_3D_opt4.cl' - self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8" - self.copy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 2), int(self.resolution[2])) - self.copy_lwi = (4, 8, 1) - else: - src_copy = 'copy_3D_locMem.cl' - self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=8" - self.copy_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2])) - self.copy_lwi = (32, 8, 1) - else: - if self.gpu_precision is PARMES_REAL_GPU: - src_copy = 'copy_2D_opt2.cl' - self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8" - self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2)) - self.copy_lwi = (8, 8) - else: - src_copy = 'copy_2D_opt2.cl' - self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2" - self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 16)) - self.copy_lwi = (16, 2) - f = open(GPU_SRC + src_copy) - print " - copy:", f.name - gpu_src += "".join(f.readlines()) - f.close() - ## transpose - if len(self.resolution) == 3: - if self.gpu_precision is PARMES_REAL_GPU: - src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl' - self.build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8" - self.transpose_xy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2), int(self.resolution[2])) - self.transpose_xy_lwi = (8, 8, 1) - src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl' - self.build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" - self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4)) - self.transpose_xz_lwi = (16, 4, 4) - else: - src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl' - self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4" - self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 8), int(self.resolution[2])) - self.transpose_xy_lwi = (8, 4, 1) - src_transpose_xz = 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl' - self.build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2" - self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4)) - 
self.transpose_xz_lwi = (8, 2, 2) - else: - if self.gpu_precision is PARMES_REAL_GPU: - src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' - self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" - self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 4) - self.transpose_xy_lwi = (8, 8) - else: - src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' - self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2" - self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 16) - self.transpose_xy_lwi = (8, 2) - f = open(GPU_SRC + src_transpose_xy) - print " - transpose_xy:", f.name - gpu_src += "".join(f.readlines()) - f.close() - if len(self.resolution) == 3: - f = open(GPU_SRC + src_transpose_xz) - gpu_src += "".join(f.readlines()) - f.close() + print " - config :", + print self.gwi_advec_and_remesh, self.lwi_advec_and_remesh + gpu_src += self._collect_kernels_cl_src_copy() + gpu_src += self._collect_kernels_cl_src_transpositions() return gpu_src - def _collect_usr_cl_src(self, usr_src): - print "usr src", usr_src - gpu_src = "" - if usr_src is not None: - if isinstance(usr_src, list): - for src in usr_src: - print "Sources files (user): ", src - f = open(src, 'r') - gpu_src += "".join(f.readlines()) - f.close() - else: - print "Sources files (user): ", usr_src - f = open(usr_src, 'r') - gpu_src += "".join(f.readlines()) - f.close() - return gpu_src def _apply_in_dir(self, t, dt, d, split_id): """ @@ -364,16 +220,15 @@ class GPUParticleAdvection(ParticleAdvection): self.part_scalar.gpu_data) else: evt_init = \ - self.init_transpose.launch(1, self.gpu_scalar.gpu_data, + self.init_transpose.launch(1, self.scalar.gpu_data, self.part_scalar.gpu_data) - # Advection and remeshing + # Advection evt_advec = self.num_advec_and_remesh.launch(self.velocity.gpu_data[d], self.part_scalar.gpu_data, self.scalar.gpu_data, self.gpu_precision(dt), self.coord_min[d], self.mesh_size[d]) - self.num_advec_and_remesh.finish() for df in self.output: df.data_on_device = True df.contains_data = False @@ -389,8 +244,6 @@ class GPUParticleAdvection(ParticleAdvection): self.total_time += (c_time_advec + c_time_init) return (c_time_advec + c_time_init) - def apply(self, t, dt, ite): - self.numMethod(t, dt) def printComputeTime(self): self.time_info[0] = "\"Advection total\" " diff --git a/HySoP/hysop/operator/gpu_particle_advection_2k.py b/HySoP/hysop/operator/gpu_particle_advection_2k.py index f94f201a5eb80c1039234e804c19e3f4769ddb51..598eebf5eceb4685c519f4cab0751e499a6e33d3 100644 --- a/HySoP/hysop/operator/gpu_particle_advection_2k.py +++ b/HySoP/hysop/operator/gpu_particle_advection_2k.py @@ -4,7 +4,7 @@ Discrete advection representation """ from parmepy.constants import np -from parmepy.operator.particle_advection import ParticleAdvection +from parmepy.operator.gpu_particle_advection import GPUParticleAdvection from parmepy.fields.continuous import Field from parmepy.fields.gpu_discrete import GPUVectorField from parmepy.fields.gpu_discrete import GPUScalarField @@ -17,7 +17,7 @@ from parmepy.numerics.gpu_kernel import KernelListLauncher from parmepy.numerics.splitting import Splitting -class GPUParticleAdvection(ParticleAdvection): +class GPUParticleAdvection2k(GPUParticleAdvection): """ Particle advection operator representation on GPU. @@ -37,125 +37,25 @@ class GPUParticleAdvection(ParticleAdvection): @param idScalarD : Index of scalar discretisation to use. @param method : the method to use. 
""" - ParticleAdvection.__init__(self, advec) - self.name = "advection_gpu" - self.method = method - self.user_gpu_src = src - self.gpu_precision = precision - self.num_method = None - self.resolution = self.scalar.topology.mesh.resolution + GPUParticleAdvection.__init__(self, advec, + platform_id, device_id, + device_type, + method, src, + precision) self.compute_time_remesh = [0., 0., 0.] - print "=== OpenCL environment ===" - #Get platform. - try: - ## OpenCL platform - self.platform = cl.get_platforms()[platform_id] - except IndexError: - print " Incorrect platform_id :", platform_id, ".", - print " Only ", len(cl.get_platforms()), " available.", - print " Getting defalut platform. " - self.platform = cl.get_platforms()[0] - print " Platform " - print " - Name :", self.platform.name - print " - Version :", self.platform.version - #Get device. - try: - ## OpenCL device - self.device = self.platform.get_devices( - eval("cl.device_type." + device_type.upper()))[device_id] - except cl.RuntimeError as e: - print "RuntimeError:", e - self.device = cl.create_some_context().devices[0] - except AttributeError as e: - print "AttributeError:", e - self.device = cl.create_some_context().devices[0] - print " Device" - print " - Name :", - print self.device.name - print " - Type :", - print cl.device_type.to_string(self.device.type) - print " - C Version :", - print self.device.opencl_c_version - print " - Global mem size :", - print self.device.global_mem_size / (1024 ** 3), "GB" - print "===\n" - #Creates GPU Context - ## OpenCL context - self.ctx = cl.Context([self.device]) - #Create CommandQueue on the GPU Context - ## OpenCL command queue - self.queue = cl.CommandQueue( - self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) def setUp(self): - ## Optimal work item number - if len(self.resolution) == 3: - self.workItemNumber = 64 if min(self.resolution) >= 64 \ - else min(self.resolution) - else: - self.workItemNumber = 256 if min(self.resolution) >= 256 \ - else min(self.resolution) - print " Work-Item number: ", self.workItemNumber - dim = self.scalar.topology.dim - self.coord_min = np.ones(4, dtype=self.gpu_precision) - self.mesh_size = np.ones(4, dtype=self.gpu_precision) - self.coord_min[0:dim] = np.asarray(self.scalar.topology.mesh.origin, - dtype=self.gpu_precision) - self.mesh_size[0:dim] = np.asarray( - self.scalar.topology.mesh.space_step, - dtype=self.gpu_precision) - if len(self.resolution) == 3: - self.gwi = (int(self.workItemNumber), - int(self.resolution[1]), (self.resolution[2])) - self.lwi = (int(self.workItemNumber), 1, 1) - else: - self.gwi = (int(self.workItemNumber), - int(self.resolution[1])) - self.lwi = (int(self.workItemNumber), 1) - print "=== Kernel sources ===" - self.build_options = "" - if _check_double_precision_capability(self.device, - self.gpu_precision): - self.build_options += " -cl-fp32-correctly-rounded-divide-sqrt " - if self.gpu_precision is PARMES_REAL_GPU: - self.build_options += " -cl-single-precision-constant" - self.build_options += " -D WIDTH=" + str(self.resolution[0]) - self.build_options += " -D WGN=" + str(self.workItemNumber) - self.gpu_src = self._collect_usr_cl_src(self.user_gpu_src) - self.gpu_src += self._collect_kernels_cl_src() - print "===\n" - print "=== Kernel sources compiling ===" - ## Build code - if self.gpu_precision is PARMES_REAL_GPU: - prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f')) - else: - prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double')) - ## OpenCL program - self.prg = 
prg.build(self.build_options) - print "Compiler options : ", - print self.prg.get_build_info( - self.device, cl.program_build_info.OPTIONS) - print "Compiler status : ", - print self.prg.get_build_info( - self.device, cl.program_build_info.STATUS) - print "Compiler log : ", - print self.prg.get_build_info(self.device, cl.program_build_info.LOG) - print "===\n" - print "=== OpenCL Buffer allocations ===" - self._buffer_allocations() - print "===\n" - print "=== OpenCL Buffer initialisation ===" - self._buffer_initialisations() - print "===\n" - print "=== OpenCL Kernels affectation ===" + GPUParticleAdvection.setUp(self) + + def _kernel_affectations(self): self.num_advec = KernelLauncher(self.prg.advection, self.queue, - self.gwi, - self.lwi) + self.gwi_advec, + self.lwi_advec) self.num_remesh = KernelLauncher(self.prg.remeshing, self.queue, - self.gwi, - self.lwi) + self.gwi_remesh, + self.lwi_remesh) self.init_copy = KernelLauncher(self.prg.copy, self.queue, self.copy_gwi, @@ -175,51 +75,6 @@ class GPUParticleAdvection(ParticleAdvection): self.queue, self.transpose_xy_gwi, self.transpose_xy_lwi) - print "===\n" - self.numMethod = Splitting(self._apply_in_dir, self.scalar.dimension) - print "Method used:", self.name - - def _buffer_initialisations(self): - data, transfer_time, compute_time = 0, 0., 0. - for gpudf in self.variables: - match = 'init' + gpudf.name - initKernel = None - # Looking for initKernel - for k in self.prg.all_kernels(): - k_name = k.get_info(cl.kernel_info.FUNCTION_NAME) - if match.find(k_name) >= 0: - initKernel = cl.Kernel(self.prg, k_name) - if initKernel is not None: - kl = KernelLauncher(initKernel, self.queue, - self.gwi, self.lwi) - if gpudf.vector: - args = [gpudf.gpu_data[d] - for d in xrange(len(self.resolution))] - args.append(self.coord_min) - args.append(self.mesh_size) - evt = kl.launch(*tuple(args)) - else: - args = [gpudf.gpu_data] - args.append(self.coord_min) - args.append(self.mesh_size) - evt = kl.launch(*tuple(args)) - self.queue.finish() - temp_time = (evt.profile.end - evt.profile.start) * 1e-9 - print "Done in ", temp_time, "sec" - compute_time += temp_time - gpudf.contains_data = False - gpudf.data_on_device = True - else: - if gpudf.contains_data: - print gpudf.name, ":", - temp_data, temp_time = gpudf.toDevice() - data += temp_data - transfer_time += temp_time - if data > 0: - print "Total Transfers : ", data, "Bytes transfered at ", - print "{0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time) - if compute_time > 0.: - print "Total Computing : ", compute_time, "sec" def _buffer_allocations(self): ## Velocity. 
@@ -258,35 +113,57 @@ class GPUParticleAdvection(ParticleAdvection): def _collect_kernels_cl_src(self): gpu_src = "" + print self.method if len(self.resolution) == 3: if self.gpu_precision is PARMES_REAL_GPU: src_advec = 'advection_3D_opt4_builtin.cl' + advec_vector_width = 4 else: src_advec = 'advection_3D_opt2_builtin.cl' + advec_vector_width = 2 else: if self.gpu_precision is PARMES_REAL_GPU: src_advec = 'advection_2D_opt4_builtin.cl' + advec_vector_width = 4 else: src_advec = 'advection_2D_builtin.cl' + advec_vector_width = 1 + self.WINb_advec, self.gwi_advec, \ + self.lwi_advec = self._set_WorkItems(advec_vector_width) + self.build_options += " -D WGNADVEC=" + str(self.WINb_advec) f = open(GPU_SRC + src_advec) print " - advection:", f.name gpu_src += "".join(f.readlines()) f.close() + print " - config :", + print self.gwi_advec, self.lwi_advec ## remeshing + remesh_vector_width = 4 if len(self.resolution) == 3: - if self.method.find('m6prime'): + if self.method.find('m6prime') >= 0: src_remesh = 'remeshing_3D_opt4_private4.cl' src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' - else: + elif self.method.find('m8prime') >= 0: src_remesh = 'remeshing_m8prime_3D_opt4_private4.cl' src_remesh_weights = 'weights_m8prime_opt4_builtin.cl' + else: + print 'No remeshing fromula specified, use default m6prime' + src_remesh = 'remeshing_3D_opt4_private4.cl' + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' else: - if self.method.find('m6prime'): + if self.method.find('m6prime') >= 0: src_remesh = 'remeshing_2D_opt4_noBC.cl' src_remesh_weights = 'weights_m6prime_builtin.cl' - else: + elif self.method.find('m8prime') >= 0: src_remesh = 'remeshing_m8prime_2D_opt4_noBC.cl' src_remesh_weights = 'weights_m8prime_builtin.cl' + else: + print 'No remeshing fromula specified, use default m6prime' + src_remesh = 'remeshing_2D_opt4_noBC.cl' + src_remesh_weights = 'weights_m6prime_builtin.cl' + self.WINb_remesh, self.gwi_remesh, \ + self.lwi_remesh = self._set_WorkItems(remesh_vector_width) + self.build_options += " -D WGNREMESH=" + str(self.WINb_remesh) f = open(GPU_SRC + src_remesh_weights) gpu_src += "".join(f.readlines()) print " - remeshing weights:", f.name @@ -295,90 +172,12 @@ class GPUParticleAdvection(ParticleAdvection): print " - remeshing:", f.name gpu_src += "".join(f.readlines()) f.close() - ## copy - if len(self.resolution) == 3: - if self.gpu_precision is PARMES_REAL_GPU: - src_copy = 'copy_3D_opt4.cl' - self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8" - self.copy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 2), int(self.resolution[2])) - self.copy_lwi = (4, 8, 1) - else: - src_copy = 'copy_3D_locMem.cl' - self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=8" - self.copy_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2])) - self.copy_lwi = (32, 8, 1) - else: - if self.gpu_precision is PARMES_REAL_GPU: - src_copy = 'copy_2D_opt2.cl' - self.build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8" - self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2)) - self.copy_lwi = (8, 8) - else: - src_copy = 'copy_2D_opt2.cl' - self.build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2" - self.copy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 16)) - self.copy_lwi = (16, 2) - f = open(GPU_SRC + src_copy) - print " - copy:", f.name - gpu_src += "".join(f.readlines()) - f.close() - ## transpose - if len(self.resolution) == 3: - if self.gpu_precision is PARMES_REAL_GPU: - 
-                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl'
-                self.build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 2), int(self.resolution[1] / 2), int(self.resolution[2]))
-                self.transpose_xy_lwi = (8, 8, 1)
-                src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl'
-                self.build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4"
-                self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4))
-                self.transpose_xz_lwi = (16, 4, 4)
-            else:
-                src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1] / 8), int(self.resolution[2]))
-                self.transpose_xy_lwi = (8, 4, 1)
-                src_transpose_xz = 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl'
-                self.build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2"
-                self.transpose_xz_gwi = (int(self.resolution[0]), int(self.resolution[1] / 4), int(self.resolution[2] / 4))
-                self.transpose_xz_lwi = (8, 2, 2)
-        else:
-            if self.gpu_precision is PARMES_REAL_GPU:
-                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 4)
-                self.transpose_xy_lwi = (8, 8)
-            else:
-                src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl'
-                self.build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2"
-                self.transpose_xy_gwi = (int(self.resolution[0] / 4), int(self.resolution[1]) / 16)
-                self.transpose_xy_lwi = (8, 2)
-        f = open(GPU_SRC + src_transpose_xy)
-        print " - transpose_xy:", f.name
-        gpu_src += "".join(f.readlines())
-        f.close()
-        if len(self.resolution) == 3:
-            f = open(GPU_SRC + src_transpose_xz)
-            gpu_src += "".join(f.readlines())
-            f.close()
+        print " - config :",
+        print self.gwi_remesh, self.lwi_remesh
+        gpu_src += self._collect_kernels_cl_src_copy()
+        gpu_src += self._collect_kernels_cl_src_transpositions()
         return gpu_src
 
-    def _collect_usr_cl_src(self, usr_src):
-        print "usr src", usr_src
-        gpu_src = ""
-        if usr_src is not None:
-            if isinstance(usr_src, list):
-                for src in usr_src:
-                    print "Sources files (user): ", src
-                    f = open(src, 'r')
-                    gpu_src += "".join(f.readlines())
-                    f.close()
-            else:
-                print "Sources files (user): ", usr_src
-                f = open(usr_src, 'r')
-                gpu_src += "".join(f.readlines())
-                f.close()
-        return gpu_src
 
     def _apply_in_dir(self, t, dt, d, split_id):
         """
@@ -414,7 +213,7 @@ class GPUParticleAdvection(ParticleAdvection):
                                            self.part_scalar.gpu_data)
         else:
             evt_init = \
-                self.init_transpose.launch(1, self.gpu_scalar.gpu_data,
+                self.init_transpose.launch(1, self.scalar.gpu_data,
                                            self.part_scalar.gpu_data)
         # Advection
         evt_advec = self.num_advec.launch(self.velocity.gpu_data[d],
@@ -443,12 +242,11 @@ class GPUParticleAdvection(ParticleAdvection):
         if split_id == 0:
             self.compute_time_copy[d] += c_time_init
         else:
-            self.compute_time_swap[min(split_id, d)] += c_time_init
+            if self.scalar.topology.dim == 2:
+                self.compute_time_swap[0] += c_time_init
         self.total_time += (c_time_advec + c_time_remesh + c_time_init)
         return (c_time_advec + c_time_remesh + c_time_init)
 
-    def apply(self, t, dt, ite):
-        self.numMethod(t, dt)
 
     def printComputeTime(self):
         self.time_info[0] = "\"Advection total\" "
diff --git a/HySoP/hysop/operator/monitors/monitoring.py b/HySoP/hysop/operator/monitors/monitoring.py
index ac23e3aaa3e31ae5f1e36757c7077d0dfaa09458..ca7347b927ac2aa115b4f3fe00368890de1999ad 100644
--- a/HySoP/hysop/operator/monitors/monitoring.py
+++ b/HySoP/hysop/operator/monitors/monitoring.py
@@ -19,7 +19,10 @@ class Monitoring(Operator):
         ## Monitor frequency
         self.freq = frequency
 
-    def setUp(self, *spec):
+    def setUp(self):
+        pass
+
+    def finalize(self):
         pass
 
     def printComputeTime(self):
diff --git a/HySoP/hysop/operator/monitors/printer.py b/HySoP/hysop/operator/monitors/printer.py
index 1e6c7c1dda56ba039ed1bb34f362fcb700c86caa..bae205043edc4506508c880f24d1b121d413d903 100644
--- a/HySoP/hysop/operator/monitors/printer.py
+++ b/HySoP/hysop/operator/monitors/printer.py
@@ -92,47 +92,48 @@ class Printer(Monitoring):
             filename = self.outputPrefix + "results_{0:05d}.dat".format(ite)
             print "Print file : " + filename
             ## VTK output \todo: Need fix in 2D, getting an IOError.
-            if self.fields[0].domain.dimension == 3:
+            try:
                 evtk.imageToVTK(filename, pointData=self._build_vtk_dict())
-            else:
-                ## Standard output
-                f = open(filename, 'w')
-                shape = self.fields[0].discreteField[0].topology.mesh.resolution
-                coords = self.fields[0].discreteField[0].topology.mesh.coords
-                if len(shape) == 2:
-                    for i in xrange(shape[0] - 1):
-                        for j in xrange(shape[1] - 1):
-                            f.write("{0:8.12} {1:8.12} ".format(
-                                coords[0][i, 0], coords[1][0, j]))
-                            for field in self.fields:
-                                if field.vector:
-                                    f.write("{0:8.12} {1:8.12} ".format(
-                                        field.discreteField[0][0][i, j],
-                                        field.discreteField[0][1][i, j]))
-                                else:
-                                    f.write("{0:8.12} ".format(
-                                        field.discreteField[0][i, j]))
-                            f.write("\n")
-                else:
-                    for i in xrange(shape[0] - 1):
-                        for j in xrange(shape[1] - 1):
-                            for k in xrange(shape[2] - 1):
-                                f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
-                                    coords[0][i, 0, 0],
-                                    coords[1][0, j, 0],
-                                    coords[2][0, 0, k]))
+            except IOError:
+                if self.fields[0].domain.dimension == 2:
+                    ## Standard output
+                    f = open(filename, 'w')
+                    shape = self.fields[0].discreteField[0].topology.mesh.resolution
+                    coords = self.fields[0].discreteField[0].topology.mesh.coords
+                    if len(shape) == 2:
+                        for i in xrange(shape[0] - 1):
+                            for j in xrange(shape[1] - 1):
+                                f.write("{0:8.12} {1:8.12} ".format(
+                                    coords[0][i, 0], coords[1][0, j]))
                                 for field in self.fields:
                                     if field.vector:
-                                        f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
-                                            field.discreteField[0][0][i, j, k],
-                                            field.discreteField[0][1][i, j, k],
-                                            field.discreteField[0][2][i, j, k]))
+                                        f.write("{0:8.12} {1:8.12} ".format(
+                                            field.discreteField[0][0][i, j],
+                                            field.discreteField[0][1][i, j]))
                                     else:
                                         f.write("{0:8.12} ".format(
-                                            field.discreteField[0][i, j, k]))
+                                            field.discreteField[0][i, j]))
                                 f.write("\n")
-                    f.close()
-            print "==\n"
+                    else:
+                        for i in xrange(shape[0] - 1):
+                            for j in xrange(shape[1] - 1):
+                                for k in xrange(shape[2] - 1):
+                                    f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
+                                        coords[0][i, 0, 0],
+                                        coords[1][0, j, 0],
+                                        coords[2][0, 0, k]))
+                                    for field in self.fields:
+                                        if field.vector:
+                                            f.write("{0:8.12} {1:8.12} {2:8.12} ".format(
+                                                field.discreteField[0][0][i, j, k],
+                                                field.discreteField[0][1][i, j, k],
+                                                field.discreteField[0][2][i, j, k]))
+                                        else:
+                                            f.write("{0:8.12} ".format(
+                                                field.discreteField[0][i, j, k]))
+                                    f.write("\n")
+                    f.close()
+                    print "==\n"
         self.compute_time += (time.time() - t)
 
     def printComputeTime(self):
diff --git a/HySoP/hysop/problem/problem.py b/HySoP/hysop/problem/problem.py
index a793bbca71da48527145fe6c3d6478820b6073ec..69f0ac13b09c1d6f5989ff0571cb8b84ae66627c 100644
--- a/HySoP/hysop/problem/problem.py
+++ b/HySoP/hysop/problem/problem.py
@@ -79,6 +79,10 @@ class Problem(object):
                 op.apply(self.timer.t, self.timer.dt, ite)
             self.timer.step()
         print "\n\n End solving\n"
+        print "==== Finalization ===="
+        for op in self.operators:
+            op.finalize()
+        print "===\n"
         print "=== Timings ==="
         if (self.domain.topologies[0].rank == 0):
             for op in self.operators: