diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl index 87127b396f8f0ec6ec3b9db261f1a6bf8aeb6f66..90b055bad3fbd8e72e56ecc616c9dce232b7fa6f 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_MS_and_remeshing_noVec.cl @@ -2,16 +2,366 @@ +__kernel void buff_advec_and_remesh_l(__global const float* gvelo, + __global float* v_l_buff, + __global const float* pscal, + __global float* s_l_buff, + float dt, + float inv_v_dx_y, float inv_v_dx_z, + __constant struct AdvectionMeshInfo* mesh) +{ + int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */ + int i; /* Particle index in 1D problem */ + int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + float p,v,c,s,y,w; + float2 hY, hZ; + int i_ind, i_indY, i_indZ; + int ind, index; + + + float velocity_cache[V_NB_I]; + float v_l_buff_loc[V_BUFF_WIDTH]; + float s_l_buff_loc[BUFF_WIDTH]; + float* loc_ptr; + + // Initialize buffers + for (i=0; i<BUFF_WIDTH; i++) + s_l_buff_loc[i] = 0.0; + + barrier(CLK_LOCAL_MEM_FENCE); + + hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y; + hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z; + i_indY = convert_int_rtn(hY.s0); + i_indZ = convert_int_rtn(hZ.s0); + hY.s0 = hY.s0 - convert_float(i_indY); + hZ.s0 = hZ.s0 - convert_float(i_indZ); + hY.s1 = (1.0-hY.s0); + hZ.s1 = (1.0-hZ.s0); + + i_indY = i_indY + V_GHOSTS_NB; + i_indZ = i_indZ + V_GHOSTS_NB; + + for (i=0; i<V_NB_I; i++){ + velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + } + + for (i=0; i<V_BUFF_WIDTH; i++){ + v_l_buff_loc[i] = hY.s1*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; + v_l_buff_loc[i] += hY.s1*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; + v_l_buff_loc[i] += hY.s0*hZ.s1*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; + v_l_buff_loc[i] += hY.s0*hZ.s0*v_l_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0; i<2*BUFF_WIDTH; i++) + { + c = i * mesh->dx.x + mesh->min_position; + // multi-scale : interpolate v from velocity buffer (of length V_NB_I) + p = c * mesh->v_invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB) - MS_INTERPOL_SHIFT; + v = mix(velocity_cache[noBC_id(i_ind)], + velocity_cache[noBC_id(i_ind+1)],p); + p = (c + 0.5*dt*v) * mesh->v_invdx; + + i_ind = convert_int_rtn(p) - MS_INTERPOL_SHIFT; + p = p - convert_float(i_ind); + loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_l_buff_loc+i_ind-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1); + v = (1.0-p)*(*loc_ptr); + i_ind = i_ind + 1; + loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_l_buff_loc+i_ind-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1); + v += p*(*loc_ptr); + p = c + dt * v; + + + /* Read particle scalar */ + s = pscal[i + line_index]; + + + + + ind = convert_int_rtn(p * mesh->invdx); + y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx; + + index = ind - REMESH_SHIFT; + + w = REMESH(alpha)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(beta)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(gamma)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(delta)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = index + 1; + w = REMESH(eta)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(zeta)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = index + 1; + w = REMESH(theta)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(iota)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = index + 1; + w = REMESH(kappa)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(mu)(y); + if (index<START_INDEX) {loc_ptr = s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + // Store buffers + for (i=0; i<BUFF_WIDTH; i++) + s_l_buff[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_l_buff_loc[i]; + +} + +__kernel void buff_advec_and_remesh_r(__global const float* gvelo, + __global float* v_r_buff, + __global const float* pscal, + __global float* s_r_buff, + float dt, + float inv_v_dx_y, float inv_v_dx_z, + __constant struct AdvectionMeshInfo* mesh) +{ + int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */ + int i; /* Particle index in 1D problem */ + int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + float p,v,c,s,y,w; + float2 hY, hZ; + int i_ind, i_indY, i_indZ; + int ind, index; + + + float velocity_cache[V_NB_I]; + float v_r_buff_loc[V_BUFF_WIDTH]; + float s_r_buff_loc[BUFF_WIDTH]; + float* loc_ptr; + + // Initialize buffers + for(i=0; i<BUFF_WIDTH; i++) + s_r_buff_loc[i] = 0.0; + + barrier(CLK_LOCAL_MEM_FENCE); + + hY.s0 = (gidY * mesh->dx.y) * inv_v_dx_y; + hZ.s0 = (gidZ * mesh->dx.z) * inv_v_dx_z; + i_indY = convert_int_rtn(hY.s0); + i_indZ = convert_int_rtn(hZ.s0); + hY.s0 = hY.s0 - convert_float(i_indY); + hZ.s0 = hZ.s0 - convert_float(i_indZ); + hY.s1 = (1.0-hY.s0); + hZ.s1 = (1.0-hZ.s0); + + i_indY = i_indY + V_GHOSTS_NB; + i_indZ = i_indZ + V_GHOSTS_NB; + + for(i=0;i<V_NB_I; i++){ + velocity_cache[noBC_id(i)] = hY.s1*hZ.s1 * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s1*hZ.s0 * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s1 * gvelo[i + (i_indY + 1) * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY.s0*hZ.s0 * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + } + + for(i=0;i<V_BUFF_WIDTH; i++){ + v_r_buff_loc[i] = hY.s1*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY + i_indZ*V_NB_II)]; + v_r_buff_loc[i] += hY.s1*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY + (i_indZ+1)*V_NB_II)]; + v_r_buff_loc[i] += hY.s0*hZ.s1*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + i_indZ*V_NB_II)]; + v_r_buff_loc[i] += hY.s0*hZ.s0*v_r_buff[i + V_BUFF_WIDTH*(i_indY+1 + (i_indZ+1)*V_NB_II)]; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=NB_I-2*BUFF_WIDTH; i<NB_I; i++) + { + c = i * mesh->dx.x + mesh->min_position; + // multi-scale : interpolate v from velocity buffer (of length V_NB_I) + p = c * mesh->v_invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB) - MS_INTERPOL_SHIFT; + v = mix(velocity_cache[noBC_id(i_ind)], + velocity_cache[noBC_id(i_ind+1)],p); + p = (c + 0.5*dt*v) * mesh->v_invdx; + + i_ind = convert_int_rtn(p) - MS_INTERPOL_SHIFT; + p = p - convert_float(i_ind); + loc_ptr = (i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_r_buff_loc+i_ind-(V_STOP_INDEX+V_GHOSTS_NB+1) ; + v = (1.0-p)*(*loc_ptr); + i_ind = i_ind + 1; + loc_ptr = (i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : v_r_buff_loc+i_ind-(V_STOP_INDEX+V_GHOSTS_NB+1) ; + v += p*(*loc_ptr); + p = c + dt * v; + + + /* Read particle scalar */ + s = pscal[i + line_index]; + + + + + ind = convert_int_rtn(p * mesh->invdx); + y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx; + + index = ind - REMESH_SHIFT; + + w = REMESH(alpha)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(beta)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(gamma)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(delta)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = index + 1; + w = REMESH(eta)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(zeta)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = index + 1; + w = REMESH(theta)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(iota)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = index + 1; + w = REMESH(kappa)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(mu)(y); + if (index > STOP_INDEX){ loc_ptr = s_r_buff_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + // Store buffers + for(i=0;i<BUFF_WIDTH;i++) + s_r_buff[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_r_buff_loc[i]; + +} + __kernel void buff_advec_and_remesh(__global const float* gvelo, - __global float* v_l_buff, - __global float* v_r_buff, - __global const float* pscal, - __global float* gscal, - __global float* s_l_buff, - __global float* s_r_buff, - float dt, - float inv_v_dx_y, float inv_v_dx_z, - __constant struct AdvectionMeshInfo* mesh) + __global float* v_l_buff, + __global float* v_r_buff, + __global const float* pscal, + __global float* gscal, + float dt, + float inv_v_dx_y, float inv_v_dx_z, + __constant struct AdvectionMeshInfo* mesh) { int gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ int gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ @@ -28,16 +378,8 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, __local float v_l_buff_loc[V_BUFF_WIDTH]; __local float v_r_buff_loc[V_BUFF_WIDTH]; __local float gscal_loc[NB_I]; - __local float s_l_buff_loc[BUFF_WIDTH]; - __local float s_r_buff_loc[BUFF_WIDTH]; __local float* loc_ptr; - // Initialize buffers - if(gidX < BUFF_WIDTH) - s_l_buff_loc[gidX] = 0.0; - if(gidX < BUFF_WIDTH) - s_r_buff_loc[gidX] = 0.0; - for(i=gidX; i<NB_I; i+=WI_NB) { /* Initialize result buffer */ @@ -117,77 +459,77 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, index = ind - REMESH_SHIFT; w = REMESH(alpha)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(beta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(gamma)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(delta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #if REMESH_SHIFT > 1 index = index + 1; w = REMESH(eta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(zeta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #endif #if REMESH_SHIFT > 2 index = index + 1; w = REMESH(theta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(iota)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #endif #if REMESH_SHIFT > 3 index = index + 1; w = REMESH(kappa)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(mu)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_l_buff_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_r_buff_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #endif } @@ -200,11 +542,4 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, /* Store result */ gscal[i + line_index] = gscal_loc[i]; } - - // Store buffers - if (gidX < BUFF_WIDTH) - s_l_buff[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_l_buff_loc[gidX]; - if (gidX < BUFF_WIDTH) - s_r_buff[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_r_buff_loc[gidX]; - } diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl index 7db76aab5dc1b7349666484f3abb240abf57eaee..ae019d2aded9acbe527aca653bbb86ea05c84a05 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_and_remeshing_noVec.cl @@ -1,13 +1,317 @@ +__kernel void buff_advec_and_remesh_l(__global const float* gvelo, + __global float* v_buffer_l, + __global const float* pscal, + __global float* s_buffer_l, + float dt, __constant struct AdvectionMeshInfo* mesh) +{ + int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */ + int i; /* Particle index in 1D problem */ + int line_index ; /* Current 1D problem index */ + + float v,vp,p,c,s,y,w, hdt = 0.5 * dt; + int i_ind, i_ind_p, ind, index; + + float velocity_cache[V_NB_I]; + float v_buff_l_loc[V_BUFF_WIDTH]; + float s_buff_l_loc[BUFF_WIDTH]; + float* loc_ptr; + + // Initialize buffers + for (i=0;i<BUFF_WIDTH;i++) + s_buff_l_loc[i] = 0.0; + + for(i=0; i<V_BUFF_WIDTH; i++) + v_buff_l_loc[i] = v_buffer_l[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)]; + + line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; + /* Read velocity */ + /* Fill velocity cache */ + for(i=0;i<V_NB_I;i++) + velocity_cache[i] = gvelo[i+line_index]; + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + line_index = gidY*NB_I+gidZ*NB_I*NB_II; + for(i=0; i<2*BUFF_WIDTH; i++) + { + /* Read particle scalar */ + s = pscal[i + line_index]; + + c = i * mesh->dx.x + mesh->min_position; + v = velocity_cache[i + V_GHOSTS_NB]; + p = (c + hdt*v) * mesh->v_invdx; + + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind_p = i_ind + 1; + loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache + i_ind - (V_START_INDEX-V_GHOSTS_NB) : v_buff_l_loc+i_ind-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1); + v = *loc_ptr; + + loc_ptr = (i_ind_p>=(V_START_INDEX-V_GHOSTS_NB)) ? velocity_cache+i_ind_p - (V_START_INDEX-V_GHOSTS_NB) : v_buff_l_loc+i_ind_p-(V_START_INDEX-V_GHOSTS_NB-1-V_BUFF_WIDTH+1); + vp = *loc_ptr; + + v = (p*(vp-v) + v); + p = c + dt * v; + + + + ind = convert_int_rtn(p * mesh->invdx); + y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx; + + index = ind - REMESH_SHIFT; + + w = REMESH(alpha)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(beta)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(gamma)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(delta)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = index + 1; + w = REMESH(eta)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(zeta)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = index + 1; + w = REMESH(theta)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(iota)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = index + 1; + w = REMESH(kappa)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(mu)(y); + if (index<START_INDEX){ loc_ptr = s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + // Store buffers + for(i=0;i<BUFF_WIDTH;i++) + s_buffer_l[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_l_loc[i]; +} + + + + + + + + +__kernel void buff_advec_and_remesh_r(__global const float* gvelo, + __global float* v_buffer_r, + __global const float* pscal, + __global float* s_buffer_r, + float dt, __constant struct AdvectionMeshInfo* mesh) +{ + int gidY = get_global_id(0); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(1); /* OpenCL work-itme global index (Z) */ + int i; /* Particle index in 1D problem */ + int line_index ; /* Current 1D problem index */ + + float v,vp,p,c,s,y,w, hdt = 0.5 * dt; + int i_ind, i_ind_p, ind, index; + + float velocity_cache[V_NB_I]; + float v_buff_r_loc[V_BUFF_WIDTH]; + float s_buff_r_loc[BUFF_WIDTH]; + float* loc_ptr; + + // Initialize buffers + for(i=0;i<BUFF_WIDTH;i++) + s_buff_r_loc[i] = 0.0; + + for(i=0;i<V_BUFF_WIDTH;i++) + v_buff_r_loc[i] = v_buffer_r[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)]; + + line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; + /* Read velocity */ + /* Fill velocity cache */ + for(i=0;i<V_NB_I; i++) + velocity_cache[i] = gvelo[i+line_index]; + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + line_index = gidY*NB_I+gidZ*NB_I*NB_II; + for(i=NB_I-2*BUFF_WIDTH; i<NB_I; i++) + { + /* Read particle scalar */ + s = pscal[i + line_index]; + + c = i * mesh->dx.x + mesh->min_position; + v = velocity_cache[i + V_GHOSTS_NB]; + p = (c + hdt*v) * mesh->v_invdx; + + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind_p = i_ind + 1; + loc_ptr = (i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache + i_ind - (V_START_INDEX-V_GHOSTS_NB) : v_buff_r_loc+i_ind-(V_STOP_INDEX+V_GHOSTS_NB+1) ; + v = *loc_ptr; + + loc_ptr = (i_ind_p <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+i_ind_p - (V_START_INDEX-V_GHOSTS_NB) : v_buff_r_loc+i_ind_p-(V_STOP_INDEX+V_GHOSTS_NB+1) ; + vp = *loc_ptr; + + v = (p*(vp-v) + v); + p = c + dt * v; + + + + ind = convert_int_rtn(p * mesh->invdx); + y = (p - convert_float(ind) * mesh->dx.x) * mesh->invdx; + + index = ind - REMESH_SHIFT; + + w = REMESH(alpha)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(beta)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(gamma)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(delta)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = index + 1; + w = REMESH(eta)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(zeta)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = index + 1; + w = REMESH(theta)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(iota)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = index + 1; + w = REMESH(kappa)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(mu)(y); + if (index > STOP_INDEX){ loc_ptr = s_buff_r_loc + index-(STOP_INDEX+1); + w = w * s; + (*loc_ptr) += w;} + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<BUFF_WIDTH;i++) + s_buffer_r[i + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_r_loc[i]; + +} + + __kernel void buff_advec_and_remesh(__global const float* gvelo, __global float* v_buffer_l, __global float* v_buffer_r, __global const float* pscal, __global float* gscal, - __global float* s_buffer_l, - __global float* s_buffer_r, float dt, __constant struct AdvectionMeshInfo* mesh) { int gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ @@ -23,37 +327,24 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, __local float v_buff_l_loc[V_BUFF_WIDTH]; __local float v_buff_r_loc[V_BUFF_WIDTH]; __local float gscal_loc[NB_I]; - __local float s_buff_l_loc[BUFF_WIDTH]; - __local float s_buff_r_loc[BUFF_WIDTH]; __local float* loc_ptr; - // Initialize buffers - if(gidX < BUFF_WIDTH) - s_buff_l_loc[gidX] = 0.0; - if(gidX < BUFF_WIDTH) - s_buff_r_loc[gidX] = 0.0; for(i=gidX; i<NB_I; i+=WI_NB) - { - /* Initialize result buffer */ - gscal_loc[i] = 0.0; - } + /* Initialize result buffer */ + gscal_loc[i] = 0.0; - for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){ - v_buff_l_loc[i] = v_buffer_l[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)]; - } + for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)) + v_buff_l_loc[i] = v_buffer_l[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)]; - for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)){ - v_buff_r_loc[i] = v_buffer_r[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)]; - } + for(i=gidX; i<V_BUFF_WIDTH; i+=(WI_NB)) + v_buff_r_loc[i] = v_buffer_r[i + V_BUFF_WIDTH*(gidY + gidZ*V_NB_II)]; line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; + /* Read velocity */ + /* Fill velocity cache */ for(i=gidX; i<V_NB_I; i+=(WI_NB)) - { - /* Read velocity */ - /* Fill velocity cache */ velocity_cache[i] = gvelo[i+line_index]; - } /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); @@ -88,77 +379,77 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, index = ind - REMESH_SHIFT; w = REMESH(alpha)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(beta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(gamma)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(delta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #if REMESH_SHIFT > 1 index = index + 1; w = REMESH(eta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(zeta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #endif #if REMESH_SHIFT > 2 index = index + 1; w = REMESH(theta)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(iota)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #endif #if REMESH_SHIFT > 3 index = index + 1; w = REMESH(kappa)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); index = index + 1; w = REMESH(mu)(y); - loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +index-START_INDEX : ( (index<START_INDEX)? s_buff_l_loc+index-(START_INDEX-1-BUFF_WIDTH+1) : s_buff_r_loc + index-(STOP_INDEX+1) ); + if (index>=START_INDEX && index <= STOP_INDEX){ loc_ptr = gscal_loc +index-START_INDEX; w = w * s; - (*loc_ptr) += w; + (*loc_ptr) += w;} barrier(CLK_LOCAL_MEM_FENCE); #endif @@ -172,11 +463,4 @@ __kernel void buff_advec_and_remesh(__global const float* gvelo, /* Store result */ gscal[i + line_index] = gscal_loc[i]; } - - // Store buffers - if (gidX < BUFF_WIDTH) - s_buffer_l[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_l_loc[gidX]; - if (gidX < BUFF_WIDTH) - s_buffer_r[gidX + gidY*BUFF_WIDTH + gidZ*BUFF_WIDTH*NB_II] = s_buff_r_loc[gidX]; - } diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl index ad4d4dd9e8ddee507d7a8141ab2943d74cb3d5b2..4d12e508826ebc3f4f19b1e0beae31ee75c78297 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl @@ -176,7 +176,7 @@ __kernel void buff_remesh_r(__global const float* ppos, float* loc_ptr; // Initialize buffers - for(i=0; i<BUFF_WIDTH; i++) + for(i=0; i<BUFF_WIDTH; i++) r_buff_loc[i] = 0.0; /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py index 126b4e686d1cd61ae17f139acf2b482cdfd90ce6..cf87210d6dcc8ec974ed9cb1dd31c9305a8ab96e 100644 --- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py +++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py @@ -355,6 +355,12 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): build_options += " -D BUFF_WIDTH=" + str(self._s_buff_width) prg = self.cl_env.build_src( src, build_options, 1) + self.num_advec_and_remesh_comm_l = KernelLauncher( + prg.buff_advec_and_remesh_l, self.cl_env.queue, + (gwi[1], gwi[2]), (32, 1)) + self.num_advec_and_remesh_comm_r = KernelLauncher( + prg.buff_advec_and_remesh_r, self.cl_env.queue, + (gwi[1], gwi[2]), (32, 1)) self.num_advec_and_remesh = KernelLauncher( prg.buff_advec_and_remesh, self.cl_env.queue, gwi, lwi) @@ -734,31 +740,63 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): wait_evts = self.velocity.events + self._evt_l_v + self._evt_r_v + \ self._init_events[self.fields_on_grid[0]] if self._isMultiScale: - evt_num = self.num_advec_and_remesh( + evt_comm_l = self.num_advec_and_remesh_comm_l( self.velocity.gpu_data[self.direction], self._cl_v_l_buff, - self._cl_v_r_buff, self.fields_on_part[self.fields_on_grid[0]][0], - self.fields_on_grid[0].gpu_data[0], self._cl_s_l_buff, + self.gpu_precision(dt), + self.gpu_precision(1. / self._v_mesh_size[1]), + self.gpu_precision(1. / self._v_mesh_size[2]), + self._cl_mesh_info, + wait_for=wait_evts) + evt_comm_r = self.num_advec_and_remesh_comm_r( + self.velocity.gpu_data[self.direction], + self._cl_v_r_buff, + self.fields_on_part[self.fields_on_grid[0]][0], self._cl_s_r_buff, self.gpu_precision(dt), self.gpu_precision(1. / self._v_mesh_size[1]), self.gpu_precision(1. / self._v_mesh_size[2]), self._cl_mesh_info, wait_for=wait_evts) - else: evt_num = self.num_advec_and_remesh( self.velocity.gpu_data[self.direction], self._cl_v_l_buff, self._cl_v_r_buff, self.fields_on_part[self.fields_on_grid[0]][0], self.fields_on_grid[0].gpu_data[0], + self.gpu_precision(dt), + self.gpu_precision(1. / self._v_mesh_size[1]), + self.gpu_precision(1. / self._v_mesh_size[2]), + self._cl_mesh_info, + wait_for=wait_evts) + else: + evt_comm_l = self.num_advec_and_remesh_comm_l( + self.velocity.gpu_data[self.direction], + self._cl_v_l_buff, + self.fields_on_part[self.fields_on_grid[0]][0], self._cl_s_l_buff, + self.gpu_precision(dt), + self._cl_mesh_info, + wait_for=wait_evts) + evt_comm_r = self.num_advec_and_remesh_comm_r( + self.velocity.gpu_data[self.direction], + self._cl_v_r_buff, + self.fields_on_part[self.fields_on_grid[0]][0], self._cl_s_r_buff, self.gpu_precision(dt), self._cl_mesh_info, wait_for=wait_evts) + evt_num = self.num_advec_and_remesh( + self.velocity.gpu_data[self.direction], + self._cl_v_l_buff, + self._cl_v_r_buff, + self.fields_on_part[self.fields_on_grid[0]][0], + self.fields_on_grid[0].gpu_data[0], + self.gpu_precision(dt), + self._cl_mesh_info, + wait_for=wait_evts) self.evt_num_remesh = [evt_num] # Prepare the MPI receptions @@ -783,7 +821,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): buffer_pitches=(self._s_l_buff.nbytes, 0), region=(self._s_block_size, 1, 1), is_blocking=False, - wait_for=[evt_num]) + wait_for=[evt_comm_l]) # Send the left buffer ctime = MPI.Wtime() @@ -805,7 +843,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): buffer_pitches=(self._s_l_buff.nbytes, 0), region=(self._s_block_size, 1, 1), is_blocking=False, - wait_for=[evt_num]) + wait_for=[evt_comm_r]) # Send the right buffer for b in xrange(self._s_n_blocks): self._evt_get_r[b].wait() @@ -926,7 +964,10 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): for k in [self.copy, self.transpose_xy, self.transpose_xy_r, self.transpose_xz, self.transpose_xz_r, self.num_advec_and_remesh, - self.num_advec, self.num_remesh, self.num_remesh_comm_l, self.num_remesh_comm_r]: + self.num_advec, self.num_remesh, self.num_remesh_comm_l, + self.num_remesh_comm_r, self.num_advec_and_remesh, + self.num_advec_and_remesh_comm_r, + self.num_advec_and_remesh_comm_l]: if k is not None: for p in k.profile: self.profiler += p