From def467499b30793c413d53403e27b2ca1184c738 Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr> Date: Mon, 4 Mar 2013 07:45:46 +0000 Subject: [PATCH] Add m8 remeshing --- HySoP/hysop/particular_solvers/gpu.py | 97 +++++++--- ...and_remesh_m8prime_2D_opt4_builtin_noBC.cl | 107 +++++++++++ ...and_remesh_m8prime_2D_opt8_builtin_noBC.cl | 177 ++++++++++++++++++ ...remesh_m8prime_3D_opt4_builtin_private4.cl | 115 ++++++++++++ ...h_m8prime_3D_opt4_builtin_private4_noBC.cl | 115 ++++++++++++ .../gpu_src/remeshing_m8prime_2D_opt4_noBC.cl | 102 ++++++++++ .../remeshing_m8prime_3D_opt4_private4.cl | 95 ++++++++++ .../gpu_src/weights_m8_prime.cl | 31 +++ .../gpu_src/weights_m8prime_builtin.cl | 33 ++++ .../gpu_src/weights_m8prime_opt4_builtin.cl | 33 ++++ 10 files changed, 874 insertions(+), 31 deletions(-) create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl create mode 100644 HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py index f3c55e6af..f0c9f39c3 100644 --- a/HySoP/hysop/particular_solvers/gpu.py +++ b/HySoP/hysop/particular_solvers/gpu.py @@ -20,7 +20,8 @@ class GPUParticleSolver(ParticleSolver): """ GPU Particular solver description. - Link with differents numericals methods used. Prepare GPU side (memory, kernels, ...) + Link with differents numericals methods used. + Prepare GPU side (memory, kernels, ...) """ def __init__(self, problem, t_end, dt, ODESolver=None, @@ -68,7 +69,8 @@ class GPUParticleSolver(ParticleSolver): #Get device. try: ## OpenCL device - self.device = self.platform.get_devices(eval("cl.device_type." + device_type.upper()))[device_id] + self.device = self.platform.get_devices( + eval("cl.device_type." + device_type.upper()))[device_id] except cl.RuntimeError as e: print "RuntimeError:", e self.device = cl.create_some_context().devices[0] @@ -76,17 +78,22 @@ class GPUParticleSolver(ParticleSolver): print "AttributeError:", e self.device = cl.create_some_context().devices[0] print " Device" - print " - Name :", self.device.name - print " - Type :", cl.device_type.to_string(self.device.type) - print " - C Version :", self.device.opencl_c_version - print " - Global mem size :", self.device.global_mem_size / (1024 ** 3), "GB" + print " - Name :", + print self.device.name + print " - Type :", + print cl.device_type.to_string(self.device.type) + print " - C Version :", + print self.device.opencl_c_version + print " - Global mem size :", + print self.device.global_mem_size / (1024 ** 3), "GB" print "===\n" #Creates GPU Context ## OpenCL context self.ctx = cl.Context([self.device]) #Create CommandQueue on the GPU Context ## OpenCL command queue - self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + self.queue = cl.CommandQueue( + self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) def initialize(self): """ @@ -129,27 +136,31 @@ class GPUParticleSolver(ParticleSolver): 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: try: - if eval('(self.device.single_fp_config & cl.device_fp_config.' + + if eval('(self.device.single_fp_config &' + + ' cl.device_fp_config.' + v + ') == cl.device_fp_config.' + v): print v except AttributeError as ae: if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': self.correctlyroundeddividesqrt = False - print v, 'is not supported in OpenCL C 1.2.\n Exception catched : ', ae + print v, 'is not supported in OpenCL C 1.2.\n', + print ' Exception catched : ', ae else: print "for Double Precision: " if self.device.double_fp_config > 0: - for v in ['DENORM', 'INF_NAN', + for v in ['DENORM', 'INF_NAN', 'FMA', 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', - 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: + 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: try: - if eval('(self.device.double_fp_config & cl.device_fp_config.' + + if eval('(self.device.double_fp_config &' + + ' cl.device_fp_config.' + v + ') == cl.device_fp_config.' + v): print v except AttributeError as ae: if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': self.correctlyroundeddividesqrt = False - print v, 'is supported in OpenCL C 1.2.\n Exception catched : ', ae + print v, 'is supported in OpenCL C 1.2.\n', + print ' Exception catched : ', ae else: raise ValueError("Double Precision is not supported by device") ## Building options and kernel settings @@ -167,20 +178,36 @@ class GPUParticleSolver(ParticleSolver): if self.advection.include_remesh: if len(resolution) == 3: if self.gpu_precision is np.float32: - src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl' + if self.RemeshingMethod is 'm6prime': + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl' + else: + src_remesh_weights = 'weights_m8prime_opt4_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl' else: - src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl' + if self.RemeshingMethod is 'm6prime': + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl' + else: + src_remesh_weights = 'weights_m8prime_opt4_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl' advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) advec_and_remesh_lwi = (int(workItemNumber), 1, 1) else: if self.gpu_precision is np.float32: - src_remesh_weights = 'weights_m6prime_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl' + if self.RemeshingMethod is 'm6prime': + src_remesh_weights = 'weights_m6prime_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl' + else: + src_remesh_weights = 'weights_m8prime_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl' else: - src_remesh_weights = 'weights_m6prime_builtin.cl' - src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl' + if self.RemeshingMethod is 'm6prime': + src_remesh_weights = 'weights_m6prime_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl' + else: + src_remesh_weights = 'weights_m8prime_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl' advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1])) advec_and_remesh_lwi = (int(workItemNumber), 1) f = open(os.path.join(GPU_SRC, src_remesh_weights)) @@ -212,13 +239,21 @@ class GPUParticleSolver(ParticleSolver): f.close() ## remeshing if len(resolution) == 3: - src_remesh = 'remeshing_3D_opt4_private4.cl' - src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + if self.RemeshingMethod is 'm6prime': + src_remesh = 'remeshing_3D_opt4_private4.cl' + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + else: + src_remesh = 'remeshing_m8prime_3D_opt4_private4.cl' + src_remesh_weights = 'weights_m8prime_opt4_builtin.cl' remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) remesh_lwi = (int(workItemNumber), 1, 1) else: - src_remesh = 'remeshing_2D_opt4_noBC.cl' - src_remesh_weights = 'weights_m6prime_builtin.cl' + if self.RemeshingMethod is 'm6prime': + src_remesh = 'remeshing_2D_opt4_noBC.cl' + src_remesh_weights = 'weights_m6prime_builtin.cl' + else: + src_remesh = 'remeshing_m8prime_2D_opt4_noBC.cl' + src_remesh_weights = 'weights_m8prime_builtin.cl' remesh_gwi = (int(workItemNumber), int(resolution[1])) remesh_lwi = (int(workItemNumber), 1) f = open(os.path.join(GPU_SRC, src_remesh_weights)) @@ -417,14 +452,14 @@ class GPUParticleSolver(ParticleSolver): sop.discreteOperator.gpu_precision = self.gpu_precision if sop.discreteOperator.name == 'advection': sop.setMethod(KernelLauncher(self.prg.advection, - self.queue, - advec_gwi, - advec_lwi)) + self.queue, + advec_gwi, + advec_lwi)) elif sop.discreteOperator.name == 'remeshing': sop.setMethod(KernelLauncher(self.prg.remeshing, - self.queue, - remesh_gwi, - remesh_lwi)) + self.queue, + remesh_gwi, + remesh_lwi)) elif sop.discreteOperator.name == 'advection_and_remeshing': sop.setMethod(KernelLauncher(self.prg.advection_and_remeshing, self.queue, diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl new file mode 100644 index 000000000..0a52ced9c --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt4_builtin_noBC.cl @@ -0,0 +1,107 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + v = vload4((i+gidY*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.x; + gvelo_loc[noBC_id(i+1,nb_part)] = v.y; + gvelo_loc[noBC_id(i+2,nb_part)] = v.z; + gvelo_loc[noBC_id(i+3,nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl new file mode 100644 index 000000000..043ba841f --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_2D_opt8_builtin_noBC.cl @@ -0,0 +1,177 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt,float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int8 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float8 p, s, y, gs, v, vp, coord; + uint8 index8; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*8; i<WIDTH; i+=(WGN*8)) + { + v = vload8((i+gidY*WIDTH)/8, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.s0; + gvelo_loc[noBC_id(i+1,nb_part)] = v.s1; + gvelo_loc[noBC_id(i+2,nb_part)] = v.s2; + gvelo_loc[noBC_id(i+3,nb_part)] = v.s3; + gvelo_loc[noBC_id(i+4,nb_part)] = v.s4; + gvelo_loc[noBC_id(i+5,nb_part)] = v.s5; + gvelo_loc[noBC_id(i+6,nb_part)] = v.s6; + gvelo_loc[noBC_id(i+7,nb_part)] = v.s7; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + gscal_loc[i+4] = 0.0; + gscal_loc[i+5] = 0.0; + gscal_loc[i+6] = 0.0; + gscal_loc[i+7] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8) + { + v = (float8)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)], + gvelo_loc[noBC_id(i+4,nb_part)], + gvelo_loc[noBC_id(i+5,nb_part)], + gvelo_loc[noBC_id(i+6,nb_part)], + gvelo_loc[noBC_id(i+7,nb_part)]); + coord = (float8)(i*dx, + (i+1)*dx, + (i+2)*dx, + (i+3)*dx, + (i+4)*dx, + (i+5)*dx, + (i+6)*dx, + (i+7)*dx); + p = fma((float8)(0.5*dt),v,coord); + ind = convert_int8_rtn(p * invdx); + y = (fma(- convert_float8(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)], + gvelo_loc[noBC_id(i_ind.s1,nb_part)], + gvelo_loc[noBC_id(i_ind.s2,nb_part)], + gvelo_loc[noBC_id(i_ind.s3,nb_part)], + gvelo_loc[noBC_id(i_ind.s4,nb_part)], + gvelo_loc[noBC_id(i_ind.s5,nb_part)], + gvelo_loc[noBC_id(i_ind.s6,nb_part)], + gvelo_loc[noBC_id(i_ind.s7,nb_part)]); + vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s1,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s2,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s3,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s4,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s5,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s6,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]); + p = fma(mix(v,vp,y),dt,coord); + + s = vload8((i + gidY*WIDTH)/8, pscal); + ind = convert_int8_rtn(p * invdx); + y = (p - convert_float8(ind) * dx) * invdx; + index8 = convert_uint8((ind - 3 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (eta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (eta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (eta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (eta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (eta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (eta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (eta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (eta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (zeta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (zeta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (zeta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (zeta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (zeta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (zeta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (zeta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (zeta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (theta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (theta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (theta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (theta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (theta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (theta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (theta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (theta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (iota(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (iota(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (iota(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (iota(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (iota(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (iota(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (iota(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (iota(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*8; i<WIDTH; i+=(WGN*8)) + vstore8((float8)(gscal_loc[noBC_id(i,nb_part)], + gscal_loc[noBC_id(i+1,nb_part)], + gscal_loc[noBC_id(i+2,nb_part)], + gscal_loc[noBC_id(i+3,nb_part)], + gscal_loc[noBC_id(i+4,nb_part)], + gscal_loc[noBC_id(i+5,nb_part)], + gscal_loc[noBC_id(i+6,nb_part)], + gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl new file mode 100644 index 000000000..92b508820 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4.cl @@ -0,0 +1,115 @@ + +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[i] = v.x; + gvelo_loc[i+1] = v.y; + gvelo_loc[i+2] = v.z; + gvelo_loc[i+3] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = vload4(i/4, gvelo_loc); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y], + gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]); + vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y], + gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + barrier(CLK_LOCAL_MEM_FENCE); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (beta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (theta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (iota(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl new file mode 100644 index 000000000..1557aaffc --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_m8prime_3D_opt4_builtin_private4_noBC.cl @@ -0,0 +1,115 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i, nb_part)] = v.x; + gvelo_loc[noBC_id(i+1, nb_part)] = v.y; + gvelo_loc[noBC_id(i+2, nb_part)] = v.z; + gvelo_loc[noBC_id(i+3, nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (theta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (iota(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl new file mode 100644 index 000000000..69fddc7f9 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_2D_opt4_noBC.cl @@ -0,0 +1,102 @@ +/** + * Local memory as a 2D array to avoid bank conflicts. + * + * @param id Index 1D + * @param nb_part Particle number per work-item + * + * @return Index in a 2D space. + */ +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} + +/** + * 2D Remeshing kernel. + * + * @param ppos Particle position + * @param pscal Particle scalar + * @param gscal Grid scalar + * @param min_position Domain lower point + * @param dx Space step + */ +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); // i-3 + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i-2 + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i-1 + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; //i+1 + gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+2 + gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+3 + gscal_loc[noBC_id(index4.x,nb_part)] += (theta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (theta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (theta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (theta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; // i+4 + gscal_loc[noBC_id(index4.x,nb_part)] += (iota(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (iota(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (iota(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (iota(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl new file mode 100644 index 000000000..6af20d005 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_m8prime_3D_opt4_private4.cl @@ -0,0 +1,95 @@ + +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 3 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[index4.x]+= gs.x; + gscal_loc[index4.y]+= gs.y; + gscal_loc[index4.z]+= gs.z; + gscal_loc[index4.w]+= gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (theta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (iota(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[i], + gscal_loc[i+1], + gscal_loc[i+2], + gscal_loc[i+3]), + (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl new file mode 100644 index 000000000..84c4032b0 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m8_prime.cl @@ -0,0 +1,31 @@ +/** + * M8prime weights. + * + * @param y Distance between the particle and left-hand grid point + * @param s Scalar of the particle + * + * @return weights + */ +inline float alpha(float y, float s){ + return ((y*(y*(y*(y*(y*(y*(-10.0*y + 21.0) + 28.0) - 105.0) + 70.0) + 35.0) - 56.0) + 17.0) / 3360.0) * s;} + +inline float beta(float y, float s){ + return ((y*(y*(y*(y*(y*(y*(70.0*y - 175.0) - 140.0) + 770.0) - 560.0) - 350.0) + 504.0) - 102.0) / 3360.0)*s;} + +inline float gamma(float y, float s){ + return ((y*(y*(y*(y*(y*(y*(-210.0*y + 609.0) + 224.0) - 2135.0) + 910.0) + 2765.0) - 2520.0) + 255.0) / 3360.0)*s;} + +inline float delta(float y, float s){ + return ((y*y*(y*y*(y*y*(70.0*y - 231.0) + 588.0) - 980.0) + 604.0) / 672.0)*s;} + +inline float eta(float y, float s){ + return ((y*(y*(y*(y*(y*(y*(-70.0*y + 259.0) - 84.0) - 427.0) - 182.0) + 553.0) + 504.0) + 51.0) / 672.0)*s;} + +inline float zeta(float y, float s){ + return ((y*(y*(y*(y*(y*(y*(210.0*y - 861.0) + 532.0) + 770.0) + 560.0) - 350.0) - 504.0) - 102.0) / 3360.0)*s;} + +inline float theta(float y, float s){ + return ((y*(y*(y*(y*(y*(y*(-70.0*y + 315.0) - 280.0) - 105.0) - 70.0) + 35.0) + 56.0) + 17.0) / 3360.0)*s;} + +inline float iota(float y, float s){ + return ((y*y*y*y*y*(y*(10.*y - 49.) + 56.))/3360.)*s;} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl new file mode 100644 index 000000000..94d393d33 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_builtin.cl @@ -0,0 +1,33 @@ +/** + * M8prime weights. + * + * @param y Distance between the particle and left-hand grid point + * @param s Scalar of the particle + * + * @return weights + * + * Use OpenCL fma builtin function. + */ +inline float alpha(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;} + +inline float beta(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;} + +inline float gamma(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;} + +inline float delta(float y, float s){ + return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;} + +inline float eta(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;} + +inline float zeta(float y, float s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;} + +inline float theta(float y, float s){ + return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;} + +inline float iota(float y, float s){ + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl new file mode 100644 index 000000000..90f28e492 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m8prime_opt4_builtin.cl @@ -0,0 +1,33 @@ +/** + * M8prime weights. + * + * @param y Distance between the particle and left-hand grid point + * @param s Scalar of the particle + * + * @return weights + * + * Use OpenCL fma builtin function, computed by float4. + */ +inline float4 alpha(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0) * s;} + +inline float4 beta(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0)*s;} + +inline float4 gamma(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0)*s;} + +inline float4 delta(float4 y, float4 s){ + return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0)*s;} + +inline float4 eta(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0)*s;} + +inline float4 zeta(float4 y, float4 s){ + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0)*s;} + +inline float4 theta(float4 y, float4 s){ + return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0)*s;} + +inline float4 iota(float4 y, float4 s){ + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0)*s;} -- GitLab