diff --git a/HySoP/hysop/operator/advec_and_remesh_d.py b/HySoP/hysop/operator/advec_and_remesh_d.py new file mode 100644 index 0000000000000000000000000000000000000000..38e927542794cf9c719ab805573db8a9b2964d78 --- /dev/null +++ b/HySoP/hysop/operator/advec_and_remesh_d.py @@ -0,0 +1,128 @@ +""" +@package parmepy.operator.transport_d + +Discrete transport representation +""" +from ..constants import * +from .discrete import DiscreteOperator +import pyopencl as cl +import time + + +class Advec_and_Remesh_d(DiscreteOperator): + """ + Particle advection and remesh operator representation. + + """ + + def __init__(self, advec, idVelocityD=0, idScalarD=0, particle_scalar=None, method=None): + """ + Create a Advection and Remeshing operator. + Work on a given scalar at a given velocity to produce scalar distribution at new positions. + + @param advec : Transport operator + @param idVelocityD : Index of velocity discretisation to use. + @param idScalarD : Index of scalar discretisation to use. + @param result_scalar : result scalar. + @param method : the method to use. + """ + DiscreteOperator.__init__(self) + ## Velocity. + self.velocity = advec.velocity.discreteField[idVelocityD] + ## Transported scalar. + self.scalar = advec.scalar.discreteField[idScalarD] + ## Result scalar + self.particle_scalar = particle_scalar.discreteField[0] + self.input = [self.velocity, self.scalar] + self.output = [self.particle_scalar] + self.method = method + ## Previous splitting direction + self.old_splitting_direction = None + ## Compute time detailed per directions + self.compute_time = [0., 0., 0.] + ## Compute time for copy detailed per directions + self.compute_time_copy = [0., 0., 0.] + ## Compute time for transposition detailed per directions + self.compute_time_swap = [0., 0., 0.] + self.name = "advection_and_remeshing" + self.call_number = [0, 0, 0] + self.gpu_precision = PARMES_REAL_GPU + + def apply(self, t, dt, splittingDirection): + """ + Apply advection and remeshing operator. + + @param t : current time. + @param dt : time step. + @param splittingDirection : Direction of splitting. + + Advection algorithm: + @li 1. Particle initialization : \n + - by copy scalar from grid to particles if previous splitting direction equals current splitting direction.\n + - by transposition of scalar from grid to particle. + @li 2. Particle advection and remesh :\n + - compute particle position in splitting direction as a scalar. Performs a RK2 resolution of dx_p/dt = a_p. + - remesh particle scalar + @li 3. Profile timings of OpenCL kernels. + """ + c_time, c_time_init = 0., 0. + self.call_number[splittingDirection] += 1 + if self.numMethod is not None and self.init_transpose is not None and self.init_copy is not None: + # Particle init + if (self.old_splitting_direction == splittingDirection) or self.old_splitting_direction is None: + evt_init = self.init_copy.launch(self.scalar.gpu_data, + self.particle_scalar.gpu_data) + else: + if self.scalar.domain.dimension == 2: + evt_init = self.init_transpose.launch(self.scalar.gpu_data, + self.particle_scalar.gpu_data) + else: + if min(self.old_splitting_direction, splittingDirection) == 0: + evt_init = self.init_transpose.launch(0, + self.scalar.gpu_data, + self.particle_scalar.gpu_data) + else: + evt_init = self.init_transpose.launch(1, + self.scalar.gpu_data, + self.particle_scalar.gpu_data) + # Advection and Remeshing + evt = self.numMethod.launch(self.velocity.gpu_data[splittingDirection], + self.particle_scalar.gpu_data, + self.scalar.gpu_data, + self.gpu_precision(dt), + self.gpu_precision(self.scalar.domain.origin[splittingDirection]), + self.gpu_precision(self.scalar.domain.step[splittingDirection])) + for df in self.output: + df.contains_data = False + # Get timpings from OpenCL events + self.numMethod.finish() + c_time_init = (evt_init.profile.end - evt_init.profile.start) * 1e-9 + c_time = (evt.profile.end - evt.profile.start) * 1e-9 + self.compute_time[splittingDirection] += c_time + if (self.old_splitting_direction == splittingDirection) or self.old_splitting_direction is None: + self.compute_time_copy[splittingDirection] += c_time_init + else: + self.compute_time_swap[min(self.old_splitting_direction, splittingDirection)] += c_time_init + self.total_time += (c_time + c_time_init) + self.old_splitting_direction = splittingDirection + return (c_time + c_time_init) + + def printComputeTime(self): + self.timings_info[0] = "\"Advection_and_Remesh total\" \"copy\" \"swap xy\" \"swap xz\"" + self.timings_info[0] += " \"Advection_and_Remesh x\" \"Advection_and_Remesh y\" \"Advection_and_Remesh z\" " + self.timings_info[1] = str(self.total_time) + " " + str(self.compute_time_copy[0]) + " " + self.timings_info[1] += str(self.compute_time_swap[0]) + " " + str(self.compute_time_swap[1]) + " " + self.timings_info[1] += str(self.compute_time[0]) + " " + str(self.compute_time[1]) + " " + str(self.compute_time[2]) + " " + print "Advection_and_Remesh total time : ", self.total_time, self.call_number + print "\t Advection_and_Remesh init copy :", self.compute_time_copy, self.init_copy.call_number + print "\t Advection_and_Remesh init swap :", self.compute_time_swap, self.init_transpose.call_number + print "\t Advection_and_Remesh :", self.compute_time, self.numMethod.call_number + + def __str__(self): + s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self) + return s + +if __name__ == "__main__": + print __doc__ + print "- Provided class : Transport_d" + print AdvectionDOp.__doc__ diff --git a/HySoP/hysop/operator/transport.py b/HySoP/hysop/operator/transport.py index 4879d1799ccf49f4f2395705ab8abf287c25b075..b130522ccd68df9fc098920dcc8df3d4a663337d 100644 --- a/HySoP/hysop/operator/transport.py +++ b/HySoP/hysop/operator/transport.py @@ -5,6 +5,7 @@ Transport operator representation. """ from .continuous import ContinuousOperator from .transport_d import Transport_d +from .advec_and_remesh_d import Advec_and_Remesh_d class Transport(ContinuousOperator): @@ -12,7 +13,7 @@ class Transport(ContinuousOperator): Transport operator representation. """ - def __init__(self, velocity, scalar): + def __init__(self, velocity, scalar, include_remesh=False): """ Create an Transport operator from given variables velocity and scalar. @@ -26,6 +27,7 @@ class Transport(ContinuousOperator): self.scalar = scalar self.needSplitting = True self.addVariable([velocity, scalar]) + self.include_remesh = include_remesh def discretize(self, idVelocityD=0, idScalarD=0, result_position=None, result_scalar=None, method=None): """ @@ -39,7 +41,10 @@ class Transport(ContinuousOperator): @param method : the method to use. """ if self.discreteOperator is None: - self.discreteOperator = Transport_d(self, idVelocityD, idScalarD, result_position, result_scalar, method) + if self.include_remesh: + self.discreteOperator = Advec_and_Remesh_d(self, idVelocityD, idScalarD, result_scalar, method) + else: + self.discreteOperator = Transport_d(self, idVelocityD, idScalarD, result_position, result_scalar, method) def __str__(self): """ToString method""" diff --git a/HySoP/hysop/particular_solvers/basic.py b/HySoP/hysop/particular_solvers/basic.py index 8c5544b664219b5c0f0b20493ee19e58de801b78..32c87d746c822be0d37f1c84dd225b7dbd021da8 100644 --- a/HySoP/hysop/particular_solvers/basic.py +++ b/HySoP/hysop/particular_solvers/basic.py @@ -132,7 +132,7 @@ class ParticleSolver(Solver): # op.discretize(result_position=self.p_position, result_scalar=self.p_scalar, method=self.ODESolver) else: op.discretize() - if self.advection is not None: + if self.advection is not None and not self.advection.include_remesh: ## Create remeshing operator self.remeshing = Remeshing(partPositions=self.advection.discreteOperator.res_position, partScalar=self.advection.discreteOperator.res_scalar, @@ -140,13 +140,16 @@ class ParticleSolver(Solver): method=self.RemeshingMethod) for op in self.problem.operators: if op.needSplitting: - if op is self.advection: - index = self.problem.operators.index(op) + index = self.problem.operators.index(op) + if op is self.advection and not self.advection.include_remesh: self.problem.operators[index] = Splitting([op, self.remeshing], dim=self.problem.topology.dim) else: self.problem.operators[index] = Splitting([op], dim=self.problem.topology.dim) self.isInitialized = True + def end(self): + """@TODO : Put here memory free instrutcions""" + def __str__(self): """ToString method""" s = " Particular solver " diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py index cbd48a6bb8dfaa284c57da6694a438b256a6e736..f3c55e6afd46cec922d7dd9b18da4e961a7489d4 100644 --- a/HySoP/hysop/particular_solvers/gpu.py +++ b/HySoP/hysop/particular_solvers/gpu.py @@ -164,43 +164,71 @@ class GPUParticleSolver(ParticleSolver): ## Collect OpenCL source code print "\n Source code:" ## advection - if len(resolution) == 3: - if self.gpu_precision is np.float32: - src_advec = 'advection_3D_opt4_builtin.cl' + if self.advection.include_remesh: + if len(resolution) == 3: + if self.gpu_precision is np.float32: + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4.cl' + else: + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_3D_opt4_builtin_private4_noBC.cl' + advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) + advec_and_remesh_lwi = (int(workItemNumber), 1, 1) else: - src_advec = 'advection_3D_opt2_builtin.cl' - advec_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) - advec_lwi = (int(workItemNumber), 1, 1) + if self.gpu_precision is np.float32: + src_remesh_weights = 'weights_m6prime_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_2D_opt8_builtin_noBC.cl' + else: + src_remesh_weights = 'weights_m6prime_builtin.cl' + src_advec_and_remesh = 'advection_and_remesh_2D_opt4_builtin_noBC.cl' + advec_and_remesh_gwi = (int(workItemNumber), int(resolution[1])) + advec_and_remesh_lwi = (int(workItemNumber), 1) + f = open(os.path.join(GPU_SRC, src_remesh_weights)) + self.gpu_src += "".join(f.readlines()) + print " - remeshing weights:", f.name + f.close() + f = open(os.path.join(GPU_SRC, src_advec_and_remesh)) + print " - advection and remesh:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() else: - if self.gpu_precision is np.float32: - src_advec = 'advection_2D_opt4_builtin.cl' + if len(resolution) == 3: + if self.gpu_precision is np.float32: + src_advec = 'advection_3D_opt4_builtin.cl' + else: + src_advec = 'advection_3D_opt2_builtin.cl' + advec_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) + advec_lwi = (int(workItemNumber), 1, 1) else: - src_advec = 'advection_2D_builtin.cl' - advec_gwi = (int(workItemNumber), int(resolution[1])) - advec_lwi = (int(workItemNumber), 1) - f = open(os.path.join(GPU_SRC, src_advec)) - print " - advection:", f.name - self.gpu_src += "".join(f.readlines()) - f.close() - ## remeshing - if len(resolution) == 3: - src_remesh = 'remeshing_3D_opt4_private4.cl' - src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' - remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) - remesh_lwi = (int(workItemNumber), 1, 1) - else: - src_remesh = 'remeshing_2D_opt4_noBC.cl' - src_remesh_weights = 'weights_m6prime_builtin.cl' - remesh_gwi = (int(workItemNumber), int(resolution[1])) - remesh_lwi = (int(workItemNumber), 1) - f = open(os.path.join(GPU_SRC, src_remesh_weights)) - self.gpu_src += "".join(f.readlines()) - print " - remeshing weights:", f.name - f.close() - f = open(os.path.join(GPU_SRC, src_remesh)) - print " - remeshing:", f.name - self.gpu_src += "".join(f.readlines()) - f.close() + if self.gpu_precision is np.float32: + src_advec = 'advection_2D_opt4_builtin.cl' + else: + src_advec = 'advection_2D_builtin.cl' + advec_gwi = (int(workItemNumber), int(resolution[1])) + advec_lwi = (int(workItemNumber), 1) + f = open(os.path.join(GPU_SRC, src_advec)) + print " - advection:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() + ## remeshing + if len(resolution) == 3: + src_remesh = 'remeshing_3D_opt4_private4.cl' + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) + remesh_lwi = (int(workItemNumber), 1, 1) + else: + src_remesh = 'remeshing_2D_opt4_noBC.cl' + src_remesh_weights = 'weights_m6prime_builtin.cl' + remesh_gwi = (int(workItemNumber), int(resolution[1])) + remesh_lwi = (int(workItemNumber), 1) + f = open(os.path.join(GPU_SRC, src_remesh_weights)) + self.gpu_src += "".join(f.readlines()) + print " - remeshing weights:", f.name + f.close() + f = open(os.path.join(GPU_SRC, src_remesh)) + print " - remeshing:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() ## copy if len(resolution) == 3: if self.gpu_precision is np.float32: @@ -210,20 +238,20 @@ class GPUParticleSolver(ParticleSolver): copy_lwi = (4, 8, 1) else: src_copy = 'copy_3D_locMem.cl' - build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=4" - copy_gwi = (int(resolution[0]), int(resolution[1] / 8), int(resolution[2])) - copy_lwi = (32, 4, 1) + build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=8" + copy_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2])) + copy_lwi = (32, 8, 1) else: if self.gpu_precision is np.float32: src_copy = 'copy_2D_opt2.cl' - build_options += " -D TILE_DIM_COPY=128 -D BLOCK_ROWS_COPY=4" - copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 32)) - copy_lwi = (64, 4) + build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8" + copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 2)) + copy_lwi = (8, 8) else: - src_copy = 'copy_2D.cl' + src_copy = 'copy_2D_opt2.cl' build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2" - copy_gwi = (int(resolution[0]), int(resolution[1] / 16)) - copy_lwi = (32, 2) + copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 16)) + copy_lwi = (16, 2) f = open(os.path.join(GPU_SRC, src_copy)) print " - copy:", f.name self.gpu_src += "".join(f.readlines()) @@ -232,9 +260,9 @@ class GPUParticleSolver(ParticleSolver): if len(resolution) == 3: if self.gpu_precision is np.float32: src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl' - build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" - transpose_xy_gwi = (int(resolution[0] / 2), int(resolution[1] / 4), int(resolution[2])) - transpose_xy_lwi = (16, 8, 1) + build_options += " -D TILE_DIM_XY=16 -D BLOCK_ROWS_XY=8" + transpose_xy_gwi = (int(resolution[0] / 2), int(resolution[1] / 2), int(resolution[2])) + transpose_xy_lwi = (8, 8, 1) src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl' build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4)) @@ -249,10 +277,16 @@ class GPUParticleSolver(ParticleSolver): transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4)) transpose_xz_lwi = (8, 2, 2) else: - src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' - build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2" - transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 16) - transpose_xy_lwi = (8, 2) + if self.gpu_precision is np.float32: + src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" + transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 4) + transpose_xy_lwi = (8, 8) + else: + src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2" + transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 16) + transpose_xy_lwi = (8, 2) f = open(os.path.join(GPU_SRC, src_transpose_xy)) print " - transpose_xy:", f.name self.gpu_src += "".join(f.readlines()) @@ -391,6 +425,11 @@ class GPUParticleSolver(ParticleSolver): self.queue, remesh_gwi, remesh_lwi)) + elif sop.discreteOperator.name == 'advection_and_remeshing': + sop.setMethod(KernelLauncher(self.prg.advection_and_remeshing, + self.queue, + advec_and_remesh_gwi, + advec_and_remesh_lwi)) else: raise ValueError("Unknown operator : " + sop.discreteOperator.name) else: @@ -420,6 +459,21 @@ class GPUParticleSolver(ParticleSolver): self.problem.io.set_get_data_method(self.get_data_from_device) print "===\n" + def end(self): + print "=== OpenCL Buffer deallocations ===" + for f in self.problem.variables: + print f.name, "deallocate " + temp_mem_used = 0 + for df in f.discreteField: + if f.vector: + for d in xrange(len(df.data)): + if isinstance(df.gpu_data[d], cl.MemoryObject): + df.gpu_data[d].release() + else: + if isinstance(df.gpu_data, cl.MemoryObject): + df.gpu_data.release() + print "===\n" + def get_data_from_device(self, field): """Data collect method.""" deviceToHost(self.queue, field) diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt4_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt4_builtin_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..2eaec614a26d971b549d417d48d85a2ce4f5dd80 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt4_builtin_noBC.cl @@ -0,0 +1,95 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + v = vload4((i+gidY*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.x; + gvelo_loc[noBC_id(i+1,nb_part)] = v.y; + gvelo_loc[noBC_id(i+2,nb_part)] = v.z; + gvelo_loc[noBC_id(i+3,nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt8_builtin_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt8_builtin_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..289b9d4a6a0c4fc9bd2f84c09e96d294385485ed --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_2D_opt8_builtin_noBC.cl @@ -0,0 +1,157 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt,float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int8 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float8 p, s, y, gs, v, vp, coord; + uint8 index8; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*8; i<WIDTH; i+=(WGN*8)) + { + v = vload8((i+gidY*WIDTH)/8, gvelo); + gvelo_loc[noBC_id(i,nb_part)] = v.s0; + gvelo_loc[noBC_id(i+1,nb_part)] = v.s1; + gvelo_loc[noBC_id(i+2,nb_part)] = v.s2; + gvelo_loc[noBC_id(i+3,nb_part)] = v.s3; + gvelo_loc[noBC_id(i+4,nb_part)] = v.s4; + gvelo_loc[noBC_id(i+5,nb_part)] = v.s5; + gvelo_loc[noBC_id(i+6,nb_part)] = v.s6; + gvelo_loc[noBC_id(i+7,nb_part)] = v.s7; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + gscal_loc[i+4] = 0.0; + gscal_loc[i+5] = 0.0; + gscal_loc[i+6] = 0.0; + gscal_loc[i+7] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=8) + { + v = (float8)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)], + gvelo_loc[noBC_id(i+4,nb_part)], + gvelo_loc[noBC_id(i+5,nb_part)], + gvelo_loc[noBC_id(i+6,nb_part)], + gvelo_loc[noBC_id(i+7,nb_part)]); + coord = (float8)(i*dx, + (i+1)*dx, + (i+2)*dx, + (i+3)*dx, + (i+4)*dx, + (i+5)*dx, + (i+6)*dx, + (i+7)*dx); + p = fma((float8)(0.5*dt),v,coord); + ind = convert_int8_rtn(p * invdx); + y = (fma(- convert_float8(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float8)(gvelo_loc[noBC_id(i_ind.s0,nb_part)], + gvelo_loc[noBC_id(i_ind.s1,nb_part)], + gvelo_loc[noBC_id(i_ind.s2,nb_part)], + gvelo_loc[noBC_id(i_ind.s3,nb_part)], + gvelo_loc[noBC_id(i_ind.s4,nb_part)], + gvelo_loc[noBC_id(i_ind.s5,nb_part)], + gvelo_loc[noBC_id(i_ind.s6,nb_part)], + gvelo_loc[noBC_id(i_ind.s7,nb_part)]); + vp = (float8)(gvelo_loc[noBC_id(i_ind_p.s0,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s1,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s2,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s3,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s4,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s5,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s6,nb_part)], + gvelo_loc[noBC_id(i_ind_p.s7,nb_part)]); + p = fma(mix(v,vp,y),dt,coord); + + s = vload8((i + gidY*WIDTH)/8, pscal); + ind = convert_int8_rtn(p * invdx); + y = (p - convert_float8(ind) * dx) * invdx; + index8 = convert_uint8((ind - 2 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index8.s0,nb_part)] += (alpha(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (alpha(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (alpha(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (alpha(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (alpha(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (alpha(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (alpha(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (alpha(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (beta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (beta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (beta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (beta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (beta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (beta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (beta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (beta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (gamma(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (gamma(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (gamma(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (gamma(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (gamma(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (gamma(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (gamma(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (gamma(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (delta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (delta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (delta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (delta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (delta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (delta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (delta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (delta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (eta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (eta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (eta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (eta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (eta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (eta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (eta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (eta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + index8 = (index8 + 1) % WIDTH; + gscal_loc[noBC_id(index8.s0,nb_part)] += (zeta(y.s0,s.s0)); + gscal_loc[noBC_id(index8.s1,nb_part)] += (zeta(y.s1,s.s1)); + gscal_loc[noBC_id(index8.s2,nb_part)] += (zeta(y.s2,s.s2)); + gscal_loc[noBC_id(index8.s3,nb_part)] += (zeta(y.s3,s.s3)); + gscal_loc[noBC_id(index8.s4,nb_part)] += (zeta(y.s4,s.s4)); + gscal_loc[noBC_id(index8.s5,nb_part)] += (zeta(y.s5,s.s5)); + gscal_loc[noBC_id(index8.s6,nb_part)] += (zeta(y.s6,s.s6)); + gscal_loc[noBC_id(index8.s7,nb_part)] += (zeta(y.s7,s.s7)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*8; i<WIDTH; i+=(WGN*8)) + vstore8((float8)(gscal_loc[noBC_id(i,nb_part)], + gscal_loc[noBC_id(i+1,nb_part)], + gscal_loc[noBC_id(i+2,nb_part)], + gscal_loc[noBC_id(i+3,nb_part)], + gscal_loc[noBC_id(i+4,nb_part)], + gscal_loc[noBC_id(i+5,nb_part)], + gscal_loc[noBC_id(i+6,nb_part)], + gscal_loc[noBC_id(i+7,nb_part)]), (i + gidY*WIDTH)/8, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..46e9bc59b2ea793d24cd36c429a41bb0e448ff8e --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4.cl @@ -0,0 +1,101 @@ + +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[i] = v.x; + gvelo_loc[i+1] = v.y; + gvelo_loc[i+2] = v.z; + gvelo_loc[i+3] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = vload4(i/4, gvelo_loc); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[i_ind.x], gvelo_loc[i_ind.y], + gvelo_loc[i_ind.z], gvelo_loc[i_ind.w]); + vp = (float4)(gvelo_loc[i_ind_p.x], gvelo_loc[i_ind_p.y], + gvelo_loc[i_ind_p.z], gvelo_loc[i_ind_p.w]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + barrier(CLK_LOCAL_MEM_FENCE); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (beta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + //barrier(CLK_LOCAL_MEM_FENCE); + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..5d221f67393d24f1f7614f422ef0396ad22d8c20 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_and_remesh_3D_opt4_builtin_private4_noBC.cl @@ -0,0 +1,101 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void advection_and_remeshing(__global const float* gvelo, + __global const float* pscal, + __global float* gscal, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind, i_ind, i_ind_p; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs, v, vp, coord; + uint4 index4; + + __local float gscal_loc[WIDTH]; + __local float gvelo_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + gvelo_loc[noBC_id(i, nb_part)] = v.x; + gvelo_loc[noBC_id(i+1, nb_part)] = v.y; + gvelo_loc[noBC_id(i+2, nb_part)] = v.z; + gvelo_loc[noBC_id(i+3, nb_part)] = v.w; + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + v = (float4)(gvelo_loc[noBC_id(i,nb_part)], + gvelo_loc[noBC_id(i+1,nb_part)], + gvelo_loc[noBC_id(i+2,nb_part)], + gvelo_loc[noBC_id(i+3,nb_part)]); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(gvelo_loc[noBC_id(i_ind.x,nb_part)], gvelo_loc[noBC_id(i_ind.y,nb_part)], + gvelo_loc[noBC_id(i_ind.z,nb_part)], gvelo_loc[noBC_id(i_ind.w,nb_part)]); + vp = (float4)(gvelo_loc[noBC_id(i_ind_p.x,nb_part)], gvelo_loc[noBC_id(i_ind_p.y,nb_part)], + gvelo_loc[noBC_id(i_ind_p.z,nb_part)], gvelo_loc[noBC_id(i_ind_p.w,nb_part)]); + p=fma(mix(v,vp,y),dt,coord); + + s = vload4((i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/problem/problem.py b/HySoP/hysop/problem/problem.py index c78606626bfc4a46c27a2f252e906a89c5d76825..fce59388f3b7326dee9ce3c4511c1e46ce3a6681 100644 --- a/HySoP/hysop/problem/problem.py +++ b/HySoP/hysop/problem/problem.py @@ -100,6 +100,7 @@ class Problem(): self.timer.step() self.io.step() print "\n\n End solving\n" + self.solver.end() print "=== Timings ===" if ( self.topology.rank == 0 ) : print "IO total", self.io.compute_time diff --git a/HySoP/hysop/tools/kernel_benchmark.py b/HySoP/hysop/tools/kernel_benchmark.py index a87735bd4156be57a3a617387256d93790a4175d..83af9c94a757fa6b0a7b798451da1924746d7ba2 100644 --- a/HySoP/hysop/tools/kernel_benchmark.py +++ b/HySoP/hysop/tools/kernel_benchmark.py @@ -194,8 +194,11 @@ class BenchmarkSuite: f.write(k + " ") f.write(v + " ") f.write(c + " ") - Nx = s[0] - f.write(str(eval(c)) + ' ') + #print c + cs = c.replace('Nx', str(s[0])).replace('x', '*').split('_') + cse = cs[0]+'/'+cs[1][1] if len(cs) == 2 else cs[0] + #print cse + f.write(str(eval(cse)) + ' ') try: f.write(str(self.timings[k][v][c][s]) + "\n") except: diff --git a/HySoP/setup.py.in b/HySoP/setup.py.in index 098bcc29d0c516836267366604bc2c05813f0e96..8aa8053048dd4f0429d0de156a8874cc01759d47 100644 --- a/HySoP/setup.py.in +++ b/HySoP/setup.py.in @@ -101,7 +101,7 @@ config = Configuration(name=name, data_files=[('./parmepy/particular_solvers/gpu_src', ['@CMAKE_SOURCE_DIR@/parmepy/particular_solvers/gpu_src/' + cl_file for cl_file in os.listdir('@CMAKE_SOURCE_DIR@/parmepy/particular_solvers/gpu_src/') - if cl_file[-3:]=='.cl'])] + if cl_file[0]!='.' and cl_file[-3:]=='.cl'])] ) setup(**config.todict())