From 77d76eee68688fe603f3c57076186ff46d7c83f4 Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr> Date: Fri, 16 Mar 2012 09:27:43 +0000 Subject: [PATCH] Fix GPU code. Still a bug in sequential (numpy) code. --- parmepy/examples/mainJM.py | 11 +- parmepy/examples/main_Rotating_2D.py | 6 +- parmepy/examples/main_Shear_2D.py | 71 ++++ parmepy/new_ParMePy/Operator/AdvectionDOp.py | 19 +- parmepy/new_ParMePy/Operator/RemeshingDOp.py | 53 +-- .../Problem/DiscreteTransportProblem.py | 2 +- .../new_ParMePy/Utils/GPUParticularSolver.py | 16 +- parmepy/new_ParMePy/Utils/Printer.py | 13 +- parmepy/new_ParMePy/Utils/gpu_src.cl | 319 ++++++++++-------- 9 files changed, 294 insertions(+), 216 deletions(-) create mode 100644 parmepy/examples/main_Shear_2D.py diff --git a/parmepy/examples/mainJM.py b/parmepy/examples/mainJM.py index 018181072..eec917abb 100644 --- a/parmepy/examples/mainJM.py +++ b/parmepy/examples/mainJM.py @@ -17,14 +17,21 @@ def scalaire(x): def run(): # Parameter dim = 3 - nb = 32 + nb = 128 boxLength = [1., 1., 1.] boxMin = [0., 0., 0.] nbElem = [nb, nb, nb] + + # dim = 2 + # nb = 64 + # boxLength = [1., 1.] + # boxMin = [0., 0.] + # nbElem = [nb, nb] + timeStep = 0.02 FinalTime = 1. outputFilePrefix = './res/RK2_' - outputModulo = 1 + outputModulo = 2 t0 = time.time() diff --git a/parmepy/examples/main_Rotating_2D.py b/parmepy/examples/main_Rotating_2D.py index bb4313c19..742a5d5cf 100644 --- a/parmepy/examples/main_Rotating_2D.py +++ b/parmepy/examples/main_Rotating_2D.py @@ -18,7 +18,7 @@ def scalaire(x): def run(): dim = 2 - nb = 100 + nb = 128 boxLength = [2., 2.] boxMin = [-1., -1.] nbElem = [nb, nb] @@ -48,9 +48,9 @@ def run(): RemeshingMethod = pp.Utils.M6Prime(grid=box.discreteDomain, gvalues=scal.discreteVariable) ODESolver = pp.Utils.RK2(InterpolationMethod, box.discreteDomain.applyConditions) ## cpu - solver = pp.Utils.ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod) + #solver = pp.Utils.ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod) ## gpu - #solver = pp.GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod, device_type='gpu') + solver = pp.GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod, device_type='gpu') cTspPb.setSolver(solver) cTspPb.setPrinter(pp.Utils.Printer(frequency=outputModulo, outputPrefix=outputFilePrefix, problem=cTspPb)) diff --git a/parmepy/examples/main_Shear_2D.py b/parmepy/examples/main_Shear_2D.py new file mode 100644 index 000000000..5fe230160 --- /dev/null +++ b/parmepy/examples/main_Shear_2D.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +import time +import new_ParMePy as pp +import math + + +def vitesse(x): + r = math.sqrt(x[0] * x[0] + x[1] * x[1]) + c = math.cos(3 * math.pi * r / 2.) + return [-c * x[1], c * x[0]] + + +def scalaire(x): + r = math.sqrt(x[0] * x[0] + x[1] * x[1]) + if r < 1: + return (1. - r * r) ** 6 + else: + return 0. + + +def run(): + dim = 2 + nb = 128 + boxLength = [2., 2.] + boxMin = [-1., -1.] + nbElem = [nb, nb] + + timeStep = 0.02 + FinalTime = 1. + outputFilePrefix = './res/RK2_' + outputModulo = 1 + + t0 = time.time() + + box = pp.Domain.Box(dimension=dim, + length=boxLength, + minPosition=boxMin) + velo = pp.Variable.AnalyticalVariable(domain=box, + dimension=dim, + formula=vitesse) + scal = pp.Variable.AnalyticalVariable(domain=box, + dimension=1, + formula=scalaire, scalar=True) + advec = pp.Operator.Advection(velo, scal) + cTspPb = pp.Problem.ContinuousTransportProblem(advection=advec) + + cTspPb.discretize(dSpec=[nbElem]) + + InterpolationMethod = pp.Utils.Linear(grid=box.discreteDomain, gvalues=velo.discreteVariable) + RemeshingMethod = pp.Utils.M6Prime(grid=box.discreteDomain, gvalues=scal.discreteVariable) + ODESolver = pp.Utils.RK2(InterpolationMethod, box.discreteDomain.applyConditions) + ## cpu + #solver = pp.Utils.ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod) + ## gpu + solver = pp.GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod, device_type='gpu') + cTspPb.setSolver(solver) + cTspPb.setPrinter(pp.Utils.Printer(frequency=outputModulo, outputPrefix=outputFilePrefix, problem=cTspPb)) + + t1 = time.time() + + cTspPb.solve(T=FinalTime, dt=timeStep) + + tf = time.time() + + print "Total time : ", tf - t0, "sec (CPU)" + print "Init time : ", t1 - t0, "sec (CPU)" + print "Solving time : ", tf - t1, "sec (CPU)" + + +if __name__ == "__main__": + run() diff --git a/parmepy/new_ParMePy/Operator/AdvectionDOp.py b/parmepy/new_ParMePy/Operator/AdvectionDOp.py index 9af9d2d96..b9ea41436 100644 --- a/parmepy/new_ParMePy/Operator/AdvectionDOp.py +++ b/parmepy/new_ParMePy/Operator/AdvectionDOp.py @@ -56,9 +56,6 @@ class AdvectionDOp(DiscreteOperator): @param dt : time step for advection. @param splittingDirection : direction to advect. """ - #for i in self.velocity.domain.explore(): - # self.resultPosition.values[i] = self.velocity.domain.applyConditions(self.numMethod.integrate(self.velocity.domain.points[i], self.velocity.values[i], t, dt, splittingDirection)) - #self.resultScalar.values[i] = self.scalar.values[i] if self.gpu_kernel is None: c_time = time.time() ## ------------------- NumPy Optimisation @@ -67,14 +64,22 @@ class AdvectionDOp(DiscreteOperator): ## ------------------- self.compute_time += (time.time() - c_time) else: - evt = self.gpu_kernel(self.gpu_queue, self.gpu_shape, None, + l = list(self.gpu_shape) + nb_ligne = l.pop(splittingDirection) + evt = self.gpu_kernel(self.gpu_queue, tuple(l), None, self.velocity.gpu_mem_object, self.scalar.gpu_mem_object, self.resultPosition.gpu_mem_object, self.resultScalar.gpu_mem_object, - np.float32(t), np.float32(dt), np.uint(splittingDirection), - np.concatenate((self.velocity.domain.min, [1.])).astype(np.float32), - np.concatenate((self.velocity.domain.max, [1.])).astype(np.float32)) + np.float32(t), np.float32(dt), np.uint32(splittingDirection), np.uint32(nb_ligne), + self.velocity.domain.min.astype(np.float32), + self.velocity.domain.max.astype(np.float32), + self.velocity.domain.elementNumber.astype(np.uint32), + self.velocity.domain.elementSize.astype(np.float32) + ) + # self.gpu_queue.finish() + # cl.enqueue_copy(self.gpu_queue, self.resultPosition.values, self.resultPosition.gpu_mem_object) + # print self.resultPosition.values self.gpu_queue.finish() self.queued_time += (evt.profile.submit - evt.profile.queued) self.submit_time += (evt.profile.start - evt.profile.submit) diff --git a/parmepy/new_ParMePy/Operator/RemeshingDOp.py b/parmepy/new_ParMePy/Operator/RemeshingDOp.py index 9efb9a8b3..745c927ec 100644 --- a/parmepy/new_ParMePy/Operator/RemeshingDOp.py +++ b/parmepy/new_ParMePy/Operator/RemeshingDOp.py @@ -65,31 +65,6 @@ class RemeshingDOp(DiscreteOperator): """ Apply Remeshing operator. """ - #reset scalar - #for i in self.resultScalar.domain.explore(): - # self.resultScalar.values[i] = 0. - #### Remaillage tag: - # if isinstance(self.numMethod, Lambda2): - # for i in self.resultScalar.domain.explore(): - # # Particules Taguées Centrées (tag = 1, bloc Type = 1) - # if self.ptag.values[i] == 1 and self.pbloc.values[i] == 1: - # ip = tuple([(i[d] + 1) % self.resultScalar.domain.elementNumber[d] if d == splittingDirection else i[d] for d in xrange(len(i))]) - # self.numMethod.remesh_Tag_Center(self.ppos.values[i], self.pscal.values[i], self.ppos.values[ip], self.pscal.values[ip], splittingDirection) - # # Particules Taguées Gauche (tag = 1, bloc Type = 0) - # elif self.ptag.values[i] == 1 and self.pbloc.values[i] == 0: - # ip = tuple([(i[d] + 1) % self.resultScalar.domain.elementNumber[d] if d == splittingDirection else i[d] for d in xrange(len(i))]) - # self.numMethod.remesh_Tag_Left(self.ppos.values[i], self.pscal.values[i], self.ppos.values[ip], self.pscal.values[ip], splittingDirection) - # # Particules non-Taguées Centrées (tag = 0, bloc Type = 1) - # elif self.ptag.values[i] == 0 and self.pbloc.values[i] == 1: - # self.numMethod.remesh_noTag_Center(self.ppos.values[i], self.pscal.values[i], splittingDirection) - # # Particules non-Taguées Gauche (tag = 0, bloc Type = 0) - # elif self.ptag.values[i] == 0 and self.pbloc.values[i] == 0: - # self.numMethod.remesh_noTag_Left(self.ppos.values[i], self.pscal.values[i], splittingDirection) - # elif self.ptag.values[i] != 2: - # raise Exception("Bad tag or bloc type : (tag = {0}, type = {1})".format(self.ptag.values[i], self.pbloc.values[i])) - # else: - #for i in self.resultScalar.domain.explore(): - # self.numMethod.remesh(self.ppos.values[i], self.pscal.values[i], splittingDirection) c_time = time.time() if self.gpu_kernel is None: c_time = time.time() @@ -99,26 +74,24 @@ class RemeshingDOp(DiscreteOperator): ## ------------------- self.compute_time += (time.time() - c_time) else: - evt = self.gpu_kernel[0](self.gpu_queue, self.gpu_shape, None, - self.ppos.gpu_mem_object, self.pscal.gpu_mem_object, self.resultScalar.gpu_mem_object, self.gpu_buffer, - np.float32(t), np.float32(dt), np.int32(splittingDirection), - np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32), - np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32)) + l = list(self.gpu_shape) + nb_ligne = l.pop(splittingDirection) + evt = self.gpu_kernel(self.gpu_queue, tuple(l), None, + self.ppos.gpu_mem_object, self.pscal.gpu_mem_object, self.resultScalar.gpu_mem_object, + np.float32(t), np.float32(dt), np.uint32(splittingDirection), np.uint32(nb_ligne), + self.resultScalar.domain.min.astype(np.float32), + self.resultScalar.domain.max.astype(np.float32), + self.resultScalar.domain.elementNumber.astype(np.uint32), + self.resultScalar.domain.elementSize.astype(np.float32) + ) + # self.gpu_queue.finish() + # cl.enqueue_copy(self.gpu_queue, self.resultScalar.values, self.resultScalar.gpu_mem_object) + # print self.resultScalar.values self.gpu_queue.finish() self.queued_time += (evt.profile.submit - evt.profile.queued) self.submit_time += (evt.profile.start - evt.profile.submit) self.compute_time += (evt.profile.end - evt.profile.start) #print "Remeshing M'6:", self.compute_time * 1e-9 - evt = self.gpu_kernel[1](self.gpu_queue, self.gpu_shape, None, - self.resultScalar.gpu_mem_object, self.gpu_buffer, - np.float32(t), np.float32(dt), np.int32(splittingDirection), - np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32), - np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32)) - self.gpu_queue.finish() - self.queued_time += (evt.profile.submit - evt.profile.queued) - self.submit_time += (evt.profile.start - evt.profile.submit) - self.compute_time += (evt.profile.end - evt.profile.start) - #print "Remeshing reduce:", self.compute_time * 1e-9 def __str__(self): """ToString method""" diff --git a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py index e42a7471a..224d311d0 100644 --- a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py +++ b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py @@ -47,7 +47,7 @@ class DiscreteTransportProblem(DiscreteProblem): @param dt : Simulation time step. """ ite = 0 - if self.printer != None: + if not self.printer is None: while self.t < T: [[op.applyOperator(self.t, dt * i[1], i[0]) for op in self.operators] for i in self.splittingStep] self.t += dt diff --git a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py index c9183718d..69aaac6e0 100644 --- a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py +++ b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py @@ -121,13 +121,15 @@ class GPUParticularSolver(ParticularSolver): discreteProblem.advecOp.gpu_kernel = self.prg.integrate remesh.gpu_queue = self.queue remesh.gpu_shape = self.gpu_shape - remesh.gpu_kernel = (self.prg.remeshing, self.prg.remeshing_reduce) - remesh.gpu_buffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=self.gscal.values.nbytes * 8) - self.gpu_used_mem += remesh.gpu_buffer.size - print "Allocation remeshing buffer on gpu, size:", remesh.gpu_buffer.size, 'B' - buffer_shape = list(self.gpu_shape) - buffer_shape.append(8) - remesh.gpu_buffer_values = np.zeros(tuple(buffer_shape), dtype=np.float32) + remesh.gpu_kernel = self.prg.remeshing + + # Modifying domains parameters to fit with float4 or int4 opencl objects + padding_float = np.zeros(4 - self.grid.dimension, dtype=np.float32) + padding_uint = np.zeros(4 - self.grid.dimension, dtype=np.uint) + self.grid.min = np.concatenate((self.grid.min, padding_float)).astype(np.float32) + self.grid.max = np.concatenate((self.grid.max, padding_float)).astype(np.float32) + self.grid.elementSize = np.concatenate((self.grid.elementSize, padding_float)).astype(np.float32) + self.grid.elementNumber = np.concatenate((self.grid.elementNumber, padding_uint)).astype(np.uint32) self.queue.finish() p = self.gpu_used_mem * 100 / (self.device.global_mem_size * 1.) diff --git a/parmepy/new_ParMePy/Utils/Printer.py b/parmepy/new_ParMePy/Utils/Printer.py index c90b14bf3..b30981eb8 100644 --- a/parmepy/new_ParMePy/Utils/Printer.py +++ b/parmepy/new_ParMePy/Utils/Printer.py @@ -30,24 +30,23 @@ class Printer(): self.printStep = self._printStep else: self.printStep = self._passStep + self.skip = 1 self.grid = self.problem.discreteProblem.advecOp.velocity.domain self.gvelo = self.problem.discreteProblem.advecOp.velocity.values self.ppos = self.problem.discreteProblem.solver.ppos.values self.pscal = self.problem.discreteProblem.solver.pscal.values self.gscal = self.problem.discreteProblem.advecOp.scalar.values - - #self.pvelo = self.problem.discreteProblem.solver.pvelo - # self.pbloc = self.problem.discreteProblem.solver.pbloc - # self.ptag = self.problem.discreteProblem.solver.ptag self.printStep() def _passStep(self): pass def _printStep(self): - if (self.ite % self.freq) == 0: + self.skip -= 1 + if self.skip == 0: + self.skip = self.freq f = open(self.outputPrefix + "results_{0:05d}.dat".format(self.ite), 'w') - dim = len(self.grid.elementNumber) + dim = self.grid.dimension f.write("# iteration : {0}, time = {1}".format(self.ite, self.problem.discreteProblem.t)) f.write("# gid({0}) gpos({1}) gvelo({2}) gscal(1) ppos({3}) pvelo({4}) pscal(1) pbloc(1) ptag(1)\n".format(dim, dim, dim, dim, dim)) if isinstance(self.problem.discreteProblem.solver, GPUParticularSolver): @@ -98,7 +97,7 @@ class Printer(): f.write("\n") f.write("\n") f.close() - self.ite += 1 + self.ite += 1 if __name__ == "__main__": diff --git a/parmepy/new_ParMePy/Utils/gpu_src.cl b/parmepy/new_ParMePy/Utils/gpu_src.cl index ec0e6a207..b671bff9b 100644 --- a/parmepy/new_ParMePy/Utils/gpu_src.cl +++ b/parmepy/new_ParMePy/Utils/gpu_src.cl @@ -1,84 +1,105 @@ -size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec); -#if ORDER == 'F' -size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec) -{ - //F order -#if DIM == 2 - return ind[0] + ind[1] * get_global_size(0) + ind[2] * get_global_size(0) * get_global_size(1) + d * get_global_size(0) * get_global_size(1) * get_global_size(2); -#else - return ind[0] + ind[1] * get_global_size(0) + d * get_global_size(0) * get_global_size(1); -#endif -} -#else -size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec) -{ - //C order -#if DIM == 3 - return dim_vec * (ind[2] + ind[1] * get_global_size(2) + ind[0] * get_global_size(2) * get_global_size(1)) + d; -#else - return dim_vec * (ind[1] + ind[0] * get_global_size(1)) + d; -#endif -} -#endif - - // Integration Kernel (RK2 method) __kernel void integrate(__global const float* gvelo, - __global const float* gscal, + __global float* gscal, __global float* ppos, __global float* pscal, - float t, - float dt, - size_t dir, - float4 min_v, - float4 max_v + float t, + float dt, + size_t dir, + size_t line_nb_pts, + float4 min_v, + float4 max_v, + uint4 nb, + float4 size ) { - __private size_t i, stride_vec, stride_part_dir, ind, ind_scal, ind_interp_dir, ind_shift, ind_shift_p; - __private size_t ind_d[DIM], elementNumber[DIM]; - __private float v_loc_dir, ppos_loc_dir, x0; - __private float ppos_loc_d[DIM], elementSize[DIM]; - stride_vec = 1; - stride_part_dir = DIM; - // Compute global index and buffer index - // Compute element number and size - // Compute grid position from index - for(i=0;i<DIM;i++) + __private size_t i; // DIM loop index + __private size_t p; // Particle line loop index + __private size_t stride_vec; // Buffer stride between two components of the same particle + __private size_t stride_along_dir; // Buffer stride between two particle of the same line + __private size_t ind_line; // Line begin buffer index + __private size_t i_ind; // Interpolation left point buffer index + __private size_t i_ind_p; // Interpolation right point buffer index + __private int ind_c; // Interpolation grid point + __private float x0; // Interpolation distance from nearest left grid point + __private float size_line; // Space step along line + __private float p_velo; // Particle interpolated velocity + __private float ppos_c; // Particle position temp + __private float line_c[DIM]; // Particle final position + // Compute strides and index. Depend on array order (C order here) + stride_vec = 1; // Les composantes des vecteurs sont à la suite dans les buffers + if(DIM == 3) { - ind_d[i] = get_global_id(i); - elementNumber[i] = get_global_size(i); - elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]); - ppos_loc_d[i] = ind_d[i]*elementSize[i] + min_v[i]; + if (dir == 0) + { + stride_along_dir = DIM *nb[2] * nb[1]; // avancer de 1 dans la direction 0 + ind_line = DIM * (get_global_id(1) + get_global_id(0)*nb[2]); // indice du point 0,y,z,0 = DIM*(z+y*nb[2]) + line_c[1] = min_v[1] + get_global_id(0)*size[1]; + line_c[2] = min_v[2] + get_global_id(1)*size[2]; + size_line = size[0]; + } + else if (dir == 1) + { + stride_along_dir = DIM*nb[2]; // Avancer de 1 dans la direction 1 + ind_line = DIM * (get_global_id(1) + get_global_id(0) * nb[2] * nb[1]); // indice du point x,0,z,0 = DIM(z+x*nb[2]*nb[1]) + line_c[0] = min_v[0] + get_global_id(0)*size[0]; + line_c[2] = min_v[2] + get_global_id(1)*size[2]; + size_line =size[1]; + } + else + { + stride_along_dir = DIM; // Avancer de 1 dans la direction 2 + ind_line = DIM * (get_global_id(1) * nb[2] + get_global_id(0) * nb[2] * nb[1]); // indice du point x,y,0,0 = DIM*(y*nb[2]+x*nb[2]*nb[1]) + line_c[0] = min_v[0] + get_global_id(0)*size[0]; + line_c[1] = min_v[1] + get_global_id(1)*size[1]; + size_line = size[2]; + } } - // Compute buffer index from global index (vector|scalar buffer) - ind = space_index_to_buffer_index(ind_d, 0, DIM); - ind_scal = space_index_to_buffer_index(ind_d, 0, 1); - // Buffer read velocity (dir component) - v_loc_dir = gvelo[ind + dir * stride_vec]; - // RK2 intermediar point (dir component) - ppos_loc_dir = ppos_loc_d[dir] + 0.5f * dt * v_loc_dir; - // Compute left index of intermediar point (dir component) - ind_interp_dir = (int)((ppos_loc_dir - min_v[dir]) / elementSize[dir]); - // Compute distance to left grid point - x0 = (ppos_loc_dir - ind_interp_dir * elementSize[dir] - min_v[dir]) / elementSize[dir]; - // Compute effective shift grid index to read Velocity - ind_shift = ((ind_interp_dir + elementNumber[dir]) % elementNumber[dir]) - ind_d[dir]; - ind_shift_p = ((ind_interp_dir + 1 + elementNumber[dir]) % elementNumber[dir]) - ind_d[dir]; - // Compute interpolatex velocity - v_loc_dir = (1.0f - x0) * gvelo[ind + ind_shift * stride_part_dir] + x0 * gvelo[ind + ind_shift_p * stride_part_dir]; - // Compute final position - ppos_loc_dir = ppos_loc_d[dir] + dt * v_loc_dir; - // Applying boundary conditions on new position - if (ppos_loc_dir >= max_v[dir]) - ppos_loc_d[dir] = ppos_loc_dir - max_v[dir] + min_v[dir]; - else if (ppos_loc_dir < min_v[dir]) - ppos_loc_d[dir] = ppos_loc_dir + max_v[dir] - min_v[dir]; else - ppos_loc_d[dir] = ppos_loc_dir; - // Writing particles position and scalar to results buffer - for(i=0; i<DIM; i++) - ppos[ind + i*stride_vec] = ppos_loc_d[i]; - pscal[ind_scal] = gscal[ind_scal]; + { + if (dir == 0) + { + stride_along_dir = DIM * nb[1]; // Avancer de 1 dans la direction 0 + ind_line = DIM * (get_global_id(0)); // indice du point 0,y,0 + line_c[1] = min_v[1] + get_global_id(0)*size[1]; + size_line = size[0]; + } + else + { + stride_along_dir = DIM; // Avancer de 1 dans la direction 1 + ind_line = DIM * (get_global_id(0) * nb[1]); // indice du point x,0,0 + line_c[0] = min_v[0] + get_global_id(0)*size[0]; + size_line = size[1]; + } + } + for(p=0;p<line_nb_pts;p++) + { + // Particles scalar initialisation + pscal[(ind_line + p*stride_along_dir)/DIM] = gscal[(ind_line + p*stride_along_dir)/DIM]; + // Compute intermediar point + ppos_c = min_v[dir] + p*size_line + 0.5f*dt*gvelo[ind_line + p*stride_along_dir + dir*stride_vec]; + // Compute nearest left grid point + ind_c = convert_int_rtn((ppos_c - min_v[dir])/size_line); + // Compute nearest left grid point distance + x0 = (ppos_c - ind_c * size_line - min_v[dir]) / size_line; + // Compute line index + i_ind = (ind_c + line_nb_pts) % line_nb_pts; + i_ind_p = (ind_c + 1 + line_nb_pts) % line_nb_pts; + // Interpolate velocity + p_velo = (1.0f - x0) * gvelo[ind_line + i_ind * stride_along_dir + dir*stride_vec] + x0 * gvelo[ind_line + i_ind_p * stride_along_dir + dir*stride_vec]; + // Compute final position + ppos_c = min_v[dir] + p*size_line + dt*p_velo; + // Apply boundary conditions + if (ppos_c >= max_v[dir]) + line_c[dir] = ppos_c - max_v[dir] + min_v[dir]; + else if (ppos_c < min_v[dir]) + line_c[dir] = ppos_c + max_v[dir] - min_v[dir]; + else + line_c[dir] = ppos_c; + // Write result in buffer + for(i=0; i<DIM; i++) + ppos[ind_line + p*stride_along_dir + i*stride_vec] = line_c[i]; + } } @@ -86,88 +107,88 @@ __kernel void integrate(__global const float* gvelo, __kernel void remeshing(__global const float* part, __global const float* pscal, __global float* gscal, - __global float *tmp, - float t, - float dt, - int dir, - float4 min_v, - float4 max_v + float t, + float dt, + int dir, + size_t line_nb_pts, + float4 min_v, + float4 max_v, + uint4 nb, + float4 size ) { - __private size_t i; - __private size_t s_ind[DIM]; - __private size_t elementNumber[DIM]; - __private float elementSize[DIM]; - __private float s_part[DIM],s_pscal; - __private size_t s_ind_im2[DIM],s_ind_im[DIM],s_ind_i0[DIM],s_ind_ip[DIM],s_ind_ip2[DIM],s_ind_ip3[DIM]; - __private float x0, am2, am, a0, ap, ap2, ap3; - // Compute global index and global size - for(i=0; i<DIM; i++) + __private size_t i; // DIM loop index + __private size_t p; // Particle line loop index + __private size_t stride_vec; // Buffer stride between two components of the same particle + __private size_t stride_along_dir; // Buffer stride between two particle of the same line + __private size_t ind_line; // Line begin buffer index + __private size_t r_ind; // Remeshing nearest left point buffer index + __private int ind_c; // Remeshing grid point + __private float x0; // Remeshing distance from nearest left grid point + __private float size_line; // Space step along line + __private float ppos_c; // Particle position temp along line + __private float pscal_c; // Particle scalar along line + __private float weights[6]; // Remeshing weights + // Compute strides and index. Depend on array order (C order here) + stride_vec = 1; // Les composantes des vecteurs sont à la suite dans les buffers + if(DIM == 3){ + if (dir == 0) { - s_ind[i] = get_global_id(i); - elementNumber[i] = get_global_size(i); + stride_along_dir = DIM *nb[2] * nb[1]; // avancer de 1 dans la direction 0 + ind_line = DIM * (get_global_id(1) + get_global_id(0)*nb[2]); // indice du point 0,y,z,0 = DIM*(z+y*nb[2]) + size_line = size[0]; } - // Compute element size - for(i=0; i<DIM; i++) - elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]); - // Read buffers - for(i=0; i<DIM; i++) - s_part[i] = part[space_index_to_buffer_index(s_ind, i, DIM)]; - s_pscal = pscal[space_index_to_buffer_index(s_ind, 0, 1)]; - // Compute left nearest index and copy to other index - for(i=0; i<DIM; i++) + else if (dir == 1) { - s_ind_i0[i] = (int)((s_part[i] - min_v[i])/elementSize[i]+0.000001f); - s_ind_ip[i] = s_ind_i0[i]; - s_ind_ip2[i] = s_ind_i0[i]; - s_ind_ip3[i] = s_ind_i0[i]; - s_ind_im[i] = s_ind_i0[i]; - s_ind_im2[i] = s_ind_i0[i]; + stride_along_dir = DIM*nb[2]; // Avancer de 1 dans la direction 1 + ind_line = DIM * (get_global_id(1) + get_global_id(0) * nb[2] * nb[1]); // indice du point x,0,z,0 = DIM(z+x*nb[2]*nb[1]) + size_line =size[1]; } - s_ind_im2[dir] = (s_ind_im2[dir] - 2 + elementNumber[dir]) % elementNumber[dir]; - s_ind_im[dir] = (s_ind_im[dir] - 1 + elementNumber[dir]) % elementNumber[dir]; - s_ind_i0[dir] = (s_ind_i0[dir] + elementNumber[dir]) % elementNumber[dir]; - s_ind_ip[dir] = (s_ind_ip[dir] + 1 + elementNumber[dir]) % elementNumber[dir]; - s_ind_ip2[dir] = (s_ind_ip2[dir] + 2 + elementNumber[dir]) % elementNumber[dir]; - s_ind_ip3[dir] = (s_ind_ip3[dir] + 3 + elementNumber[dir]) % elementNumber[dir]; - // Compute distance to left nearest point - x0 = (s_part[dir] - s_ind_i0[dir] * elementSize[dir] - min_v[dir]) / elementSize[dir]; - // Compute remeshing weights - am2 = (-(x0) * (5.0f * (x0 + 2.0f) - 8.0f) * (x0 - 1.0f) * (x0 - 1.0f) * (x0 - 1.0f)) / 24.0f; - am = x0 * (x0 - 1.0f) * (25.0f * (x0 + 1.0f)*(x0 + 1.0f)*(x0 + 1.0f) - 114.0f * (x0 + 1.0f) * (x0 + 1.0f) + 153.0f * (x0 + 1.0f) - 48.0f) / 24.0f; - a0 = -(x0 - 1.0f) * (25.0f * x0*x0*x0*x0 - 38.0f * x0*x0*x0 - 3.0f * x0*x0 + 12.0f * x0 + 12.0f) / 12.0f; - ap = x0 * (25.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 38.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 3.0f * (1.0f - x0)*(1.0f - x0) + 12.0f * (1.0f - x0) + 12.0f) / 12.0f; - ap2 = (1.0f - x0) * (-x0) * (25.0f * (2.0f - x0)*(2.0f - x0)*(2.0f - x0) - 114.0f * (2.0f - x0)*(2.0f - x0) + 153.0f * (2.0f - x0) - 48.0f) / 24.0f; - ap3 = -(1.0f - x0) * ((5.0f * (3.0f - x0) - 8.0f) * (-x0)*(-x0)*(-x0)) / 24.0f; - // Write temp result - tmp[space_index_to_buffer_index(s_ind_im2, 0, 8)] += am2 * s_pscal; - tmp[space_index_to_buffer_index(s_ind_im, 1, 8)] += am * s_pscal; - tmp[space_index_to_buffer_index(s_ind_i0, 2, 8)] += a0 * s_pscal; - tmp[space_index_to_buffer_index(s_ind_ip, 3, 8)] += ap * s_pscal; - tmp[space_index_to_buffer_index(s_ind_ip2, 4, 8)] += ap2 * s_pscal; - tmp[space_index_to_buffer_index(s_ind_ip3, 5, 8)] += ap3 * s_pscal; -} + else + { + stride_along_dir = DIM; // Avancer de 1 dans la direction 2 + ind_line = DIM * (get_global_id(1) * nb[2] + get_global_id(0) * nb[2] * nb[1]); // indice du point x,y,0,0 = DIM*(y*nb[2]+x*nb[2]*nb[1]) + size_line = size[2]; + }} + else{ + if (dir == 0) + { + stride_along_dir = DIM * nb[1]; // Avancer de 1 dans la direction 0 + ind_line = DIM * (get_global_id(0)); // indice du point 0,y,0 + size_line = size[0]; + } + else + { + stride_along_dir = DIM; // Avancer de 1 dans la direction 1 + ind_line = DIM * (get_global_id(0) * nb[1]); // indice du point x,0,0 + size_line = size[1]; + } + } -__kernel void remeshing_reduce(__global float* gscal, - __global float *tmp, - float t, - float dt, - int dir, - float4 min, - float4 max) -{ - __private size_t i; - __private size_t s_ind[DIM]; - __private float sum = 0.0f; - // Compute global index and global size - for(i=0; i<DIM; i++) - s_ind[i] = get_global_id(i); - // Reduce grid point weights - for(i=0;i<6;i++) - sum += tmp[space_index_to_buffer_index(s_ind, i, 8)]; - // Write grid point new scalar - gscal[space_index_to_buffer_index(s_ind, 0, 1)] = sum; - // reset buffer - for(i=0;i<6;i++) - tmp[space_index_to_buffer_index(s_ind, i, 8)] = 0.0f; + // Initialise grid scalar + for(p=0;p<line_nb_pts;p++) + gscal[(ind_line + p*stride_along_dir)/DIM] = 0.0f; + for(p=0;p<line_nb_pts;p++) + { + // read particle position + ppos_c = part[ind_line + p*stride_along_dir + dir*stride_vec]; + // read particle scalar + pscal_c = pscal[(ind_line + p*stride_along_dir)/DIM]; + // Compute nearest left grid point + ind_c = convert_int_rtn((ppos_c - min_v[dir])/size_line); + // Compute nearest left grid point distance + x0 = (ppos_c - ind_c * size_line - min_v[dir]) / size_line; + // Compute remeshing weights + weights[0] = (-(x0) * (5.0f * (x0 + 2.0f) - 8.0f) * (x0 - 1.0f) * (x0 - 1.0f) * (x0 - 1.0f)) / 24.0f; + weights[1] = x0 * (x0 - 1.0f) * (25.0f * (x0 + 1.0f)*(x0 + 1.0f)*(x0 + 1.0f) - 114.0f * (x0 + 1.0f) * (x0 + 1.0f) + 153.0f * (x0 + 1.0f) - 48.0f) / 24.0f; + weights[2] = -(x0 - 1.0f) * (25.0f * x0*x0*x0*x0 - 38.0f * x0*x0*x0 - 3.0f * x0*x0 + 12.0f * x0 + 12.0f) / 12.0f; + weights[3] = x0 * (25.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 38.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 3.0f * (1.0f - x0)*(1.0f - x0) + 12.0f * (1.0f - x0) + 12.0f) / 12.0f; + weights[4] = (1.0f - x0) * (-x0) * (25.0f * (2.0f - x0)*(2.0f - x0)*(2.0f - x0) - 114.0f * (2.0f - x0)*(2.0f - x0) + 153.0f * (2.0f - x0) - 48.0f) / 24.0f; + weights[5] = -(1.0f - x0) * ((5.0f * (3.0f - x0) - 8.0f) * (-x0)*(-x0)*(-x0)) / 24.0f; + // Apply remeshing weights + for(i=0;i<6;i++){ + r_ind = (ind_c-2+i+line_nb_pts)%line_nb_pts; + gscal[(ind_line + r_ind*stride_along_dir)/DIM] += weights[i] * pscal_c; + } + } } -- GitLab