From ebfd261b648d0ff58b203a7c61e32580ddc02a82 Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr> Date: Tue, 6 Mar 2012 15:51:40 +0000 Subject: [PATCH] New GPU version using Buffer instead of Image. --- parmepy/examples/mainJM.py | 3 + parmepy/gpu_src.cl | 284 ++++++++++-------- parmepy/new_ParMePy/Operator/AdvectionDOp.py | 22 +- parmepy/new_ParMePy/Operator/RemeshingDOp.py | 29 +- .../Problem/DiscreteTransportProblem.py | 7 + .../new_ParMePy/Utils/GPUParticularSolver.py | 114 ++++--- parmepy/new_ParMePy/Utils/Printer.py | 8 +- .../Variable/AnalyticalVariable.py | 6 +- .../new_ParMePy/Variable/DiscreteVariable.py | 26 +- 9 files changed, 266 insertions(+), 233 deletions(-) diff --git a/parmepy/examples/mainJM.py b/parmepy/examples/mainJM.py index fc4d915fe..50c2b780d 100644 --- a/parmepy/examples/mainJM.py +++ b/parmepy/examples/mainJM.py @@ -53,6 +53,9 @@ def run(): InterpolationMethod = Linear(grid=box.discreteDomain, gvalues=velo.discreteVariable) RemeshingMethod = M6Prime(grid=box.discreteDomain, gvalues=scal.discreteVariable) ODESolver = RK2(InterpolationMethod, box.discreteDomain.applyConditions) + ## cpu + #solver = ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod) + ## gpu solver = GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod) cTspPb.setSolver(solver) #cTspPb.setPrinter(Printer(frequency=outputModulo, outputPrefix=outputFilePrefix, problem=cTspPb)) diff --git a/parmepy/gpu_src.cl b/parmepy/gpu_src.cl index 0f40a506b..a034bdc32 100644 --- a/parmepy/gpu_src.cl +++ b/parmepy/gpu_src.cl @@ -1,157 +1,177 @@ - // Cast image coordinate to buffer index. -unsigned int coord_to_index(int4 c); -// Cast array to image coordinate (images are stored in fortran order) -// TODO: Utiliser des tableaux en fortran partout. -int4 img_coord(int *c); -// Elementwise float array copy -void copy_f(float *src, float *dest); -// Elementwise int array copy -void copy_i(int *src, int *dest); - -void copy_f(float *src, float *dest) -{ - for(size_t i=0;i<4;i++) - dest[i] = src[i]; -} -void copy_i(int *src, int *dest) -{ - for(size_t i=0;i<4;i++) - dest[i] = src[i]; -} -int4 img_coord(int *c) +size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec); +#if ORDER == 'F' +size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec) { - return (int4)(c[2], c[1], c[0], 0); + //F order +#if DIM == 2 + return ind[0] + ind[1] * get_global_size(0) + ind[2] * get_global_size(0) * get_global_size(1) + d * get_global_size(0) * get_global_size(1) * get_global_size(2); +#else + return ind[0] + ind[1] * get_global_size(0) + d * get_global_size(0) * get_global_size(1); +#endif } -unsigned int coord_to_index(int4 c) +#else +size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec) { - return c.x + get_global_size(0) * c.y + get_global_size(0) * get_global_size(1) * c.z; + //C order +#if DIM == 3 + return dim_vec * (ind[2] + ind[1] * get_global_size(2) + ind[0] * get_global_size(2) * get_global_size(1)) + d; +#else + return dim_vec * (ind[1] + ind[0] * get_global_size(1)) + d; +#endif } +#endif + // Integration Kernel (RK2 method) -__kernel void integrate(__read_only image3d_t y, - __read_only image3d_t fy, - __write_only image3d_t pos, - float t, float dt, int dir, - float4 min_v, float4 max_v) +__kernel void integrate(__global const float* y, + __global const float* fy, + __global const float* s, + __global float* pos, + __global float* ps, + float t, + float dt, + size_t dir, + float4 min_v, + float4 max_v + ) { - const sampler_t isp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_REPEAT | CLK_FILTER_NEAREST ; - float4 elementNumber_v = (float4)(get_global_size(0), get_global_size(1), get_global_size(2), 1.0f); - float4 elementSize_v = (max_v - min_v) / elementNumber_v; - int ind[4], iind[4], iindp[4]; - float res[4], y_px[4], fy_px[4], min[4], max[4], elementNumber[4], elementSize[4]; - float x0; - // Storing vectors data in simple arrays. - vstore4((int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0), 0, ind); - vstore4(read_imagef(y, isp, img_coord(ind)), 0, y_px); - vstore4(read_imagef(fy, isp, img_coord(ind)), 0, fy_px); - vstore4(min_v, 0, min); - vstore4(max_v, 0, max); - vstore4(elementNumber_v, 0, elementNumber); - vstore4(elementSize_v, 0, elementSize); - // Copying position - copy_f(y_px, res); + __private size_t i; + __private size_t s_ind[DIM]; + __private size_t elementNumber[DIM]; + __private float elementSize[DIM]; + __private float s_y[DIM]; + __private float s_fy[DIM]; + __private float s_pos[DIM]; + __private size_t s_ind_i[DIM],s_ind_ip[DIM]; + __private float x0; + // Compute global index and global size + for(i=0; i<DIM; i++) + { + s_ind[i] = get_global_id(i); + elementNumber[i] = get_global_size(i); + } + // Compute element size + for(i=0; i<DIM; i++) + elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]); + // Read buffers + for(i=0; i<DIM; i++) + { + s_y[i] = y[space_index_to_buffer_index(s_ind, i, DIM)]; + s_fy[i] = fy[space_index_to_buffer_index(s_ind, i, DIM)]; + } + // Copy position + for(i=0; i<DIM; i++) + s_pos[i] = s_y[i]; // RK2 first advection - res[dir] = y_px[dir] + fy_px[dir] * dt * 0.5f; - // Applying boundary conditions on new position - if (res[dir] >= max[dir]) - res[dir] = res[dir] - max[dir] + min[dir]; - if (res[dir] < min[dir]) - res[dir] = res[dir] + max[dir] - min[dir]; + s_pos[dir] = s_y[dir] + s_fy[dir] * dt * 0.5f; // Velocity interpolation - vstore4(convert_int4_rtn((vload4(0, res) - min_v) / elementSize_v), 0, iind); - copy_i(iind, iindp); - iindp[dir] = (iindp[dir] + 1) % (int) (elementNumber[dir]); - x0 = (res[dir] - iind[dir] * elementSize[dir] - min[dir]) / elementSize[dir]; - vstore4((1.0f - x0) * read_imagef(fy, isp, img_coord(iind)) + x0 * read_imagef(fy, isp, img_coord(iindp)),0 ,fy_px); + for(i=0; i<DIM; i++) + s_ind_i[i] = (int)((s_pos[i] - min_v[i])/elementSize[i]+0.000001f) ; + s_ind_i[dir] = (s_ind_i[dir] + elementNumber[dir]) % elementNumber[dir]; + for(i=0; i<DIM; i++) + s_ind_ip[i] = s_ind_i[i]; + s_ind_ip[dir] = (s_ind_i[dir] + 1) % elementNumber[dir]; + x0 = (s_pos[dir] - s_ind_i[dir] * elementSize[dir] - min_v[dir]) / elementSize[dir]; + s_fy[dir] = (1.0f - x0) * fy[space_index_to_buffer_index(s_ind_i, dir, DIM)] + x0 * fy[space_index_to_buffer_index(s_ind_ip, dir, DIM)]; // RK2 second advection - res[dir] = y_px[dir] + fy_px[dir] * dt; + s_pos[dir] = s_y[dir] + s_fy[dir] * dt; // Applying boundary conditions on new position - if (res[dir] >= max[dir]) - res[dir] = res[dir] - max[dir] + min[dir]; - if (res[dir] < min[dir]) - res[dir] = res[dir] + max[dir] - min[dir]; - // Writing final position in image - write_imagef(pos, img_coord(ind), vload4(0, res)); + if (s_pos[dir] >= max_v[dir]) + s_pos[dir] = s_pos[dir] - max_v[dir] + min_v[dir]; + if (s_pos[dir] < min_v[dir]) + s_pos[dir] = s_pos[dir] + max_v[dir] - min_v[dir]; + // Writing result + for(i=0; i<DIM; i++) + pos[space_index_to_buffer_index(s_ind, i, DIM)] = s_pos[i]; + ps[space_index_to_buffer_index(s_ind, 0, 1)] = s[space_index_to_buffer_index(s_ind, 0, 1)]; } - // Remeshing kernel (M'6 formula) -__kernel void remeshing(__read_only image3d_t part, - __read_only image3d_t pscal, - __write_only image3d_t gscal, - __global float8 *tmp, - float t, float dt, int dir, - float4 min_v, float4 max_v) +__kernel void remeshing(__global const float* part, + __global const float* pscal, + __global float* gscal, + __global float *tmp, + float t, + float dt, + int dir, + float4 min_v, + float4 max_v + ) { - const sampler_t isp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_REPEAT | CLK_FILTER_NEAREST ; - //unsigned int ddir = convert_uint(dir); - int ind[4], im2[4], im[4], i0[4], ip[4], ip2[4], ip3[4], elementNumber[4]; - float part_px[4], pscal_px[4], elementSize[4], min[4], max[4]; - //int4 ind = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - //int4 i0, im, im2, ip, ip2, ip3; - int4 elementNumber_v = (int4)(get_global_size(0), get_global_size(1), get_global_size(2), 1.0f); - float4 elementSize_v = (max_v - min_v) / convert_float4(elementNumber_v); - //float4 part_px = read_imagef(part, isp, ind); - //float4 pscal_px = read_imagef(pscal, isp, ind); - //int4 indp; - float x0, am2, am, a0, ap, ap2, ap3; - vstore4((int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0), 0, ind); - vstore4(read_imagef(part, isp, img_coord(ind)), 0, part_px); - vstore4(read_imagef(pscal, isp, img_coord(ind)), 0, pscal_px); - vstore4(convert_int4_rtn((vload4(0, part_px) - min_v) / elementSize_v), 0, i0); - vstore4(min_v, 0, min); - vstore4(max_v, 0, max); - vstore4(elementNumber_v, 0, elementNumber); - vstore4(elementSize_v, 0, elementSize); - i0[dir] = i0[dir] % elementNumber[dir]; - copy_i(i0, im2); - copy_i(i0, im); - copy_i(i0, ip); - copy_i(i0, ip2); - copy_i(i0, ip3); - im2[dir] = (i0[dir] - 2 + elementNumber[dir]) % elementNumber[dir]; - im[dir] = (i0[dir] - 1 + elementNumber[dir]) % elementNumber[dir]; - ip[dir] = (i0[dir] + 1) % elementNumber[dir]; - ip2[dir] = (i0[dir] + 2) % elementNumber[dir]; - ip3[dir] = (i0[dir] + 3) % elementNumber[dir]; - x0 = (part_px[dir] - i0[dir] * elementSize[dir] - min[dir]) / elementSize[dir]; + __private size_t i; + __private size_t s_ind[DIM]; + __private size_t elementNumber[DIM]; + __private float elementSize[DIM]; + __private float s_part[DIM],s_pscal; + __private size_t s_ind_im2[DIM],s_ind_im[DIM],s_ind_i0[DIM],s_ind_ip[DIM],s_ind_ip2[DIM],s_ind_ip3[DIM]; + __private float x0, am2, am, a0, ap, ap2, ap3; + // Compute global index and global size + for(i=0; i<DIM; i++) + { + s_ind[i] = get_global_id(i); + elementNumber[i] = get_global_size(i); + } + // Compute element size + for(i=0; i<DIM; i++) + elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]); + // Read buffers + for(i=0; i<DIM; i++) + s_part[i] = part[space_index_to_buffer_index(s_ind, i, DIM)]; + s_pscal = pscal[space_index_to_buffer_index(s_ind, 0, 1)]; + // Compute left nearest index and copy to other index + for(i=0; i<DIM; i++) + { + s_ind_i0[i] = (int)((s_part[i] - min_v[i])/elementSize[i]+0.000001f); + s_ind_ip[i] = s_ind_i0[i]; + s_ind_ip2[i] = s_ind_i0[i]; + s_ind_ip3[i] = s_ind_i0[i]; + s_ind_im[i] = s_ind_i0[i]; + s_ind_im2[i] = s_ind_i0[i]; + } + s_ind_im2[dir] = (s_ind_im2[dir] - 2 + elementNumber[dir]) % elementNumber[dir]; + s_ind_im[dir] = (s_ind_im[dir] - 1 + elementNumber[dir]) % elementNumber[dir]; + s_ind_i0[dir] = (s_ind_i0[dir] + elementNumber[dir]) % elementNumber[dir]; + s_ind_ip[dir] = (s_ind_ip[dir] + 1 + elementNumber[dir]) % elementNumber[dir]; + s_ind_ip2[dir] = (s_ind_ip2[dir] + 2 + elementNumber[dir]) % elementNumber[dir]; + s_ind_ip3[dir] = (s_ind_ip3[dir] + 3 + elementNumber[dir]) % elementNumber[dir]; + // Compute distance to left nearest point + x0 = (s_part[dir] - s_ind_i0[dir] * elementSize[dir] - min_v[dir]) / elementSize[dir]; + // Compute remeshing weights am2 = (-(x0) * (5.0f * (x0 + 2.0f) - 8.0f) * (x0 - 1.0f) * (x0 - 1.0f) * (x0 - 1.0f)) / 24.0f; am = x0 * (x0 - 1.0f) * (25.0f * (x0 + 1.0f)*(x0 + 1.0f)*(x0 + 1.0f) - 114.0f * (x0 + 1.0f) * (x0 + 1.0f) + 153.0f * (x0 + 1.0f) - 48.0f) / 24.0f; a0 = -(x0 - 1.0f) * (25.0f * x0*x0*x0*x0 - 38.0f * x0*x0*x0 - 3.0f * x0*x0 + 12.0f * x0 + 12.0f) / 12.0f; ap = x0 * (25.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 38.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 3.0f * (1.0f - x0)*(1.0f - x0) + 12.0f * (1.0f - x0) + 12.0f) / 12.0f; ap2 = (1.0f - x0) * (-x0) * (25.0f * (2.0f - x0)*(2.0f - x0)*(2.0f - x0) - 114.0f * (2.0f - x0)*(2.0f - x0) + 153.0f * (2.0f - x0) - 48.0f) / 24.0f; ap3 = -(1.0f - x0) * ((5.0f * (3.0f - x0) - 8.0f) * (-x0)*(-x0)*(-x0)) / 24.0f; - // versin initiale - tmp[coord_to_index(img_coord(im2))].s0 = am2 * pscal_px[0]; - tmp[coord_to_index(img_coord(im))].s1 = am * pscal_px[0]; - tmp[coord_to_index(img_coord(i0))].s2 = a0 * pscal_px[0]; - tmp[coord_to_index(img_coord(ip))].s3 = ap * pscal_px[0]; - tmp[coord_to_index(img_coord(ip2))].s4 = ap2 * pscal_px[0]; - tmp[coord_to_index(img_coord(ip3))].s5 = ap3 * pscal_px[0]; - tmp[coord_to_index(img_coord(ip3))].s6 = 0.0f; - tmp[coord_to_index(img_coord(ip3))].s7 = 0.0f; - //tmp[coord_to_index(img_coord(im2))].s0 = tmp[coord_to_index(img_coord(im2))].s0 + am2 * pscal_px[0]; - //tmp[coord_to_index(img_coord(im))].s0 = tmp[coord_to_index(img_coord(im))].s0 + am * pscal_px[0]; - //ddir=(unsigned int)dir + 0 ; - //tmp[coord_to_index(img_coord(ind))].s0 = ind[dir]; - //tmp[coord_to_index(img_coord(ip))].s0 = tmp[coord_to_index(img_coord(ip))].s0 + ap * pscal_px[0]; - //tmp[coord_to_index(img_coord(ip2))].s0 = tmp[coord_to_index(img_coord(ip2))].s0 + ap2 * pscal_px[0]; - //tmp[coord_to_index(img_coord(ip3))].s0 = tmp[coord_to_index(img_coord(ip3))].s0 + ap3 * pscal_px[0]; - write_imagef(gscal, img_coord(i0), (float4)(a0 * pscal_px[0],0.0f,0.0f,0.0f)); + // Write temp result + tmp[space_index_to_buffer_index(s_ind_im2, 0, 8)] += am2 * s_pscal; + tmp[space_index_to_buffer_index(s_ind_im, 1, 8)] += am * s_pscal; + tmp[space_index_to_buffer_index(s_ind_i0, 2, 8)] += a0 * s_pscal; + tmp[space_index_to_buffer_index(s_ind_ip, 3, 8)] += ap * s_pscal; + tmp[space_index_to_buffer_index(s_ind_ip2, 4, 8)] += ap2 * s_pscal; + tmp[space_index_to_buffer_index(s_ind_ip3, 5, 8)] += ap3 * s_pscal; } -__kernel void remeshing_reduce(__write_only image3d_t gscal, - __global float8 *tmp, - float t, float dt, int dir, - float4 min, float4 max) +__kernel void remeshing_reduce(__global float* gscal, + __global float *tmp, + float t, + float dt, + int dir, + float4 min, + float4 max) { - int ind[4]; - vstore4((int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0), 0, ind); - float8 tscal = tmp[coord_to_index(img_coord(ind))]; - float scal;// = tscal.s0;//tscal.s0 + tscal.s1 + tscal.s2 + tscal.s3 + tscal.s4 + tscal.s5; - //scal = 1.0f / MAXFLOAT; - scal = scal + dot(tscal.lo, (float4)(1.0f, 1.0f, 1.0f, 1.0f)); - scal = scal + dot(tscal.hi, (float4)(1.0f, 1.0f, 1.0f, 1.0f)); - tmp[coord_to_index(img_coord(ind))] = (float8)(0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f); - write_imagef(gscal, img_coord(ind), (float4)(scal,0.0f,0.0f,0.0f)); + __private size_t i; + __private size_t s_ind[DIM]; + __private float sum = 0.0f; + // Compute global index and global size + for(i=0; i<DIM; i++) + s_ind[i] = get_global_id(i); + // Reduce grid point weights + for(i=0;i<6;i++) + sum += tmp[space_index_to_buffer_index(s_ind, i, 8)]; + // Write grid point new scalar + gscal[space_index_to_buffer_index(s_ind, 0, 1)] = sum; + // reset buffer + for(i=0;i<6;i++) + tmp[space_index_to_buffer_index(s_ind, i, 8)] = 0.0f; } diff --git a/parmepy/new_ParMePy/Operator/AdvectionDOp.py b/parmepy/new_ParMePy/Operator/AdvectionDOp.py index 38eab3a0f..de7c78031 100644 --- a/parmepy/new_ParMePy/Operator/AdvectionDOp.py +++ b/parmepy/new_ParMePy/Operator/AdvectionDOp.py @@ -6,6 +6,7 @@ Operator representation from DiscreteOperator import * import numpy as np import pyopencl as cl +import time providedClass = "AdvectionDOp" @@ -43,6 +44,8 @@ class AdvectionDOp(DiscreteOperator): self.resultPosition = respos ## Scalar value on new positions self.resultScalar = resscal + self.compute_time = 0. + self.queued_time = 0. def applyOperator(self, t, dt, splittingDirection): """ @@ -56,26 +59,25 @@ class AdvectionDOp(DiscreteOperator): # self.resultPosition.values[i] = self.velocity.domain.applyConditions(self.numMethod.integrate(self.velocity.domain.points[i], self.velocity.values[i], t, dt, splittingDirection)) #self.resultScalar.values[i] = self.scalar.values[i] if self.gpu_kernel is None: + c_time = time.time() ## ------------------- NumPy Optimisation self.resultPosition.values = self.numMethod.integrate(self.velocity.domain.points, self.velocity.values, t, dt, splittingDirection) self.resultScalar.values[:] = self.scalar.values[:] ## ------------------- + self.compute_time += (time.time() - c_time) else: - self.gpu_queue.finish() - self.gpu_kernel(self.gpu_queue, self.gpu_shape, None, + evt = self.gpu_kernel(self.gpu_queue, self.gpu_shape, None, self.velocity.domain.gpu_mem_object, self.velocity.gpu_mem_object, + self.scalar.gpu_mem_object, self.resultPosition.gpu_mem_object, - np.float32(t), np.float32(dt), np.int32(splittingDirection), + self.resultScalar.gpu_mem_object, + np.float32(t), np.float32(dt), np.uint(splittingDirection), np.concatenate((self.velocity.domain.min, [1.])).astype(np.float32), np.concatenate((self.velocity.domain.max, [1.])).astype(np.float32)) - evt = cl.enqueue_copy(self.gpu_queue, self.resultScalar.gpu_mem_object, self.scalar.gpu_mem_object, - src_origin=tuple([0 for x in self.gpu_shape]), - dest_origin=tuple([0 for x in self.gpu_shape]), - region=self.gpu_shape) - #self.scalar.values.fill(0.) - #cl.enqueue_copy(self.gpu_queue, self.scalar.gpu_mem_object, self.scalar.values, - #origin=tuple([0 for x in self.gpu_shape]), region=self.gpu_shape, wait_for=[evt]) + self.gpu_queue.finish() + self.compute_time += (evt.profile.end - evt.profile.start) / (10 ** 9) + self.queued_time += (evt.profile.start - evt.profile.queued) def __str__(self): """ToString method""" diff --git a/parmepy/new_ParMePy/Operator/RemeshingDOp.py b/parmepy/new_ParMePy/Operator/RemeshingDOp.py index 894352beb..a26ec09b0 100644 --- a/parmepy/new_ParMePy/Operator/RemeshingDOp.py +++ b/parmepy/new_ParMePy/Operator/RemeshingDOp.py @@ -7,6 +7,7 @@ from DiscreteOperator import * #from ..Utils.Lambda2 import * import numpy as np import pyopencl as cl +import time providedClass = "RemeshingDOp" @@ -43,6 +44,8 @@ class RemeshingDOp(DiscreteOperator): self.addVariable([self.ppos, self.pscal, self.pbloc, self.ptag, self.resultScalar]) self.numMethod = method self.gpu_kernel = None + self.compute_time = 0. + self.queued_time = 0. def setResultVariable(self, resscal): """ @@ -83,36 +86,30 @@ class RemeshingDOp(DiscreteOperator): # else: #for i in self.resultScalar.domain.explore(): # self.numMethod.remesh(self.ppos.values[i], self.pscal.values[i], splittingDirection) + c_time = time.time() if self.gpu_kernel is None: ## ------------------- NumPy Optimisation self.resultScalar.values[:] = 0. self.numMethod.remesh(self.ppos.values, self.pscal.values, splittingDirection) ## ------------------- else: - cl.enqueue_write_buffer(self.gpu_queue, self.gpu_buffer, self.gpu_buffer_values) - self.gpu_queue.finish() - #print "avant" - #print self.gpu_buffer_values[10,10,:8] - #print self.gpu_buffer_values[0,15,6] - #print "fin -avant" - self.gpu_kernel[0](self.gpu_queue, self.gpu_shape, None, + evt = self.gpu_kernel[0](self.gpu_queue, self.gpu_shape, None, self.ppos.gpu_mem_object, self.pscal.gpu_mem_object, self.resultScalar.gpu_mem_object, self.gpu_buffer, np.float32(t), np.float32(dt), np.int32(splittingDirection), np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32), np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32)) - cl.enqueue_copy(self.gpu_queue, self.gpu_buffer_values, self.gpu_buffer) - #print "Apres" - #print self.gpu_buffer_values[0,0,0,0] - #print self.gpu_buffer_values[2,0,0,0] - #print self.gpu_buffer_values[10,10,10,0] - #print self.gpu_buffer_values[10,10,:8] - #print self.gpu_buffer_values[0,15,6] - #print "fin apres" - self.gpu_kernel[1](self.gpu_queue, self.gpu_shape, None, + self.gpu_queue.finish() + self.compute_time += (evt.profile.end - evt.profile.start) / (10 ** 9) + self.queued_time += (evt.profile.start - evt.profile.queued) + evt = self.gpu_kernel[1](self.gpu_queue, self.gpu_shape, None, self.resultScalar.gpu_mem_object, self.gpu_buffer, np.float32(t), np.float32(dt), np.int32(splittingDirection), np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32), np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32)) + self.gpu_queue.finish() + self.compute_time += (evt.profile.end - evt.profile.start) / (10 ** 9) + self.queued_time += (evt.profile.start - evt.profile.queued) + self.compute_time += (time.time() - c_time) def __str__(self): """ToString method""" diff --git a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py index 1a37efda7..87de7af55 100644 --- a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py +++ b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py @@ -58,6 +58,13 @@ class DiscreteTransportProblem(DiscreteProblem): while self.t < T: [[op.applyOperator(self.t, dt * i[1], i[0]) for op in self.operators] for i in self.splittingStep] self.t += dt + c_time = 0. + q_time = 0. + for op in self.operators: + c_time += op.compute_time + q_time += op.queued_time + print "Solving time : ", c_time, "s" + print "Queued time : ", q_time, "ns" def __str__(self): """ToString method""" diff --git a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py index fa06f3440..7de8c0ac5 100644 --- a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py +++ b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py @@ -69,60 +69,49 @@ class GPUParticularSolver(ParticularSolver): self.parts = ParticleField(self.grid) self.ppos = DiscreteVariable(domain=self.parts, dimension=self.gvelo.dimension) self.parts.setPositions(self.ppos) - #self.pvelo = DiscreteVariable(domain=self.parts, dimension=self.gvelo.dimension) self.pscal = DiscreteVariable(domain=self.parts, dimension=self.gscal.dimension, initialValue=lambda x: 0., scalar=True) # Initialization GPU side. - # Images2D/Images3D/Buffers creation. + # Buffers creation. dim = self.grid.dimension - shape_dim = list(self.grid.elementNumber) - img_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT) - shape_img = tuple(shape_dim + [4]) self.gpu_shape = tuple(self.grid.elementNumber) - - padded_points = np.zeros(shape_img, dtype=np.float32) - padded_points[..., 0:dim] = self.grid.points[:] - self.grid.point = padded_points - self.grid.gpu_mem_object = cl.Image(self.ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format, - shape=self.gpu_shape, hostbuf=self.grid.point) - - padded_velocity = np.zeros(shape_img, dtype=np.float32) - padded_velocity[..., 0:dim] = self.gvelo.values[:] - self.gvelo.values = padded_velocity - self.gvelo.gpu_mem_object = cl.Image(self.ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format, - shape=self.gpu_shape, hostbuf=self.gvelo.values) - - scalar_nbytes = self.gscal.values.nbytes - padded_scalar = np.zeros(shape_img, dtype=np.float32) - padded_scalar[..., 0] = self.gscal.values[:] - self.gscal.values = padded_scalar - self.gscal.gpu_mem_object = cl.Image(self.ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format, - shape=self.gpu_shape, hostbuf=self.gscal.values) - - padded_ppos = np.zeros(shape_img, dtype=np.float32) - padded_ppos[..., 0:dim] = self.ppos.values[:] - self.ppos.values = padded_ppos - self.ppos.gpu_mem_object = cl.Image(self.ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format, - shape=self.gpu_shape, hostbuf=self.ppos.values) - - # padded_pvelo = np.zeros(shape_img, dtype=np.float32) - # padded_pvelo[..., 0:dim] = self.pvelo.values[:] - # self.pvelo.values = padded_pvelo - # self.pvelo.gpu_mem_object = cl.Image(self.ctx, - # cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format, - # shape=self.gpu_shape, hostbuf=self.pvelo.values) - - padded_pscal = np.zeros(shape_img, dtype=np.float32) - padded_pscal[..., 0] = self.pscal.values[:] - self.pscal.values = padded_pscal - self.pscal.gpu_mem_object = cl.Image(self.ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format, - shape=self.gpu_shape, hostbuf=self.pscal.values) - - # Remeshing : particles positions, particles scalar, particles tag, particles bloc -> grid scalar + self.gpu_used_mem = 0 + + ## Allocation/copy grid points + self.grid.gpu_mem_object = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, + size=self.grid.points.nbytes, hostbuf=self.grid.points) + self.gpu_used_mem += self.grid.gpu_mem_object.size + print "Allocation grid points on gpu, size:", self.grid.gpu_mem_object.size, 'B' + + ## Allocation/copy grid velocity + self.gvelo.gpu_mem_object = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, + size=self.gvelo.values.nbytes, hostbuf=self.gvelo.values) + self.gpu_used_mem += self.gvelo.gpu_mem_object.size + print "Allocation grid velocity on gpu, size:", self.gvelo.gpu_mem_object.size, 'B' + + ## Allocation/copy grid scalar + self.gscal.gpu_mem_object = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, + size=self.gscal.values.nbytes, hostbuf=self.gscal.values) + self.gpu_used_mem += self.gscal.gpu_mem_object.size + print "Allocation grid scalar on gpu, size:", self.gscal.gpu_mem_object.size, 'B' + + ## Allocation/copy particles positions + self.ppos.gpu_mem_object = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, + size=self.ppos.values.nbytes, hostbuf=self.ppos.values) + self.gpu_used_mem += self.ppos.gpu_mem_object.size + print "Allocation particles positions on gpu, size:", self.ppos.gpu_mem_object.size, 'B' + + ## Allocation/copy particles scalar + self.pscal.gpu_mem_object = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, + size=self.pscal.values.nbytes, hostbuf=self.pscal.values) + self.gpu_used_mem += self.pscal.gpu_mem_object.size + print "Allocation particles scalar on gpu, size:", self.pscal.gpu_mem_object.size, 'B' + + ## Remeshing : particles positions, particles scalar, particles tag, particles bloc -> grid scalar remesh = RemeshingDOp(partPositions=self.ppos, partScalar=self.pscal, partBloc=self.pscal, @@ -130,24 +119,34 @@ class GPUParticularSolver(ParticularSolver): resscal=discreteProblem.advecOp.scalar, method=self.RemeshingMethod) discreteProblem.addOperator([remesh]) + discreteProblem.advecOp.setResultVariable(respos=self.ppos, resscal=self.pscal) discreteProblem.advecOp.setMethod(self.ODESolver) # Kernels creation/compilation f = open("gpu_src.cl", 'r') self.gpu_src = "".join(f.readlines()) - self.prg = cl.Program(self.ctx, self.gpu_src).build() + self.prg = cl.Program(self.ctx, self.gpu_src).build("-cl-single-precision-constant -cl-opt-disable -D ORDER='C' -D DIM=" + str(dim)) + print self.prg.get_build_info(self.device, cl.program_build_info.LOG) + print self.prg.get_build_info(self.device, cl.program_build_info.OPTIONS) + print self.prg.get_build_info(self.device, cl.program_build_info.STATUS) discreteProblem.advecOp.gpu_queue = self.queue discreteProblem.advecOp.gpu_shape = self.gpu_shape discreteProblem.advecOp.gpu_kernel = self.prg.integrate remesh.gpu_queue = self.queue remesh.gpu_shape = self.gpu_shape remesh.gpu_kernel = (self.prg.remeshing, self.prg.remeshing_reduce) - remesh.gpu_buffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=scalar_nbytes * 8) + remesh.gpu_buffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=self.gscal.values.nbytes * 8) + self.gpu_used_mem += remesh.gpu_buffer.size + print "Allocation remeshing buffer on gpu, size:", remesh.gpu_buffer.size, 'B' buffer_shape = list(self.gpu_shape) buffer_shape.append(8) remesh.gpu_buffer_values = np.zeros(tuple(buffer_shape), dtype=np.float32) + self.queue.finish() + p = self.gpu_used_mem * 100 / (self.device.global_mem_size * 1.) + print "Total memory allocated on gpu :", self.gpu_used_mem, "B (", p, " % of total available memory)" + # Splitting Step splitting = [] if self.splittingMethod == 'strang': @@ -161,12 +160,11 @@ class GPUParticularSolver(ParticularSolver): def collect_data(self): o = tuple([0 for x in self.gpu_shape]) self.queue.finish() - cl.enqueue_copy(self.queue, self.grid.point, self.grid.gpu_mem_object, origin=o, region=self.gpu_shape) - cl.enqueue_copy(self.queue, self.gvelo.values, self.gvelo.gpu_mem_object, origin=o, region=self.gpu_shape) - cl.enqueue_copy(self.queue, self.gscal.values, self.gscal.gpu_mem_object, origin=o, region=self.gpu_shape) - cl.enqueue_copy(self.queue, self.ppos.values, self.ppos.gpu_mem_object, origin=o, region=self.gpu_shape) - #cl.enqueue_copy(self.queue, self.pvelo.values, self.pvelo.gpu_mem_object, origin=o, region=self.gpu_shape) - cl.enqueue_copy(self.queue, self.pscal.values, self.pscal.gpu_mem_object, origin=o, region=self.gpu_shape) + cl.enqueue_copy(self.queue, self.grid.points, self.grid.gpu_mem_object) + cl.enqueue_copy(self.queue, self.gvelo.values, self.gvelo.gpu_mem_object) + cl.enqueue_copy(self.queue, self.gscal.values, self.gscal.gpu_mem_object) + cl.enqueue_copy(self.queue, self.ppos.values, self.ppos.gpu_mem_object) + cl.enqueue_copy(self.queue, self.pscal.values, self.pscal.gpu_mem_object) def terminate(self): self.queue.finish() diff --git a/parmepy/new_ParMePy/Utils/Printer.py b/parmepy/new_ParMePy/Utils/Printer.py index f401f9235..02707c020 100644 --- a/parmepy/new_ParMePy/Utils/Printer.py +++ b/parmepy/new_ParMePy/Utils/Printer.py @@ -34,12 +34,8 @@ class Printer(): self.grid = self.problem.discreteProblem.advecOp.velocity.domain self.gvelo = self.problem.discreteProblem.advecOp.velocity.values self.ppos = self.problem.discreteProblem.solver.ppos.values - if isinstance(self.problem.discreteProblem.solver, GPUParticularSolver): - self.gscal = self.problem.discreteProblem.advecOp.scalar.values[...,0] - self.pscal = self.problem.discreteProblem.solver.pscal.values[...,0] - else: - self.pscal = self.problem.discreteProblem.solver.pscal.values - self.gscal = self.problem.discreteProblem.advecOp.scalar.values + self.pscal = self.problem.discreteProblem.solver.pscal.values + self.gscal = self.problem.discreteProblem.advecOp.scalar.values #self.pvelo = self.problem.discreteProblem.solver.pvelo # self.pbloc = self.problem.discreteProblem.solver.pbloc diff --git a/parmepy/new_ParMePy/Variable/AnalyticalVariable.py b/parmepy/new_ParMePy/Variable/AnalyticalVariable.py index c2b82f071..4c1914dd3 100644 --- a/parmepy/new_ParMePy/Variable/AnalyticalVariable.py +++ b/parmepy/new_ParMePy/Variable/AnalyticalVariable.py @@ -31,14 +31,14 @@ class AnalyticalVariable(ContinuousVariable): ## Is scalar variable self.scalar = scalar - def discretize(self, spec=None): + def discretize(self, d_spec=None): """ AnalyticalVariable discretization method. Create an DiscreteVariable.DiscreteVariable from given specifications. - @param spec : discretization specifications, not used. + @param d_spec : discretization specifications, not used. """ - self.discreteVariable = DiscreteVariable(domain=self.domain.discreteDomain, dimension=self.dimension, initialValue=self.formula, scalar=self.scalar) + self.discreteVariable = DiscreteVariable(domain=self.domain.discreteDomain, dimension=self.dimension, initialValue=self.formula, scalar=self.scalar, spec=d_spec) def value(self, pos): """ diff --git a/parmepy/new_ParMePy/Variable/DiscreteVariable.py b/parmepy/new_ParMePy/Variable/DiscreteVariable.py index 51e649327..cc28c8fc6 100644 --- a/parmepy/new_ParMePy/Variable/DiscreteVariable.py +++ b/parmepy/new_ParMePy/Variable/DiscreteVariable.py @@ -14,7 +14,7 @@ class DiscreteVariable: Discrete variables abstract description """ - def __init__(self, domain=None, dimension=None, initialValue=lambda x: x, integer=False, scalar=False): + def __init__(self, domain=None, dimension=None, initialValue=lambda x: x, integer=False, scalar=False, spec=None): """ Constructor. Create a DiscreteVariable from a given ContinuousVariable.ContinuousVariable or a domain. @@ -52,14 +52,24 @@ class DiscreteVariable: ### ------------------ # New grid creation with numpy shape = list(self.domain.elementNumber) - if not scalar: - shape.append(self.dimension) - if integer: - self.values = np.zeros(shape, dtype=int) + if spec is None: + if not scalar: + shape.append(self.dimension) + if integer: + self.values = np.zeros(shape, dtype=int) + else: + self.values = np.zeros(shape, dtype=np.float32) + for ind in self.domain.explore(): + self.values[ind] = initialValue(self.domain[ind]) else: - self.values = np.zeros(shape) - for ind in self.domain.explore(): - self.values[ind] = initialValue(self.domain[ind]) + shape.append(4) + self.values = np.zeros(shape, dtype=np.float32) + if scalar: + for ind in self.domain.explore(): + self.values[ind][0] = initialValue(self.domain[ind]) + else: + for ind in self.domain.explore(): + self.values[ind][:self.dimension] = initialValue(self.domain[ind]) self.gpu_mem_object = None def __str__(self): -- GitLab