From ebfd261b648d0ff58b203a7c61e32580ddc02a82 Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr>
Date: Tue, 6 Mar 2012 15:51:40 +0000
Subject: [PATCH] New GPU version using Buffer instead of Image.

---
 parmepy/examples/mainJM.py                    |   3 +
 parmepy/gpu_src.cl                            | 284 ++++++++++--------
 parmepy/new_ParMePy/Operator/AdvectionDOp.py  |  22 +-
 parmepy/new_ParMePy/Operator/RemeshingDOp.py  |  29 +-
 .../Problem/DiscreteTransportProblem.py       |   7 +
 .../new_ParMePy/Utils/GPUParticularSolver.py  | 114 ++++---
 parmepy/new_ParMePy/Utils/Printer.py          |   8 +-
 .../Variable/AnalyticalVariable.py            |   6 +-
 .../new_ParMePy/Variable/DiscreteVariable.py  |  26 +-
 9 files changed, 266 insertions(+), 233 deletions(-)
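Note on the new indexing scheme: the kernels now address flat __global float* buffers through space_index_to_buffer_index(), compiled with -D ORDER and -D DIM. For the C-order case used by the solver, vector components are interleaved fastest, which matches the default layout of the NumPy arrays allocated in DiscreteVariable. A minimal host-side sketch (NumPy only; the 2D shape below is an illustrative assumption, not taken from the patch):

import numpy as np

# Illustrative sizes; the real values come from the grid discretization.
nx, ny, dim = 4, 3, 2
field = np.arange(nx * ny * dim, dtype=np.float32).reshape(nx, ny, dim)  # C order

def space_index_to_buffer_index(ind, d, dim_vec, global_size):
    # C-order, DIM == 2 branch of the OpenCL helper: components interleaved fastest.
    return dim_vec * (ind[1] + ind[0] * global_size[1]) + d

flat = field.ravel(order='C')
for i in range(nx):
    for j in range(ny):
        for d in range(dim):
            assert flat[space_index_to_buffer_index((i, j), d, dim, (nx, ny))] == field[i, j, d]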

diff --git a/parmepy/examples/mainJM.py b/parmepy/examples/mainJM.py
index fc4d915fe..50c2b780d 100644
--- a/parmepy/examples/mainJM.py
+++ b/parmepy/examples/mainJM.py
@@ -53,6 +53,9 @@ def run():
     InterpolationMethod = Linear(grid=box.discreteDomain, gvalues=velo.discreteVariable)
     RemeshingMethod = M6Prime(grid=box.discreteDomain, gvalues=scal.discreteVariable)
     ODESolver = RK2(InterpolationMethod, box.discreteDomain.applyConditions)
+    ## CPU solver
+    #solver = ParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod)
+    ## GPU solver
     solver = GPUParticularSolver(ODESolver, InterpolationMethod, RemeshingMethod)
     cTspPb.setSolver(solver)
     #cTspPb.setPrinter(Printer(frequency=outputModulo, outputPrefix=outputFilePrefix, problem=cTspPb))
diff --git a/parmepy/gpu_src.cl b/parmepy/gpu_src.cl
index 0f40a506b..a034bdc32 100644
--- a/parmepy/gpu_src.cl
+++ b/parmepy/gpu_src.cl
@@ -1,157 +1,177 @@
- // Cast image coordinate to buffer index.
-unsigned int coord_to_index(int4 c);
-// Cast array to image coordinate (images are stored in fortran order)
-// TODO: Use Fortran-ordered arrays everywhere.
-int4 img_coord(int *c);
-// Elementwise float array copy
-void copy_f(float *src, float *dest);
-// Elementwise int array copy
-void copy_i(int *src, int *dest);
-
-void copy_f(float *src, float *dest)
-{
-  for(size_t i=0;i<4;i++)
-    dest[i] = src[i];
-}
-void copy_i(int *src, int *dest)
-{
-  for(size_t i=0;i<4;i++)
-    dest[i] = src[i];
-}
-int4 img_coord(int *c)
+size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec);
+#if ORDER == 'F'
+size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec)
 {
-  return (int4)(c[2], c[1], c[0], 0);
+    //F order
+#if DIM == 3
+      return ind[0] + ind[1] * get_global_size(0) + ind[2] * get_global_size(0) * get_global_size(1) + d * get_global_size(0) * get_global_size(1) * get_global_size(2);
+#else
+      return ind[0] + ind[1] * get_global_size(0) + d * get_global_size(0) * get_global_size(1);
+#endif
 }
-unsigned int coord_to_index(int4 c)
+#else
+size_t space_index_to_buffer_index(size_t *ind, size_t d, size_t dim_vec)
 {
-  return c.x + get_global_size(0) * c.y + get_global_size(0) * get_global_size(1) * c.z;
+  //C order
+#if DIM == 3
+    return dim_vec * (ind[2] + ind[1] * get_global_size(2) + ind[0] * get_global_size(2) * get_global_size(1)) + d;
+#else
+    return dim_vec * (ind[1] + ind[0] * get_global_size(1)) + d;
+#endif
 }
+#endif
+
 
 // Integration Kernel (RK2 method)
-__kernel void integrate(__read_only image3d_t y,
-			__read_only image3d_t fy,
-			__write_only image3d_t pos,
-			float t, float dt, int dir,
-			float4 min_v, float4 max_v)
+__kernel void integrate(__global const float* y,
+			__global const float* fy,
+			__global const float* s,
+			__global float* pos,
+			__global float* ps,
+			 float t,
+			 float dt,
+			 size_t dir,
+			 float4 min_v,
+			 float4 max_v
+			)
 {
-  const sampler_t isp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_REPEAT | CLK_FILTER_NEAREST ;
-  float4 elementNumber_v = (float4)(get_global_size(0), get_global_size(1), get_global_size(2), 1.0f);
-  float4 elementSize_v = (max_v - min_v) / elementNumber_v;
-  int ind[4], iind[4], iindp[4];
-  float res[4], y_px[4], fy_px[4], min[4], max[4], elementNumber[4], elementSize[4];
-  float x0;
-  // Storing vectors data in simple arrays.
-  vstore4((int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0), 0, ind);
-  vstore4(read_imagef(y, isp, img_coord(ind)), 0, y_px);
-  vstore4(read_imagef(fy, isp, img_coord(ind)), 0, fy_px);
-  vstore4(min_v, 0, min);
-  vstore4(max_v, 0, max);
-  vstore4(elementNumber_v, 0, elementNumber);
-  vstore4(elementSize_v, 0, elementSize);
-  // Copying position
-  copy_f(y_px, res);
+  __private size_t i;
+  __private size_t s_ind[DIM];
+  __private size_t elementNumber[DIM];
+  __private float elementSize[DIM];
+  __private float s_y[DIM];
+  __private float s_fy[DIM];
+  __private float s_pos[DIM];
+  __private size_t s_ind_i[DIM],s_ind_ip[DIM];
+  __private float x0;
+  // Compute global index and global size
+  for(i=0; i<DIM; i++)
+    {
+      s_ind[i] = get_global_id(i);
+      elementNumber[i] = get_global_size(i);
+    }
+  // Compute element size
+  for(i=0; i<DIM; i++)
+    elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]);
+  // Read buffers
+  for(i=0; i<DIM; i++)
+    {
+      s_y[i] = y[space_index_to_buffer_index(s_ind, i, DIM)];
+      s_fy[i] = fy[space_index_to_buffer_index(s_ind, i, DIM)];
+    }
+  // Copy position
+  for(i=0; i<DIM; i++)
+    s_pos[i] = s_y[i];
   // RK2 first advection
-  res[dir] = y_px[dir] + fy_px[dir] * dt * 0.5f;
-  // Applying boundary conditions on new position
-  if (res[dir] >= max[dir])
-    res[dir] = res[dir] - max[dir] + min[dir];
-  if (res[dir] < min[dir])
-    res[dir] = res[dir] + max[dir] - min[dir];
+  s_pos[dir] = s_y[dir] + s_fy[dir] * dt * 0.5f;
   // Velocity interpolation
-  vstore4(convert_int4_rtn((vload4(0, res) - min_v) / elementSize_v), 0, iind);
-  copy_i(iind, iindp);
-  iindp[dir] = (iindp[dir] + 1) % (int) (elementNumber[dir]);
-  x0 = (res[dir] - iind[dir] * elementSize[dir] - min[dir]) / elementSize[dir];
-  vstore4((1.0f - x0) * read_imagef(fy, isp, img_coord(iind)) + x0 * read_imagef(fy, isp, img_coord(iindp)),0 ,fy_px);
+  for(i=0; i<DIM; i++)
+    s_ind_i[i] = (int)((s_pos[i] - min_v[i])/elementSize[i]+0.000001f);
+  s_ind_i[dir] = (s_ind_i[dir] + elementNumber[dir]) % elementNumber[dir];
+  for(i=0; i<DIM; i++)
+    s_ind_ip[i] = s_ind_i[i];
+  s_ind_ip[dir] = (s_ind_i[dir] + 1) % elementNumber[dir];
+  x0 = (s_pos[dir] - s_ind_i[dir] * elementSize[dir] - min_v[dir]) / elementSize[dir];
+  s_fy[dir] = (1.0f - x0) * fy[space_index_to_buffer_index(s_ind_i, dir, DIM)] + x0 * fy[space_index_to_buffer_index(s_ind_ip, dir, DIM)];
   // RK2 second advection
-  res[dir] = y_px[dir] + fy_px[dir] * dt;
+  s_pos[dir] = s_y[dir] + s_fy[dir] * dt;
   // Applying boundary conditions on new position
-  if (res[dir] >= max[dir])
-    res[dir] = res[dir] - max[dir] + min[dir];
-  if (res[dir] < min[dir])
-    res[dir] = res[dir] + max[dir] - min[dir];
-  // Writing final position in image
-  write_imagef(pos, img_coord(ind), vload4(0, res));
+  if (s_pos[dir] >= max_v[dir])
+    s_pos[dir] = s_pos[dir] - max_v[dir] + min_v[dir];
+  if (s_pos[dir] < min_v[dir])
+    s_pos[dir] = s_pos[dir] + max_v[dir] - min_v[dir];
+  // Writing result
+  for(i=0; i<DIM; i++)
+    pos[space_index_to_buffer_index(s_ind, i, DIM)] = s_pos[i];
+  ps[space_index_to_buffer_index(s_ind, 0, 1)] = s[space_index_to_buffer_index(s_ind, 0, 1)];
 }
 
-
 // Remeshing kernel (M'6 formula)
-__kernel void remeshing(__read_only image3d_t part,
-			__read_only image3d_t pscal,
-			__write_only image3d_t gscal,
-			__global float8 *tmp,
-			float t, float dt, int dir,
-			float4 min_v, float4 max_v)
+__kernel void remeshing(__global const float* part,
+			__global const float* pscal,
+			__global float* gscal,
+			__global float *tmp,
+			 float t,
+			 float dt,
+			 int dir,
+			 float4 min_v,
+			 float4 max_v
+			)
 {
-  const sampler_t isp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_REPEAT | CLK_FILTER_NEAREST ;
-  //unsigned int ddir = convert_uint(dir);
-  int ind[4], im2[4], im[4], i0[4], ip[4], ip2[4], ip3[4], elementNumber[4];
-  float part_px[4], pscal_px[4], elementSize[4], min[4], max[4];
-  //int4 ind = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
-  //int4 i0, im, im2, ip, ip2, ip3;
-  int4 elementNumber_v = (int4)(get_global_size(0), get_global_size(1), get_global_size(2), 1.0f);
-  float4 elementSize_v = (max_v - min_v) / convert_float4(elementNumber_v);
-  //float4 part_px = read_imagef(part, isp, ind);
-  //float4 pscal_px = read_imagef(pscal, isp, ind);
-  //int4 indp;
-  float x0, am2, am, a0, ap, ap2, ap3;
-  vstore4((int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0), 0, ind);
-  vstore4(read_imagef(part, isp, img_coord(ind)), 0, part_px);
-  vstore4(read_imagef(pscal, isp, img_coord(ind)), 0, pscal_px);
-  vstore4(convert_int4_rtn((vload4(0, part_px) - min_v) / elementSize_v), 0, i0);
-  vstore4(min_v, 0, min);
-  vstore4(max_v, 0, max);
-  vstore4(elementNumber_v, 0, elementNumber);
-  vstore4(elementSize_v, 0, elementSize);
-  i0[dir] = i0[dir] % elementNumber[dir];
-  copy_i(i0, im2);
-  copy_i(i0, im);
-  copy_i(i0, ip);
-  copy_i(i0, ip2);
-  copy_i(i0, ip3);
-  im2[dir] = (i0[dir] - 2 + elementNumber[dir]) % elementNumber[dir];
-  im[dir] = (i0[dir] - 1 + elementNumber[dir]) % elementNumber[dir];
-  ip[dir] = (i0[dir] + 1) % elementNumber[dir];
-  ip2[dir] = (i0[dir] + 2) % elementNumber[dir];
-  ip3[dir] = (i0[dir] + 3) % elementNumber[dir];
-  x0 = (part_px[dir] - i0[dir] * elementSize[dir] - min[dir]) / elementSize[dir];
+  __private size_t i;
+  __private size_t s_ind[DIM];
+  __private size_t elementNumber[DIM];
+  __private float elementSize[DIM];
+  __private float s_part[DIM],s_pscal;
+  __private size_t s_ind_im2[DIM],s_ind_im[DIM],s_ind_i0[DIM],s_ind_ip[DIM],s_ind_ip2[DIM],s_ind_ip3[DIM];
+  __private float x0, am2, am, a0, ap, ap2, ap3;
+  // Compute global index and global size
+  for(i=0; i<DIM; i++)
+    {
+      s_ind[i] = get_global_id(i);
+      elementNumber[i] = get_global_size(i);
+    }
+  // Compute element size
+  for(i=0; i<DIM; i++)
+    elementSize[i] = (max_v[i] - min_v[i]) / ((float)elementNumber[i]);
+  // Read buffers
+  for(i=0; i<DIM; i++)
+      s_part[i] = part[space_index_to_buffer_index(s_ind, i, DIM)];
+  s_pscal = pscal[space_index_to_buffer_index(s_ind, 0, 1)];
+  // Compute left-nearest index and copy it to the other stencil indices
+  for(i=0; i<DIM; i++)
+    {
+      s_ind_i0[i] = (int)((s_part[i] - min_v[i])/elementSize[i]+0.000001f);
+      s_ind_ip[i] = s_ind_i0[i];
+      s_ind_ip2[i] = s_ind_i0[i];
+      s_ind_ip3[i] = s_ind_i0[i];
+      s_ind_im[i] = s_ind_i0[i];
+      s_ind_im2[i] = s_ind_i0[i];
+    }
+  s_ind_im2[dir] = (s_ind_im2[dir] - 2 + elementNumber[dir]) % elementNumber[dir];
+  s_ind_im[dir] = (s_ind_im[dir] - 1 + elementNumber[dir]) % elementNumber[dir];
+  s_ind_i0[dir] = (s_ind_i0[dir] + elementNumber[dir]) % elementNumber[dir];
+  s_ind_ip[dir] = (s_ind_ip[dir] + 1 + elementNumber[dir]) % elementNumber[dir];
+  s_ind_ip2[dir] = (s_ind_ip2[dir] + 2 + elementNumber[dir]) % elementNumber[dir];
+  s_ind_ip3[dir] = (s_ind_ip3[dir] + 3 + elementNumber[dir]) % elementNumber[dir];
+  // Compute distance to left nearest point
+  x0 = (s_part[dir] - s_ind_i0[dir] * elementSize[dir] - min_v[dir]) / elementSize[dir];
+  // Compute remeshing weights
   am2 = (-(x0) * (5.0f * (x0 + 2.0f) - 8.0f) * (x0 - 1.0f) * (x0 - 1.0f) * (x0 - 1.0f)) / 24.0f;
   am = x0 * (x0 - 1.0f) * (25.0f * (x0 + 1.0f)*(x0 + 1.0f)*(x0 + 1.0f) - 114.0f * (x0 + 1.0f) * (x0 + 1.0f) + 153.0f * (x0 + 1.0f) - 48.0f) / 24.0f;
   a0 = -(x0 - 1.0f) * (25.0f * x0*x0*x0*x0 - 38.0f * x0*x0*x0 - 3.0f * x0*x0 + 12.0f * x0 + 12.0f) / 12.0f;
   ap = x0 * (25.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 38.0f * (1.0f - x0)*(1.0f - x0)*(1.0f - x0) - 3.0f * (1.0f - x0)*(1.0f - x0) + 12.0f * (1.0f - x0) + 12.0f) / 12.0f;
   ap2 = (1.0f - x0) * (-x0) * (25.0f * (2.0f - x0)*(2.0f - x0)*(2.0f - x0) - 114.0f * (2.0f - x0)*(2.0f - x0) + 153.0f * (2.0f - x0) - 48.0f) / 24.0f;
   ap3 = -(1.0f - x0) * ((5.0f * (3.0f - x0) - 8.0f) * (-x0)*(-x0)*(-x0)) / 24.0f;
-  // initial version
-  tmp[coord_to_index(img_coord(im2))].s0 = am2 * pscal_px[0];
-  tmp[coord_to_index(img_coord(im))].s1 = am * pscal_px[0];
-  tmp[coord_to_index(img_coord(i0))].s2 = a0 * pscal_px[0];
-  tmp[coord_to_index(img_coord(ip))].s3 = ap * pscal_px[0];
-  tmp[coord_to_index(img_coord(ip2))].s4 = ap2 * pscal_px[0];
-  tmp[coord_to_index(img_coord(ip3))].s5 = ap3 * pscal_px[0];
-  tmp[coord_to_index(img_coord(ip3))].s6 = 0.0f;
-  tmp[coord_to_index(img_coord(ip3))].s7 = 0.0f;
-  //tmp[coord_to_index(img_coord(im2))].s0 = tmp[coord_to_index(img_coord(im2))].s0 + am2 * pscal_px[0];
-  //tmp[coord_to_index(img_coord(im))].s0 = tmp[coord_to_index(img_coord(im))].s0 + am * pscal_px[0];
-  //ddir=(unsigned int)dir + 0 ;
-  //tmp[coord_to_index(img_coord(ind))].s0 = ind[dir];
-  //tmp[coord_to_index(img_coord(ip))].s0 = tmp[coord_to_index(img_coord(ip))].s0 + ap * pscal_px[0];
-  //tmp[coord_to_index(img_coord(ip2))].s0 = tmp[coord_to_index(img_coord(ip2))].s0 + ap2 * pscal_px[0];
-  //tmp[coord_to_index(img_coord(ip3))].s0 = tmp[coord_to_index(img_coord(ip3))].s0 + ap3 * pscal_px[0];
-  write_imagef(gscal, img_coord(i0), (float4)(a0 * pscal_px[0],0.0f,0.0f,0.0f));
+  // Accumulate weighted contributions into the temporary buffer
+  tmp[space_index_to_buffer_index(s_ind_im2, 0, 8)] += am2 * s_pscal;
+  tmp[space_index_to_buffer_index(s_ind_im, 1, 8)] += am * s_pscal;
+  tmp[space_index_to_buffer_index(s_ind_i0, 2, 8)] += a0 * s_pscal;
+  tmp[space_index_to_buffer_index(s_ind_ip, 3, 8)] += ap * s_pscal;
+  tmp[space_index_to_buffer_index(s_ind_ip2, 4, 8)] += ap2 * s_pscal;
+  tmp[space_index_to_buffer_index(s_ind_ip3, 5, 8)] += ap3 * s_pscal;
 }
 
-__kernel void remeshing_reduce(__write_only image3d_t gscal,
-			__global float8 *tmp,
-			float t, float dt, int dir,
-			float4 min, float4 max)
+__kernel void remeshing_reduce(__global float* gscal,
+			       __global float *tmp,
+			        float t,
+			        float dt,
+			        int dir,
+			        float4 min,
+			        float4 max)
 {
-  int ind[4];
-  vstore4((int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0), 0, ind);
-  float8 tscal = tmp[coord_to_index(img_coord(ind))];
-  float scal;// = tscal.s0;//tscal.s0 + tscal.s1 + tscal.s2 + tscal.s3 + tscal.s4 + tscal.s5;
-  //scal = 1.0f / MAXFLOAT;
-  scal = scal + dot(tscal.lo, (float4)(1.0f, 1.0f, 1.0f, 1.0f));
-  scal = scal + dot(tscal.hi, (float4)(1.0f, 1.0f, 1.0f, 1.0f));
-  tmp[coord_to_index(img_coord(ind))] = (float8)(0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f);
-  write_imagef(gscal, img_coord(ind), (float4)(scal,0.0f,0.0f,0.0f));
+  __private size_t i;
+  __private size_t s_ind[DIM];
+  __private float sum = 0.0f;
+  // Compute global index
+  for(i=0; i<DIM; i++)
+      s_ind[i] = get_global_id(i);
+  // Reduce grid point weights
+  for(i=0;i<6;i++)
+    sum += tmp[space_index_to_buffer_index(s_ind, i, 8)];
+  // Write grid point new scalar
+  gscal[space_index_to_buffer_index(s_ind, 0, 1)] = sum;
+  // Reset temporary buffer
+  for(i=0;i<6;i++)
+    tmp[space_index_to_buffer_index(s_ind, i, 8)] = 0.0f;
 }
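Note: the six weights am2..ap3 computed in the remeshing kernel form a partition of unity for any normalized distance x0 in [0, 1], which is what makes the remeshing conservative. A quick host-side check of the same polynomials (NumPy, double precision; purely illustrative, not part of the patch):

import numpy as np

def mprime6_weights(x0):
    # Same M'6 polynomials as in the remeshing kernel, evaluated in double precision.
    am2 = (-(x0) * (5. * (x0 + 2.) - 8.) * (x0 - 1.)**3) / 24.
    am = x0 * (x0 - 1.) * (25. * (x0 + 1.)**3 - 114. * (x0 + 1.)**2 + 153. * (x0 + 1.) - 48.) / 24.
    a0 = -(x0 - 1.) * (25. * x0**4 - 38. * x0**3 - 3. * x0**2 + 12. * x0 + 12.) / 12.
    ap = x0 * (25. * (1. - x0)**4 - 38. * (1. - x0)**3 - 3. * (1. - x0)**2 + 12. * (1. - x0) + 12.) / 12.
    ap2 = (1. - x0) * (-x0) * (25. * (2. - x0)**3 - 114. * (2. - x0)**2 + 153. * (2. - x0) - 48.) / 24.
    ap3 = -(1. - x0) * ((5. * (3. - x0) - 8.) * (-x0)**3) / 24.
    return np.array([am2, am, a0, ap, ap2, ap3])

for x0 in np.linspace(0., 1., 11):
    assert abs(mprime6_weights(x0).sum() - 1.) < 1e-10  # weights sum to one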
diff --git a/parmepy/new_ParMePy/Operator/AdvectionDOp.py b/parmepy/new_ParMePy/Operator/AdvectionDOp.py
index 38eab3a0f..de7c78031 100644
--- a/parmepy/new_ParMePy/Operator/AdvectionDOp.py
+++ b/parmepy/new_ParMePy/Operator/AdvectionDOp.py
@@ -6,6 +6,7 @@ Operator representation
 from DiscreteOperator import *
 import numpy as np
 import pyopencl as cl
+import time
 
 providedClass = "AdvectionDOp"
 
@@ -43,6 +44,8 @@ class AdvectionDOp(DiscreteOperator):
         self.resultPosition = respos
         ## Scalar value on new positions
         self.resultScalar = resscal
+        self.compute_time = 0.
+        self.queued_time = 0.
 
     def applyOperator(self, t, dt, splittingDirection):
         """
@@ -56,26 +59,25 @@ class AdvectionDOp(DiscreteOperator):
         #    self.resultPosition.values[i] = self.velocity.domain.applyConditions(self.numMethod.integrate(self.velocity.domain.points[i], self.velocity.values[i], t, dt, splittingDirection))
             #self.resultScalar.values[i] = self.scalar.values[i]
         if self.gpu_kernel is None:
+            c_time = time.time()
             ## ------------------- NumPy Optimisation
             self.resultPosition.values = self.numMethod.integrate(self.velocity.domain.points, self.velocity.values, t, dt, splittingDirection)
             self.resultScalar.values[:] = self.scalar.values[:]
             ## -------------------
+            self.compute_time += (time.time() - c_time)
         else:
-            self.gpu_queue.finish()
-            self.gpu_kernel(self.gpu_queue, self.gpu_shape, None,
+            evt = self.gpu_kernel(self.gpu_queue, self.gpu_shape, None,
                 self.velocity.domain.gpu_mem_object,
                 self.velocity.gpu_mem_object,
+                self.scalar.gpu_mem_object,
                 self.resultPosition.gpu_mem_object,
-                np.float32(t), np.float32(dt), np.int32(splittingDirection),
+                self.resultScalar.gpu_mem_object,
+                np.float32(t), np.float32(dt), np.uint(splittingDirection),
                 np.concatenate((self.velocity.domain.min, [1.])).astype(np.float32),
                 np.concatenate((self.velocity.domain.max, [1.])).astype(np.float32))
-            evt = cl.enqueue_copy(self.gpu_queue, self.resultScalar.gpu_mem_object, self.scalar.gpu_mem_object,
-                src_origin=tuple([0 for x in self.gpu_shape]),
-                dest_origin=tuple([0 for x in self.gpu_shape]),
-                region=self.gpu_shape)
-                #self.scalar.values.fill(0.)
-                #cl.enqueue_copy(self.gpu_queue, self.scalar.gpu_mem_object, self.scalar.values,
-                #origin=tuple([0 for x in self.gpu_shape]), region=self.gpu_shape, wait_for=[evt])
+            self.gpu_queue.finish()
+            self.compute_time += (evt.profile.end - evt.profile.start) / 1e9
+            self.queued_time += (evt.profile.start - evt.profile.queued)
 
     def __str__(self):
         """ToString method"""
diff --git a/parmepy/new_ParMePy/Operator/RemeshingDOp.py b/parmepy/new_ParMePy/Operator/RemeshingDOp.py
index 894352beb..a26ec09b0 100644
--- a/parmepy/new_ParMePy/Operator/RemeshingDOp.py
+++ b/parmepy/new_ParMePy/Operator/RemeshingDOp.py
@@ -7,6 +7,7 @@ from DiscreteOperator import *
 #from ..Utils.Lambda2 import *
 import numpy as np
 import pyopencl as cl
+import time
 
 providedClass = "RemeshingDOp"
 
@@ -43,6 +44,8 @@ class RemeshingDOp(DiscreteOperator):
         self.addVariable([self.ppos, self.pscal, self.pbloc, self.ptag, self.resultScalar])
         self.numMethod = method
         self.gpu_kernel = None
+        self.compute_time = 0.
+        self.queued_time = 0.
 
     def setResultVariable(self, resscal):
         """
@@ -83,36 +86,30 @@ class RemeshingDOp(DiscreteOperator):
         # else:
         #for i in self.resultScalar.domain.explore():
         #    self.numMethod.remesh(self.ppos.values[i], self.pscal.values[i], splittingDirection)
+        c_time = time.time()
         if self.gpu_kernel is None:
             ## ------------------- NumPy Optimisation
             self.resultScalar.values[:] = 0.
             self.numMethod.remesh(self.ppos.values, self.pscal.values, splittingDirection)
             ## -------------------
         else:
-            cl.enqueue_write_buffer(self.gpu_queue, self.gpu_buffer, self.gpu_buffer_values)
-            self.gpu_queue.finish()
-            #print "avant"
-            #print self.gpu_buffer_values[10,10,:8]
-            #print self.gpu_buffer_values[0,15,6]
-            #print "fin -avant"
-            self.gpu_kernel[0](self.gpu_queue, self.gpu_shape, None,
+            evt = self.gpu_kernel[0](self.gpu_queue, self.gpu_shape, None,
                 self.ppos.gpu_mem_object, self.pscal.gpu_mem_object, self.resultScalar.gpu_mem_object, self.gpu_buffer,
                 np.float32(t), np.float32(dt), np.int32(splittingDirection),
                 np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32),
                 np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32))
-            cl.enqueue_copy(self.gpu_queue, self.gpu_buffer_values, self.gpu_buffer)
-            #print "Apres"
-            #print self.gpu_buffer_values[0,0,0,0]
-            #print self.gpu_buffer_values[2,0,0,0]
-            #print self.gpu_buffer_values[10,10,10,0]
-            #print self.gpu_buffer_values[10,10,:8]
-            #print self.gpu_buffer_values[0,15,6]
-            #print "fin apres"
-            self.gpu_kernel[1](self.gpu_queue, self.gpu_shape, None,
+            self.gpu_queue.finish()
+            self.compute_time += (evt.profile.end - evt.profile.start) / 1e9
+            self.queued_time += (evt.profile.start - evt.profile.queued)
+            evt = self.gpu_kernel[1](self.gpu_queue, self.gpu_shape, None,
                 self.resultScalar.gpu_mem_object, self.gpu_buffer,
                 np.float32(t), np.float32(dt), np.int32(splittingDirection),
                 np.concatenate((self.resultScalar.domain.min, [1.])).astype(np.float32),
                 np.concatenate((self.resultScalar.domain.max, [1.])).astype(np.float32))
+            self.gpu_queue.finish()
+            self.compute_time += (evt.profile.end - evt.profile.start) / 1e9
+            self.queued_time += (evt.profile.start - evt.profile.queued)
+        self.compute_time += (time.time() - c_time)
 
     def __str__(self):
         """ToString method"""
diff --git a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py
index 1a37efda7..87de7af55 100644
--- a/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py
+++ b/parmepy/new_ParMePy/Problem/DiscreteTransportProblem.py
@@ -58,6 +58,13 @@ class DiscreteTransportProblem(DiscreteProblem):
             while self.t < T:
                 [[op.applyOperator(self.t, dt * i[1], i[0]) for op in self.operators] for i in self.splittingStep]
                 self.t += dt
+        c_time = 0.
+        q_time = 0.
+        for op in self.operators:
+            c_time += op.compute_time
+            q_time += op.queued_time
+        print "Solving time : ", c_time, "s"
+        print "Queued time : ", q_time, "ns"
 
     def __str__(self):
         """ToString method"""
diff --git a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py
index fa06f3440..7de8c0ac5 100644
--- a/parmepy/new_ParMePy/Utils/GPUParticularSolver.py
+++ b/parmepy/new_ParMePy/Utils/GPUParticularSolver.py
@@ -69,60 +69,49 @@ class GPUParticularSolver(ParticularSolver):
         self.parts = ParticleField(self.grid)
         self.ppos = DiscreteVariable(domain=self.parts, dimension=self.gvelo.dimension)
         self.parts.setPositions(self.ppos)
-        #self.pvelo = DiscreteVariable(domain=self.parts, dimension=self.gvelo.dimension)
         self.pscal = DiscreteVariable(domain=self.parts, dimension=self.gscal.dimension, initialValue=lambda x: 0., scalar=True)
         # Initialization GPU side.
-        # Images2D/Images3D/Buffers creation.
+        # Buffers creation.
         dim = self.grid.dimension
-        shape_dim = list(self.grid.elementNumber)
-        img_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
-        shape_img = tuple(shape_dim + [4])
         self.gpu_shape = tuple(self.grid.elementNumber)
-
-        padded_points = np.zeros(shape_img, dtype=np.float32)
-        padded_points[..., 0:dim] = self.grid.points[:]
-        self.grid.point = padded_points
-        self.grid.gpu_mem_object = cl.Image(self.ctx,
-            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format,
-            shape=self.gpu_shape, hostbuf=self.grid.point)
-
-        padded_velocity = np.zeros(shape_img, dtype=np.float32)
-        padded_velocity[..., 0:dim] = self.gvelo.values[:]
-        self.gvelo.values = padded_velocity
-        self.gvelo.gpu_mem_object = cl.Image(self.ctx,
-            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format,
-            shape=self.gpu_shape, hostbuf=self.gvelo.values)
-
-        scalar_nbytes = self.gscal.values.nbytes
-        padded_scalar = np.zeros(shape_img, dtype=np.float32)
-        padded_scalar[..., 0] = self.gscal.values[:]
-        self.gscal.values = padded_scalar
-        self.gscal.gpu_mem_object = cl.Image(self.ctx,
-            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format,
-            shape=self.gpu_shape, hostbuf=self.gscal.values)
-
-        padded_ppos = np.zeros(shape_img, dtype=np.float32)
-        padded_ppos[..., 0:dim] = self.ppos.values[:]
-        self.ppos.values = padded_ppos
-        self.ppos.gpu_mem_object = cl.Image(self.ctx,
-            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format,
-            shape=self.gpu_shape, hostbuf=self.ppos.values)
-
-        # padded_pvelo = np.zeros(shape_img, dtype=np.float32)
-        # padded_pvelo[..., 0:dim] = self.pvelo.values[:]
-        # self.pvelo.values = padded_pvelo
-        # self.pvelo.gpu_mem_object = cl.Image(self.ctx,
-        #     cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format,
-        #     shape=self.gpu_shape, hostbuf=self.pvelo.values)
-
-        padded_pscal = np.zeros(shape_img, dtype=np.float32)
-        padded_pscal[..., 0] = self.pscal.values[:]
-        self.pscal.values = padded_pscal
-        self.pscal.gpu_mem_object = cl.Image(self.ctx,
-            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, img_format,
-            shape=self.gpu_shape, hostbuf=self.pscal.values)
-
-        # Remeshing : particles positions, particles scalar, particles tag, particles bloc -> grid scalar
+        self.gpu_used_mem = 0
+
+        ## Allocation/copy grid points
+        self.grid.gpu_mem_object = cl.Buffer(self.ctx,
+            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
+            size=self.grid.points.nbytes, hostbuf=self.grid.points)
+        self.gpu_used_mem += self.grid.gpu_mem_object.size
+        print "Allocation grid points on gpu, size:", self.grid.gpu_mem_object.size, 'B'
+
+        ## Allocation/copy grid velocity
+        self.gvelo.gpu_mem_object = cl.Buffer(self.ctx,
+            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
+            size=self.gvelo.values.nbytes, hostbuf=self.gvelo.values)
+        self.gpu_used_mem += self.gvelo.gpu_mem_object.size
+        print "Allocation grid velocity on gpu, size:", self.gvelo.gpu_mem_object.size, 'B'
+
+        ## Allocation/copy grid scalar
+        self.gscal.gpu_mem_object = cl.Buffer(self.ctx,
+            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
+            size=self.gscal.values.nbytes, hostbuf=self.gscal.values)
+        self.gpu_used_mem += self.gscal.gpu_mem_object.size
+        print "Allocation grid scalar on gpu, size:", self.gscal.gpu_mem_object.size, 'B'
+
+        ## Allocation/copy particles positions
+        self.ppos.gpu_mem_object = cl.Buffer(self.ctx,
+            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
+            size=self.ppos.values.nbytes, hostbuf=self.ppos.values)
+        self.gpu_used_mem += self.ppos.gpu_mem_object.size
+        print "Allocation particles positions on gpu, size:", self.ppos.gpu_mem_object.size, 'B'
+
+        ## Allocation/copy particles scalar
+        self.pscal.gpu_mem_object = cl.Buffer(self.ctx,
+            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
+            size=self.pscal.values.nbytes, hostbuf=self.pscal.values)
+        self.gpu_used_mem += self.pscal.gpu_mem_object.size
+        print "Allocation particles scalar on gpu, size:", self.pscal.gpu_mem_object.size, 'B'
+
+        ## Remeshing : particles positions, particles scalar, particles tag, particles bloc -> grid scalar
         remesh = RemeshingDOp(partPositions=self.ppos,
                               partScalar=self.pscal,
                               partBloc=self.pscal,
@@ -130,24 +119,34 @@ class GPUParticularSolver(ParticularSolver):
                               resscal=discreteProblem.advecOp.scalar,
                               method=self.RemeshingMethod)
         discreteProblem.addOperator([remesh])
+
         discreteProblem.advecOp.setResultVariable(respos=self.ppos, resscal=self.pscal)
         discreteProblem.advecOp.setMethod(self.ODESolver)
 
         # Kernels creation/compilation
         f = open("gpu_src.cl", 'r')
         self.gpu_src = "".join(f.readlines())
-        self.prg = cl.Program(self.ctx, self.gpu_src).build()
+        self.prg = cl.Program(self.ctx, self.gpu_src).build("-cl-single-precision-constant -cl-opt-disable -D ORDER='C' -D DIM=" + str(dim))
+        print self.prg.get_build_info(self.device, cl.program_build_info.LOG)
+        print self.prg.get_build_info(self.device, cl.program_build_info.OPTIONS)
+        print self.prg.get_build_info(self.device, cl.program_build_info.STATUS)
         discreteProblem.advecOp.gpu_queue = self.queue
         discreteProblem.advecOp.gpu_shape = self.gpu_shape
         discreteProblem.advecOp.gpu_kernel = self.prg.integrate
         remesh.gpu_queue = self.queue
         remesh.gpu_shape = self.gpu_shape
         remesh.gpu_kernel = (self.prg.remeshing, self.prg.remeshing_reduce)
-        remesh.gpu_buffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=scalar_nbytes * 8)
+        remesh.gpu_buffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=self.gscal.values.nbytes * 8)
+        self.gpu_used_mem += remesh.gpu_buffer.size
+        print "Allocation remeshing buffer on gpu, size:", remesh.gpu_buffer.size, 'B'
         buffer_shape = list(self.gpu_shape)
         buffer_shape.append(8)
         remesh.gpu_buffer_values = np.zeros(tuple(buffer_shape), dtype=np.float32)
 
+        self.queue.finish()
+        p = self.gpu_used_mem * 100 / (self.device.global_mem_size * 1.)
+        print "Total memory allocated on gpu :", self.gpu_used_mem, "B (", p, " % of total available memory)"
+
         # Splitting Step
         splitting = []
         if self.splittingMethod == 'strang':
@@ -161,12 +160,11 @@ class GPUParticularSolver(ParticularSolver):
     def collect_data(self):
         o = tuple([0 for x in self.gpu_shape])
         self.queue.finish()
-        cl.enqueue_copy(self.queue, self.grid.point, self.grid.gpu_mem_object, origin=o, region=self.gpu_shape)
-        cl.enqueue_copy(self.queue, self.gvelo.values, self.gvelo.gpu_mem_object, origin=o, region=self.gpu_shape)
-        cl.enqueue_copy(self.queue, self.gscal.values, self.gscal.gpu_mem_object, origin=o, region=self.gpu_shape)
-        cl.enqueue_copy(self.queue, self.ppos.values, self.ppos.gpu_mem_object, origin=o, region=self.gpu_shape)
-        #cl.enqueue_copy(self.queue, self.pvelo.values, self.pvelo.gpu_mem_object, origin=o, region=self.gpu_shape)
-        cl.enqueue_copy(self.queue, self.pscal.values, self.pscal.gpu_mem_object, origin=o, region=self.gpu_shape)
+        cl.enqueue_copy(self.queue, self.grid.points, self.grid.gpu_mem_object)
+        cl.enqueue_copy(self.queue, self.gvelo.values, self.gvelo.gpu_mem_object)
+        cl.enqueue_copy(self.queue, self.gscal.values, self.gscal.gpu_mem_object)
+        cl.enqueue_copy(self.queue, self.ppos.values, self.ppos.gpu_mem_object)
+        cl.enqueue_copy(self.queue, self.pscal.values, self.pscal.gpu_mem_object)
 
     def terminate(self):
         self.queue.finish()
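Note: with cl.Buffer the host/device transfers above no longer take origin/region arguments; the transfer size is inferred from the NumPy array. A minimal round-trip sketch of the pattern used for each field (the array shape is an illustrative assumption):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

# Stand-in for one field; the real arrays come from DiscreteVariable.values.
values = np.random.rand(32, 32, 2).astype(np.float32)

# Allocate and copy host -> device in one call, as done for each field above.
gpu_mem_object = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR,
                           size=values.nbytes, hostbuf=values)

# ... kernels would run here ...

# Copy device -> host; no origin/region needed, unlike the old Image path.
cl.enqueue_copy(queue, values, gpu_mem_object)
queue.finish()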
diff --git a/parmepy/new_ParMePy/Utils/Printer.py b/parmepy/new_ParMePy/Utils/Printer.py
index f401f9235..02707c020 100644
--- a/parmepy/new_ParMePy/Utils/Printer.py
+++ b/parmepy/new_ParMePy/Utils/Printer.py
@@ -34,12 +34,8 @@ class Printer():
         self.grid = self.problem.discreteProblem.advecOp.velocity.domain
         self.gvelo = self.problem.discreteProblem.advecOp.velocity.values
         self.ppos = self.problem.discreteProblem.solver.ppos.values
-        if isinstance(self.problem.discreteProblem.solver, GPUParticularSolver):
-            self.gscal = self.problem.discreteProblem.advecOp.scalar.values[...,0]
-            self.pscal = self.problem.discreteProblem.solver.pscal.values[...,0]
-        else:
-            self.pscal = self.problem.discreteProblem.solver.pscal.values
-            self.gscal = self.problem.discreteProblem.advecOp.scalar.values
+        self.pscal = self.problem.discreteProblem.solver.pscal.values
+        self.gscal = self.problem.discreteProblem.advecOp.scalar.values
 
         #self.pvelo = self.problem.discreteProblem.solver.pvelo
         # self.pbloc = self.problem.discreteProblem.solver.pbloc
diff --git a/parmepy/new_ParMePy/Variable/AnalyticalVariable.py b/parmepy/new_ParMePy/Variable/AnalyticalVariable.py
index c2b82f071..4c1914dd3 100644
--- a/parmepy/new_ParMePy/Variable/AnalyticalVariable.py
+++ b/parmepy/new_ParMePy/Variable/AnalyticalVariable.py
@@ -31,14 +31,14 @@ class AnalyticalVariable(ContinuousVariable):
         ## Is scalar variable
         self.scalar = scalar
 
-    def discretize(self, spec=None):
+    def discretize(self, d_spec=None):
         """
         AnalyticalVariable discretization method.
         Create an DiscreteVariable.DiscreteVariable from given specifications.
 
-        @param spec : discretization specifications, not used.
+        @param d_spec : discretization specifications, forwarded to the DiscreteVariable constructor.
         """
-        self.discreteVariable = DiscreteVariable(domain=self.domain.discreteDomain, dimension=self.dimension, initialValue=self.formula, scalar=self.scalar)
+        self.discreteVariable = DiscreteVariable(domain=self.domain.discreteDomain, dimension=self.dimension, initialValue=self.formula, scalar=self.scalar, spec=d_spec)
 
     def value(self, pos):
         """
diff --git a/parmepy/new_ParMePy/Variable/DiscreteVariable.py b/parmepy/new_ParMePy/Variable/DiscreteVariable.py
index 51e649327..cc28c8fc6 100644
--- a/parmepy/new_ParMePy/Variable/DiscreteVariable.py
+++ b/parmepy/new_ParMePy/Variable/DiscreteVariable.py
@@ -14,7 +14,7 @@ class DiscreteVariable:
     Discrete variables abstract description
     """
 
-    def __init__(self, domain=None, dimension=None, initialValue=lambda x: x, integer=False, scalar=False):
+    def __init__(self, domain=None, dimension=None, initialValue=lambda x: x, integer=False, scalar=False, spec=None):
         """
         Constructor.
         Create a DiscreteVariable from a given ContinuousVariable.ContinuousVariable or a domain.
@@ -52,14 +52,24 @@ class DiscreteVariable:
         ### ------------------
         # New grid creation with numpy
         shape = list(self.domain.elementNumber)
-        if not scalar:
-            shape.append(self.dimension)
-        if integer:
-            self.values = np.zeros(shape, dtype=int)
+        if spec is None:
+            if not scalar:
+                shape.append(self.dimension)
+            if integer:
+                self.values = np.zeros(shape, dtype=int)
+            else:
+                self.values = np.zeros(shape, dtype=np.float32)
+            for ind in self.domain.explore():
+                self.values[ind] = initialValue(self.domain[ind])
         else:
-            self.values = np.zeros(shape)
-        for ind in self.domain.explore():
-            self.values[ind] = initialValue(self.domain[ind])
+            shape.append(4)
+            self.values = np.zeros(shape, dtype=np.float32)
+            if scalar:
+                for ind in self.domain.explore():
+                    self.values[ind][0] = initialValue(self.domain[ind])
+            else:
+                for ind in self.domain.explore():
+                    self.values[ind][:self.dimension] = initialValue(self.domain[ind])
         self.gpu_mem_object = None
 
     def __str__(self):
-- 
GitLab