From b50214768269f2ffbdc736b9551b59d0a2cc1cef Mon Sep 17 00:00:00 2001
From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr>
Date: Fri, 11 May 2012 08:49:54 +0000
Subject: [PATCH] kernel benchmark improvements

---
 parmepy/kernel_benchmark/test_advec.py        |  23 ++--
 parmepy/kernel_benchmark/test_remesh.py       |  79 +++++++------
 .../test_remesh_base_kernel.cl                |  13 ++-
 .../test_remesh_img_kernel.cl                 | 105 +-----------------
 4 files changed, 72 insertions(+), 148 deletions(-)

diff --git a/parmepy/kernel_benchmark/test_advec.py b/parmepy/kernel_benchmark/test_advec.py
index 778603dfe..3bb0d4894 100644
--- a/parmepy/kernel_benchmark/test_advec.py
+++ b/parmepy/kernel_benchmark/test_advec.py
@@ -163,38 +163,41 @@ if __name__ == "__main__":
 
     pl.figure(1)
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, times[d][0], label="base d=" + str(d))
+        pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d))
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, times[d][1], label="img d=" + str(d))
+        pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d))
     pl.legend(("base d=0", "base d=1", "base d=2",
-               "img d=0", "img d=1", "img d=2"), loc='center right')
-    pl.axis([0, 13000000, 0, 0.08])
+               "img d=0", "img d=1", "img d=2"), loc='center left')
+    #pl.axis([0, 13000000, 0, 0.08])
     pl.xlabel("Nb particles")
     pl.ylabel("time (s)")
     pl.title("Advection times")
+    pl.xscale('log')
 
     pl.figure(2)
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, flops[d][0], label="base d=" + str(d))
+        pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d))
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, flops[d][1], label="img d=" + str(d))
+        pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d))
     pl.legend(("base d=0", "base d=1", "base d=2",
-               "img d=0", "img d=1", "img d=2"), loc='center right')
-    pl.axis([0, 13000000, 0, 5.0])
+               "img d=0", "img d=1", "img d=2"), loc='center left')
+    #pl.axis([0, 13000000, 0, 5.0])
     pl.xlabel("Nb particles")
     pl.ylabel("GFlops")
     pl.title("Advection flops")
+    pl.xscale('log')
 
     pl.figure(3)
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, bw[d][0], label="base d=" + str(d))
+        pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d))
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, bw[d][1], label="img d=" + str(d))
+        pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d))
     pl.legend(("base d=0", "base d=1", "base d=2",
                "img d=0", "img d=1", "img d=2"), loc='upper left')
     pl.xlabel("Nb particles")
     pl.ylabel("GB/s")
     pl.title("Advection bw")
+    pl.xscale('log')
 
     signal.signal(signal.SIGINT, signal.SIG_DFL)
     pl.show()
diff --git a/parmepy/kernel_benchmark/test_remesh.py b/parmepy/kernel_benchmark/test_remesh.py
index 3299d3a08..360c17781 100644
--- a/parmepy/kernel_benchmark/test_remesh.py
+++ b/parmepy/kernel_benchmark/test_remesh.py
@@ -26,6 +26,7 @@ queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_EN
 
 def main(sdir, nb_run):
     print "sDir", sdir, "   nbPart", nb, "**3"
+    # Creation des tableaux numpy
     grid_shape = (nb, nb, nb, 3)
     grid_size = 1. / np.array(grid_shape[0:grid_dim], dtype=dtype_real)
     ppos = np.empty((nb, nb, nb), dtype=dtype_real)
@@ -41,13 +42,23 @@ def main(sdir, nb_run):
     pscal = np.random.random_sample((nb, nb, nb)).astype(dtype_real)
     gscal = np.empty_like(pscal).astype(dtype_real)
 
-    avance = np.random.random_sample((nb, nb, nb)).astype(dtype_real)
+    avance = np.random.random_sample((nb, nb, nb)).astype(dtype_real) * 0.9
     ppos += grid_size[sdir] * avance
 
+    # Création des buffer pour le kernel de base
     ppos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=ppos.nbytes)
     pscal_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=pscal.nbytes)
     gscal_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=gscal.nbytes)
 
+    # Création des images3d pour le nouveau kernel
+    ## Image grid_img : R : vitesse.x
+    ##                  G : vitesse.y
+    ##                  B : vitesse.z
+    ##                  A : scalaire
+    ## Image part_img : R : position (dans la direction de splitting)
+    ##                  G : -
+    ##                  B : distance au point de grille de gauche
+    ##                  A : scalaire
     img_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
     grid_rgba = np.zeros((nb, nb, nb, 4), dtype=dtype_real)
     grid_img = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, img_fmt, shape=(nb, nb, nb))
@@ -57,8 +68,15 @@ def main(sdir, nb_run):
     part_rgba[..., 3] = pscal
     part_img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_fmt, shape=(nb, nb, nb))
 
-    cl.enqueue_copy(queue, ppos_buf, ppos)
+    # Envoi des données pour les buffer (mémoire organisée pour un acces contigu)
+    if sdir == 0:
+        cl.enqueue_copy(queue, ppos_buf, ppos.ravel())
+    elif sdir == 1:
+        cl.enqueue_copy(queue, ppos_buf, ppos.swapaxes(1, 0).ravel())
+    else:
+        cl.enqueue_copy(queue, ppos_buf, ppos.swapaxes(1, 0).swapaxes(2, 0).ravel())
     cl.enqueue_copy(queue, pscal_buf, pscal)
+    # Envoi des données sur l'image3d
     cl.enqueue_copy(queue, part_img, part_rgba, origin=(0, 0, 0), region=(nb, nb, nb))
 
     queue.finish()
@@ -74,6 +92,8 @@ def main(sdir, nb_run):
     f.close()
     prg = cl.Program(ctx, gpu_src).build("-D DIM=" + str(grid_dim) + " -D MAX_LINE_POINTS=" + str(nb))
 
+    ## Run du kernel de base (buffers) (résultat de référence)
+    queue.finish()
     t = 0.
     for i in xrange(nb_run):
         evt = prg.remeshing(queue, (nb, nb), (1, nb),
@@ -96,7 +116,7 @@ def main(sdir, nb_run):
     queue.finish()
     #print gscal
 
-
+    ## Run kernel avec image3d
     t = 0.
     for i in xrange(nb_run):
         evt = prg.remeshing_img(queue, (nb, nb), (1, nb),
@@ -109,26 +129,18 @@ def main(sdir, nb_run):
                 )
         queue.finish()
         t += (evt.profile.end - evt.profile.start) * 1e-9
-    time_base = t / (nb_run * 1.)
-    times[sdir][1].append(time_base)
-    flops[sdir][1].append(1e-9 * (71 * nb ** grid_dim) / time_base)
-    bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_base)
-    print "img\t", time_base,
+    time_img = t / (nb_run * 1.)
+    times[sdir][1].append(time_img)
+    flops[sdir][1].append(1e-9 * (74 * nb ** grid_dim) / time_img)
+    bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_img)
+    print "img\t", time_img,
     temp1 = np.empty_like(gscal)
     cl.enqueue_copy(queue, grid_rgba, grid_img, origin=(0, 0, 0), region=(nb, nb, nb))
     queue.finish()
-    ## Comparaison des résultats pour la position des particules (grid.A)
-    if sdir == 0:
-        print np.max(np.abs(gscal - grid_rgba[..., 3]))
-        #print grid_rgba[..., 3]
-    elif sdir == 1:
-        print np.max(np.abs(gscal - grid_rgba[..., 3].swapaxes(0, 1)))
-        #print grid_rgba[..., 3].swapaxes(0, 1)
-    elif sdir == 2:
-        print np.max(np.abs(gscal - grid_rgba[..., 3].swapaxes(0, 1).swapaxes(0, 2)))
-        #print grid_rgba[..., 3].swapaxes(0, 1).swapaxes(0, 2)
+    ## Comparaison des résultats pour le scalaire sur la grille (grid.A)
     #print grid_rgba[..., 3]
-    print np.max(np.abs(gscal - grid_rgba[..., 3]))
+    print np.max(np.abs(gscal - grid_rgba[..., 3])),
+    print 'gain :', time_base / time_img
 
 
 if __name__ == "__main__":
@@ -139,9 +151,9 @@ if __name__ == "__main__":
     times = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
     flops = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
     bw = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
-    #main(0, nb_run)
-    #main(1, nb_run)
-    #main(2, nb_run)
+    # main(0, nb_run)
+    # main(1, nb_run)
+    # main(2, nb_run)
 
     for nb in sizes:
         print "==== {0}**3 ====".format(nb)
@@ -151,38 +163,41 @@ if __name__ == "__main__":
 
     pl.figure(1)
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, times[d][0], label="base d=" + str(d))
+        pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d))
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, times[d][1], label="img d=" + str(d))
+        pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d))
     pl.legend(("base d=0", "base d=1", "base d=2",
-               "img d=0", "img d=1", "img d=2"), loc='center right')
-    pl.axis([0, 13000000, 0, 0.2])
+               "img d=0", "img d=1", "img d=2"), loc='center left')
+    #pl.axis([0, 13000000, 0, 0.2])
     pl.xlabel("Nb particles")
     pl.ylabel("time (s)")
     pl.title("Remeshing times")
+    pl.xscale('log')
 
     pl.figure(2)
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, flops[d][0], label="base d=" + str(d))
+        pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d))
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, flops[d][1], label="img d=" + str(d))
+        pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d))
     pl.legend(("base d=0", "base d=1", "base d=2",
-               "img d=0", "img d=1", "img d=2"), loc='center right')
-    pl.axis([0, 13000000, 0, 20])
+               "img d=0", "img d=1", "img d=2"), loc='center left')
+    #pl.axis([0, 13000000, 0, 20])
     pl.xlabel("Nb particles")
     pl.ylabel("GFlops")
     pl.title("Remeshing flops")
+    pl.xscale('log')
 
     pl.figure(3)
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, bw[d][0], label="base d=" + str(d))
+        pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d))
     for d in xrange(grid_dim):
-        pl.plot(t_sizes, bw[d][1], label="img d=" + str(d))
+        pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d))
     pl.legend(("base d=0", "base d=1", "base d=2",
                "img d=0", "img d=1", "img d=2"), loc='upper left')
     pl.xlabel("Nb particles")
     pl.ylabel("GB/s")
     pl.title("Remeshing bw")
+    pl.xscale('log')
 
     signal.signal(signal.SIGINT, signal.SIG_DFL)
     pl.show()
diff --git a/parmepy/kernel_benchmark/test_remesh_base_kernel.cl b/parmepy/kernel_benchmark/test_remesh_base_kernel.cl
index 8d8d46edf..0d0bf49d0 100644
--- a/parmepy/kernel_benchmark/test_remesh_base_kernel.cl
+++ b/parmepy/kernel_benchmark/test_remesh_base_kernel.cl
@@ -20,7 +20,7 @@ __kernel void remeshing(__global const float* part,
   __private uint ind_line, ind_line_scal;  // Line begin buffer index
   __private int ind_c;  // Remeshing grid point
   __private float x0;  // Remeshing distance from nearest left grid point
-  __private float dx_dir;  // Space step along line
+  __private float dx_dir, min_dir;  // Space step along line
   __private float ppos_c;  // Particle position temp along line
   __private float pscal_c;  // Particle scalar along line
   __private float weights[6];  // Remeshing weights
@@ -34,18 +34,21 @@ __kernel void remeshing(__global const float* part,
 	kernel_init_3_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
 	ind_m = get_global_id(0)*nb.z+get_global_id(1);
 	stride_m = nb.y*nb.z;
+	min_dir = min_v.x;
       }
     else if (dir == 1)
       {
 	kernel_init_3_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
 	ind_m = get_global_id(0)*nb.z+get_global_id(1);
 	stride_m = nb.x*nb.z;
+	min_dir = min_v.y;
       }
     else
       {
 	kernel_init_3_2(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
 	ind_m = get_global_id(0)*nb.y+get_global_id(1);
 	stride_m = nb.x*nb.y;
+	min_dir = min_v.z;
       }}
   else{
     if (dir == 0)
@@ -53,12 +56,14 @@ __kernel void remeshing(__global const float* part,
 	kernel_init_2_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
 	ind_m = get_global_id(0);
 	stride_m = nb.y;
+	min_dir = min_v.x;
       }
     else
       {
 	kernel_init_2_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
 	ind_m = get_global_id(0);
 	stride_m = nb.x;
+	min_dir = min_v.y;
       }
   }
 
@@ -73,7 +78,7 @@ __kernel void remeshing(__global const float* part,
       pscal_c = pscal[ind_line_scal + p*stride_along_dir_scal]; // Read : 1float
       // Compute nearest left grid point
       // Compute nearest left grid point distance
-      x0 = remquo_rtn(ppos_c - min_v[dir], dx_dir, &ind_c)/dx_dir; //5flop
+      x0 = remquo_rtn(ppos_c - min_dir, dx_dir, &ind_c)/dx_dir; //5flop
       // Compute remeshing weights
       weights[0] = x0 * (x0 * (x0 * (x0 * (13.0f - 5.0f * x0) - 9.0f) - 1.0f) + 2.0f) / 24.0f; // 10flop
       weights[1] = x0 * (x0 * (x0 * (x0 * (25.0f * x0 - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f; //10flop
@@ -94,14 +99,10 @@ __kernel void remeshing(__global const float* part,
       gscal_buffer_loc[i] += weights[4] * pscal_c; // 2flop
       i = (i+1)%line_nb_pts;
       gscal_buffer_loc[i] += weights[5] * pscal_c; // 2flop
-      //debug
-      //gscal_buffer_loc[p] = ppos_c;
     }
   i = ind_line_scal;
   for(p=0;p<line_nb_pts;p++)
     gscal[i + p*stride_along_dir_scal] = gscal_buffer_loc[p]; // Write : 1float
-  //debug
-  //gscal[i + p*stride_along_dir_scal] = i + p*stride_along_dir_scal;
 }
 
 
diff --git a/parmepy/kernel_benchmark/test_remesh_img_kernel.cl b/parmepy/kernel_benchmark/test_remesh_img_kernel.cl
index 2c99ebf88..242f45ca6 100644
--- a/parmepy/kernel_benchmark/test_remesh_img_kernel.cl
+++ b/parmepy/kernel_benchmark/test_remesh_img_kernel.cl
@@ -15,7 +15,7 @@ __kernel void remeshing_img(__read_only image3d_t part,
 {
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
   __private uint p, i;
-  __private float buffer_loc[MAX_LINE_POINTS];
+  __private float buffer_loc[MAX_LINE_POINTS]__attribute__((aligned));
   __private float4 part_px, grid_px;
   __private int4 coord;
   __private int coord_t[3], ind, nb_dir;
@@ -53,10 +53,9 @@ __kernel void remeshing_img(__read_only image3d_t part,
       coord = (int4)(coord_t[2], coord_t[1], coord_t[0], 0);
       part_px = read_imagef(part, sampler, coord); // Read 1 pixel (4float)
       // Compute remeshing weights
-      x0 = part_px.z;
-      //ind = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir;
       ind = convert_int_rtn(part_px.x/dx_dir); // 1flop
-      //x0 =  (part_px.x - ind * dx_dir)/dx_dir;
+      //x0 = part_px.z; // lecture de la distance si déja calculée par advection
+      x0 =  (part_px.x - ind * dx_dir)/dx_dir; // 3flop
       weights[0] = x0 * (x0 * (x0 * (x0 * (13.0f - 5.0f * x0) - 9.0f) - 1.0f) + 2.0f) / 24.0f; // 10flop
       weights[1] = x0 * (x0 * (x0 * (x0 * (25.0f * x0 - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f; //10flop
       weights[2] = x0 * x0 * (x0 * (x0 * (126.0f - 50.0f * x0) - 70.0f) - 30.0f) / 24.0f + 1.0f; //10flop
@@ -76,111 +75,17 @@ __kernel void remeshing_img(__read_only image3d_t part,
       buffer_loc[i] += weights[4] * part_px.w; // 2flop
       i = (i+1)%line_nb_pts;
       buffer_loc[i] += weights[5] * part_px.w; // 2flop
-      //debug
-      //buffer_loc[p] = part_px.x;
     }
 
-
   for(p=0;p<line_nb_pts;p++)
     {
-      //read particle informations
+      //write pixel
       coord_t[dir] = p;
       coord = (int4)(coord_t[2], coord_t[1], coord_t[0], 0);
+      // /!\ : grid_px.xyz correspond à la vitesse sur la grille qui est écrasée !!
       grid_px = (float4)(0.0f, 0.0f, 0.0f, buffer_loc[p]);
-      //debug
-      //grid_px.w = get_global_id(1) + get_global_id(0) * nb.z * nb.y + p*nb.z;
       write_imagef(grid, coord, grid_px); // write 1 pixel (4float)
-    }
-
-
-
-  /*
-  __private uint i;  // DIM loop index
-  __private uint p;  // Particle line loop index
-  __private uint stride_along_dir, stride_along_dir_scal;   // Buffer stride between two particle of the same line
-  __private uint ind_line, ind_line_scal;  // Line begin buffer index
-  __private int ind_c;  // Remeshing grid point
-  __private float x0;  // Remeshing distance from nearest left grid point
-  __private float dx_dir;  // Space step along line
-  __private float ppos_c;  // Particle position temp along line
-  __private float pscal_c;  // Particle scalar along line
-  __private float weights[6];  // Remeshing weights
-  __private float gscal_buffer_loc[MAX_LINE_POINTS]; // Local buffer to store gscal for the current line
-  __private uint ind_m, stride_m;
-  // Compute strides and index. Depend on array order (C order here)
-  dx_dir = size[dir];
-  if(DIM == 3){
-    if (dir == 0)
-      {
-	kernel_init_3_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
-	ind_m = get_global_id(0)*nb.z+get_global_id(1);
-	stride_m = nb.y*nb.z;
-      }
-    else if (dir == 1)
-      {
-	kernel_init_3_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
-	ind_m = get_global_id(0)*nb.z+get_global_id(1);
-	stride_m = nb.x*nb.z;
       }
-    else
-      {
-	kernel_init_3_2(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
-	ind_m = get_global_id(0)*nb.y+get_global_id(1);
-	stride_m = nb.x*nb.y;
-      }}
-  else{
-    if (dir == 0)
-      {
-	kernel_init_2_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
-	ind_m = get_global_id(0);
-	stride_m = nb.y;
-      }
-    else
-      {
-	kernel_init_2_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal);
-	ind_m = get_global_id(0);
-	stride_m = nb.x;
-      }
-  }
-
-  // Initialise grid scalar
-  for(p=0;p<line_nb_pts;p++)
-    gscal_buffer_loc[p] = 0.0f;
-  for(p=0;p<line_nb_pts;p++)
-    {
-      // read particle position
-      ppos_c = part[ind_m + p*stride_m]; // Read : 1float (4bytes)
-      // read particle scalar
-      pscal_c = pscal[ind_line_scal + p*stride_along_dir_scal]; // Read : 1float
-      // Compute nearest left grid point
-      // Compute nearest left grid point distance
-      x0 = remquo_rtn(ppos_c - min_v[dir], dx_dir, &ind_c)/dx_dir; //5flop
-      // Compute remeshing weights
-      weights[0] = x0 * (x0 * (x0 * (x0 * (13.0f - 5.0f * x0) - 9.0f) - 1.0f) + 2.0f) / 24.0f; // 10flop
-      weights[1] = x0 * (x0 * (x0 * (x0 * (25.0f * x0 - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f; //10flop
-      weights[2] = x0 * x0 * (x0 * (x0 * (126.0f - 50.0f * x0) - 70.0f) - 30.0f) / 24.0f + 1.0f; //10flop
-      weights[3] = x0 * (x0 * (x0 * (x0 * (50.0f * x0 - 124.0f) + 66.0f) + 16.0f) + 16.0f) / 24.0f; //10flop
-      weights[4] = x0 * (x0 * (x0 * (x0 * (61.0f - 25.0f * x0) - 33.0f) - 1.0f) - 2.0f) / 24.0f; //10flop
-      weights[5] = x0 * x0 * x0 * (x0 * (5.0f * x0 - 12.0f) + 7.0f) / 24.0f; // 8flop
-      // Apply remeshing weights
-      i = (ind_c-2+line_nb_pts)%line_nb_pts;
-      gscal_buffer_loc[i] += weights[0] * pscal_c; // 2flop
-      i = (i+1)%line_nb_pts;
-      gscal_buffer_loc[i] += weights[1] * pscal_c; // 2flop
-      i = (i+1)%line_nb_pts;
-      gscal_buffer_loc[i] += weights[2] * pscal_c; // 2flop
-      i = (i+1)%line_nb_pts;
-      gscal_buffer_loc[i] += weights[3] * pscal_c; // 2flop
-      i = (i+1)%line_nb_pts;
-      gscal_buffer_loc[i] += weights[4] * pscal_c; // 2flop
-      i = (i+1)%line_nb_pts;
-      gscal_buffer_loc[i] += weights[5] * pscal_c; // 2flop
-    }
-  i = ind_line_scal;
-  for(p=0;p<line_nb_pts;p++)
-    gscal[i + p*stride_along_dir_scal] = p;
-    //gscal[i + p*stride_along_dir_scal] = gscal_buffer_loc[p]; // Write : 1float
-    */
 }
 
 
-- 
GitLab