From c4d249b1bc22cd5c4ce6716677ebaaf064d37d56 Mon Sep 17 00:00:00 2001
From: Christophe Picard <Christophe.Picard@imag.fr>
Date: Wed, 16 May 2012 13:46:04 +0000
Subject: [PATCH] Test Img2d

---
 parmepy/kernel_benchmark/test_advec2d.py      | 211 ++++++++++++++++++
 .../test_advec_img2d_kernel.cl                |  82 +++++++
 2 files changed, 293 insertions(+)
 create mode 100755 parmepy/kernel_benchmark/test_advec2d.py
 create mode 100644 parmepy/kernel_benchmark/test_advec_img2d_kernel.cl

diff --git a/parmepy/kernel_benchmark/test_advec2d.py b/parmepy/kernel_benchmark/test_advec2d.py
new file mode 100755
index 000000000..64a5902c2
--- /dev/null
+++ b/parmepy/kernel_benchmark/test_advec2d.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pyopencl as cl
+import numpy as np
+import pylab as pl
+import signal
+
+nb = 8
+grid_dim = 2
+dtype_integer = np.uint32
+dtype_real = np.float32
+grid_shape = (nb, nb, 2)
+grid_size = 1. / np.array(grid_shape[0:grid_dim], dtype=dtype_real)
+times = None
+flops = None
+bw = None
+
+#Get platform.
+platform = cl.get_platforms()[0]
+#Get device.
+device = platform.get_devices(cl.device_type.GPU)[0]
+print "Running on", device.name, "of", platform.name, "platform."
+#Creates GPU Context
+ctx = cl.Context([device])
+#Create CommandQueue on the GPU Context
+queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+
+def main(sdir, nb_run):
+    print "sDir", sdir, "   nbPart", nb, "**3"
+    # Creation des tableaux numpy
+    ppos = np.empty((nb, nb), dtype=dtype_real)
+    pscal = np.empty((nb, nb), dtype=dtype_real)
+    gvelo = np.empty((nb, nb, grid_dim), dtype=dtype_real)
+    ## Initialisation aléatoire des vitesses et scalaires
+    gvelo = np.random.random_sample((nb, nb, grid_dim)).astype(dtype_real) / (nb * 10.)
+    gscal = np.random.random_sample((nb, nb)).astype(dtype_real)
+
+    # Création des buffer pour le kernel de base
+    ppos_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=ppos.nbytes)
+    pscal_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=pscal.nbytes)
+    gvelo_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=gvelo.nbytes)
+    gscal_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=gscal.nbytes)
+
+    # Création des images3d pour le nouveau kernel
+    ## Image grid_img : R : vitesse.x
+    ##                  G : vitesse.y
+    ##                  B : vitesse.z
+    ##                  A : scalaire
+    ## Image part_img : R : position (dans la direction de splitting)
+    ##                  G : -
+    ##                  B : distance au point de grille de gauche
+    ##                  A : scalaire
+    grid_rgba = np.zeros((nb, nb, 4), dtype=dtype_real)
+    grid_rgba[..., 0:2] = gvelo
+    grid_rgba[..., 3] = gscal
+    part_rgba = np.zeros((nb, nb, 4), dtype=dtype_real)
+
+    print gvelo.shape
+    print grid_rgba.shape
+    img_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
+    grid_img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_fmt, (nb, nb))
+    part_img = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, img_fmt, (nb, nb))
+
+    # Envoi des données pour les buffer (mémoire organisée pour un acces contigu)
+    offset = 0
+    cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 0].ravel(), device_offset=np.long(offset))
+    offset += gvelo[..., 0].nbytes
+    cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 1].swapaxes(1, 0).ravel(), device_offset=np.long(offset))
+    offset += gvelo[..., 1].nbytes
+    #cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 2].swapaxes(1, 0).swapaxes(2, 0).ravel(), device_offset=np.long(offset))
+    cl.enqueue_copy(queue, gscal_buf, gscal)
+    # Envoi des données sur l'image3d
+    cl.enqueue_copy(queue, grid_img, grid_rgba, origin=(0, 0), region=(nb, nb))
+
+    gpu_src = ""
+    f = open("./utils.h", 'r')
+    gpu_src += "".join(f.readlines())
+    f.close()
+    f = open("./test_advec_base_kernel.cl", 'r')
+    gpu_src += "".join(f.readlines())
+    f.close()
+    f = open("./test_advec_img2d_kernel.cl", 'r')
+    gpu_src += "".join(f.readlines())
+    f.close()
+    print cl.device_info
+    prg = cl.Program(ctx, gpu_src).build("-D DIM=" + str(grid_dim) + " -D MAX_LINE_POINTS=" + str(nb))
+
+    ## Run du kernel de base (buffers) (résultat de référence)
+    queue.finish()
+    t = 0.
+    for i in xrange(nb_run):
+        evt = prg.advection(queue, tuple((nb,)), None,
+                            gvelo_buf, ppos_buf,
+                            dtype_real(0.), dtype_real(1.), dtype_integer(sdir), dtype_integer(nb),
+                            np.array([1., 2., 3., 4.], dtype=dtype_real),
+                            np.array([1., 1., 1., 1.], dtype=dtype_real),
+                            np.array([nb, nb, nb, 0], dtype=dtype_integer),
+                            np.array([1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb)], dtype=dtype_real)
+                           )
+        queue.finish()
+        t += (evt.profile.end - evt.profile.start) * 1e-9
+        evt = cl.enqueue_copy(queue, pscal_buf, gscal_buf)
+        queue.finish()
+        t += (evt.profile.end - evt.profile.start) * 1e-9
+        time_base = t / (nb_run * 1.)
+        times[sdir][0].append(time_base)
+        flops[sdir][0].append(1e-9 * (20 * nb ** grid_dim) / time_base)
+        bw[sdir][0].append(1e-9 * (2 * 4 * nb ** grid_dim) / time_base)
+        print "base\t", time_base
+        cl.enqueue_copy(queue, ppos, ppos_buf)
+        cl.enqueue_copy(queue, pscal, pscal_buf)
+        queue.finish()
+
+    ## Run kernel avec image3d
+    queue.finish()
+    t = 0.
+    print "================= Direction is", sdir
+    for i in xrange(nb_run):
+        evt = prg.advection_img_v2(queue, tuple((nb,)), None,
+                                   grid_img, part_img,
+                                   dtype_real(0.), dtype_real(1.), dtype_integer(sdir), dtype_integer(nb),
+                                   np.array([1., 2., 3., 4.], dtype=dtype_real),
+                                   np.array([1., 1., 1., 1.], dtype=dtype_real),
+                                   np.array([nb, nb, nb, 0], dtype=dtype_integer),
+                                   np.array([1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb)], dtype=dtype_real)
+                                  )
+        queue.finish()
+        t += (evt.profile.end - evt.profile.start) * 1e-9
+        time_img = t / (nb_run * 1.)
+        times[sdir][1].append(time_img)
+        flops[sdir][1].append(1e-9 * (21 * nb ** grid_dim) / time_img)
+        bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_img)
+        print "img \t", time_img, ' \t',
+        cl.enqueue_copy(queue, part_rgba, part_img, origin=(0, 0), region=(nb, nb))
+        queue.finish()
+        ## Comparaison des résultats pour la position des particules (part.R)
+        if sdir == 0:
+            print np.max(np.abs(ppos - part_rgba[..., 0])),
+            #print part_rgba[..., 0]
+        elif sdir == 1:
+            print np.max(np.abs(ppos - part_rgba[..., 0].swapaxes(0, 1))),
+            #print part_rgba[..., 0].swapaxes(0, 1)
+        elif sdir == 2:
+            print np.max(np.abs(ppos - part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2))),
+            #print part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2)
+            ## Comparaison des résultats pour le scalaire des particules (part.A)
+            print np.max(np.abs(pscal - part_rgba[..., 3])),
+            print 'gain :', time_base / time_img
+    queue.finish()
+
+if __name__ == "__main__":
+    nb_run = 1
+    nb_kernel = 2
+    sizes = [ 128, 196]
+    t_sizes = np.array(sizes) ** 2
+    times = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
+    flops = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
+    bw = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
+    # main(0, nb_run)
+    # main(1, nb_run)
+    # main(2, nb_run)
+
+    for nb in sizes:
+        print "==== {0}**3 ====".format(nb)
+        for d in xrange(grid_dim):
+            print "-- sDIR = ", d
+            main(d, nb_run)
+
+    pl.figure(1)
+    for d in xrange(grid_dim):
+        pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d))
+        for d in xrange(grid_dim):
+            pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d))
+            pl.legend(("base d=0", "base d=1", "base d=2",
+                       "img d=0", "img d=1", "img d=2"), loc='center left')
+            #pl.axis([0, 13000000, 0, 0.08])
+            pl.xlabel("Nb particles")
+            pl.ylabel("time (s)")
+            pl.title("Advection times")
+            pl.xscale('log')
+
+    pl.figure(2)
+    for d in xrange(grid_dim):
+        pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d))
+        for d in xrange(grid_dim):
+            pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d))
+            pl.legend(("base d=0", "base d=1", "base d=2",
+                       "img d=0", "img d=1", "img d=2"), loc='center left')
+            #pl.axis([0, 13000000, 0, 5.0])
+            pl.xlabel("Nb particles")
+            pl.ylabel("GFlops")
+            pl.title("Advection flops")
+            pl.xscale('log')
+
+    pl.figure(3)
+    for d in xrange(grid_dim):
+        pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d))
+        for d in xrange(grid_dim):
+            pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d))
+            pl.legend(("base d=0", "base d=1", "base d=2",
+                       "img d=0", "img d=1", "img d=2"), loc='upper left')
+            pl.xlabel("Nb particles")
+            pl.ylabel("GB/s")
+            pl.title("Advection bw")
+            pl.xscale('log')
+
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+    pl.show()
+
diff --git a/parmepy/kernel_benchmark/test_advec_img2d_kernel.cl b/parmepy/kernel_benchmark/test_advec_img2d_kernel.cl
new file mode 100644
index 000000000..b0fcb70de
--- /dev/null
+++ b/parmepy/kernel_benchmark/test_advec_img2d_kernel.cl
@@ -0,0 +1,82 @@
+// Integration Kernel (RK2 method)
+__kernel void advection_img_v2(__read_only image2d_t grid,
+            __write_only image2d_t part,
+			__private const float t,
+			__private const float dt,
+			__private const uint dir,
+			__private const uint line_nb_pts,
+			__private const float4 min_v,
+			__private const float4 length_v,
+			__private const uint4 nb,
+			__private const float4 size
+			)
+{
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+  __private int2 coord;
+  __private int coord_t[2];
+  __private float4 part_px;
+  __private float4 grid_px;
+  __private float dx_dir, min_dir;
+  __private int nb_dir, indice;
+  __private float velo_dir;
+  __private size_t p;
+  __private float2 buffer_loc[MAX_LINE_POINTS]__attribute__((aligned)), buff;
+
+  if(dir == 0)
+    {
+      dx_dir = size.x;
+      nb_dir = nb.x;
+      min_dir = min_v.x;
+      coord_t[1] = get_global_id(0);
+      for(p=0;p<line_nb_pts;p++)
+	{
+	  //lecture vitesse
+	  coord = (int2)(coord_t[1],  p);
+	  grid_px = read_imagef(grid, sampler, coord); //Read 1 pixel (4floats) //TODO lire directement grid_px[dir]
+	  buffer_loc[p].x = grid_px.x;
+	  buffer_loc[p].y = grid_px.w;
+	}
+    }
+  else
+    {
+      dx_dir = size.y;
+      nb_dir = nb.y;
+      min_dir = min_v.y;
+      coord_t[1] = get_global_id(0);
+      for(p=0;p<line_nb_pts;p++)
+	{
+	  //lecture vitesse
+	  coord = (int2)(p, coord_t[0]);
+	  grid_px = read_imagef(grid, sampler, coord); //Read 1 pixel (4floats) //TODO lire directement grid_px[dir]
+	  buffer_loc[p].x = grid_px.y;
+	  buffer_loc[p].y = grid_px.w;
+	}
+    }
+
+  for(p=0;p<line_nb_pts;p++)
+    {
+      buff = buffer_loc[p];
+      //Lecture Vitesse
+      velo_dir = buff.x;
+      //calcul position
+      part_px.x = fma(velo_dir, 0.5f*dt, p*dx_dir); //4flop   OpenCL : fma(a,b,c) = c+a*b
+      //calcul indice left grid point
+      indice = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; //1flop
+      //calcul distance (particle to left grid point)
+      part_px.z = fma(-indice, dx_dir, part_px.x)/dx_dir; //4flop   OpenCL : fma(a,b,c) = c+a*b
+      //interpolation
+      velo_dir = mix(buffer_loc[indice].x, buffer_loc[(indice + 1) % nb_dir].x, part_px.z);// 3flop   OpenCL mix(x,y,a)=x+(y-x)*a
+      //calcul position
+      part_px.x = fma(velo_dir, dt, p*dx_dir); //3flop   OpenCL : fma(a,b,c) = c+a*b
+      //calcul indice left grid point
+      indice = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; //1flop
+      //calcul distance (particle to left grid point)
+      part_px.z = fma(-indice, dx_dir, part_px.x)/dx_dir; //4flop   OpenCL : fma(a,b,c) = c+a*b
+      //scalaire
+      part_px.w = buff.y;
+      part_px.x += min_dir; // 1flop
+      coord_t[dir] = p;
+      coord = (int2)( coord_t[1], coord_t[0]);
+      write_imagef(part, coord, part_px); //Write 1 pixel (4floats)
+    }
+}
-- 
GitLab