From c4d249b1bc22cd5c4ce6716677ebaaf064d37d56 Mon Sep 17 00:00:00 2001 From: Christophe Picard <Christophe.Picard@imag.fr> Date: Wed, 16 May 2012 13:46:04 +0000 Subject: [PATCH] Test Img2d --- parmepy/kernel_benchmark/test_advec2d.py | 211 ++++++++++++++++++ .../test_advec_img2d_kernel.cl | 82 +++++++ 2 files changed, 293 insertions(+) create mode 100755 parmepy/kernel_benchmark/test_advec2d.py create mode 100644 parmepy/kernel_benchmark/test_advec_img2d_kernel.cl diff --git a/parmepy/kernel_benchmark/test_advec2d.py b/parmepy/kernel_benchmark/test_advec2d.py new file mode 100755 index 000000000..64a5902c2 --- /dev/null +++ b/parmepy/kernel_benchmark/test_advec2d.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pyopencl as cl +import numpy as np +import pylab as pl +import signal + +nb = 8 +grid_dim = 2 +dtype_integer = np.uint32 +dtype_real = np.float32 +grid_shape = (nb, nb, 2) +grid_size = 1. / np.array(grid_shape[0:grid_dim], dtype=dtype_real) +times = None +flops = None +bw = None + +#Get platform. +platform = cl.get_platforms()[0] +#Get device. +device = platform.get_devices(cl.device_type.GPU)[0] +print "Running on", device.name, "of", platform.name, "platform." +#Creates GPU Context +ctx = cl.Context([device]) +#Create CommandQueue on the GPU Context +queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + + +def main(sdir, nb_run): + print "sDir", sdir, " nbPart", nb, "**3" + # Creation des tableaux numpy + ppos = np.empty((nb, nb), dtype=dtype_real) + pscal = np.empty((nb, nb), dtype=dtype_real) + gvelo = np.empty((nb, nb, grid_dim), dtype=dtype_real) + ## Initialisation aléatoire des vitesses et scalaires + gvelo = np.random.random_sample((nb, nb, grid_dim)).astype(dtype_real) / (nb * 10.) + gscal = np.random.random_sample((nb, nb)).astype(dtype_real) + + # Création des buffer pour le kernel de base + ppos_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=ppos.nbytes) + pscal_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=pscal.nbytes) + gvelo_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=gvelo.nbytes) + gscal_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=gscal.nbytes) + + # Création des images3d pour le nouveau kernel + ## Image grid_img : R : vitesse.x + ## G : vitesse.y + ## B : vitesse.z + ## A : scalaire + ## Image part_img : R : position (dans la direction de splitting) + ## G : - + ## B : distance au point de grille de gauche + ## A : scalaire + grid_rgba = np.zeros((nb, nb, 4), dtype=dtype_real) + grid_rgba[..., 0:2] = gvelo + grid_rgba[..., 3] = gscal + part_rgba = np.zeros((nb, nb, 4), dtype=dtype_real) + + print gvelo.shape + print grid_rgba.shape + img_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT) + grid_img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_fmt, (nb, nb)) + part_img = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, img_fmt, (nb, nb)) + + # Envoi des données pour les buffer (mémoire organisée pour un acces contigu) + offset = 0 + cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 0].ravel(), device_offset=np.long(offset)) + offset += gvelo[..., 0].nbytes + cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 1].swapaxes(1, 0).ravel(), device_offset=np.long(offset)) + offset += gvelo[..., 1].nbytes + #cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 2].swapaxes(1, 0).swapaxes(2, 0).ravel(), device_offset=np.long(offset)) + cl.enqueue_copy(queue, gscal_buf, gscal) + # Envoi des données sur l'image3d + cl.enqueue_copy(queue, grid_img, grid_rgba, origin=(0, 0), region=(nb, nb)) + + gpu_src = "" + f = open("./utils.h", 'r') + gpu_src += "".join(f.readlines()) + f.close() + f = open("./test_advec_base_kernel.cl", 'r') + gpu_src += "".join(f.readlines()) + f.close() + f = open("./test_advec_img2d_kernel.cl", 'r') + gpu_src += "".join(f.readlines()) + f.close() + print cl.device_info + prg = cl.Program(ctx, gpu_src).build("-D DIM=" + str(grid_dim) + " -D MAX_LINE_POINTS=" + str(nb)) + + ## Run du kernel de base (buffers) (résultat de référence) + queue.finish() + t = 0. + for i in xrange(nb_run): + evt = prg.advection(queue, tuple((nb,)), None, + gvelo_buf, ppos_buf, + dtype_real(0.), dtype_real(1.), dtype_integer(sdir), dtype_integer(nb), + np.array([1., 2., 3., 4.], dtype=dtype_real), + np.array([1., 1., 1., 1.], dtype=dtype_real), + np.array([nb, nb, nb, 0], dtype=dtype_integer), + np.array([1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb)], dtype=dtype_real) + ) + queue.finish() + t += (evt.profile.end - evt.profile.start) * 1e-9 + evt = cl.enqueue_copy(queue, pscal_buf, gscal_buf) + queue.finish() + t += (evt.profile.end - evt.profile.start) * 1e-9 + time_base = t / (nb_run * 1.) + times[sdir][0].append(time_base) + flops[sdir][0].append(1e-9 * (20 * nb ** grid_dim) / time_base) + bw[sdir][0].append(1e-9 * (2 * 4 * nb ** grid_dim) / time_base) + print "base\t", time_base + cl.enqueue_copy(queue, ppos, ppos_buf) + cl.enqueue_copy(queue, pscal, pscal_buf) + queue.finish() + + ## Run kernel avec image3d + queue.finish() + t = 0. + print "================= Direction is", sdir + for i in xrange(nb_run): + evt = prg.advection_img_v2(queue, tuple((nb,)), None, + grid_img, part_img, + dtype_real(0.), dtype_real(1.), dtype_integer(sdir), dtype_integer(nb), + np.array([1., 2., 3., 4.], dtype=dtype_real), + np.array([1., 1., 1., 1.], dtype=dtype_real), + np.array([nb, nb, nb, 0], dtype=dtype_integer), + np.array([1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb)], dtype=dtype_real) + ) + queue.finish() + t += (evt.profile.end - evt.profile.start) * 1e-9 + time_img = t / (nb_run * 1.) + times[sdir][1].append(time_img) + flops[sdir][1].append(1e-9 * (21 * nb ** grid_dim) / time_img) + bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_img) + print "img \t", time_img, ' \t', + cl.enqueue_copy(queue, part_rgba, part_img, origin=(0, 0), region=(nb, nb)) + queue.finish() + ## Comparaison des résultats pour la position des particules (part.R) + if sdir == 0: + print np.max(np.abs(ppos - part_rgba[..., 0])), + #print part_rgba[..., 0] + elif sdir == 1: + print np.max(np.abs(ppos - part_rgba[..., 0].swapaxes(0, 1))), + #print part_rgba[..., 0].swapaxes(0, 1) + elif sdir == 2: + print np.max(np.abs(ppos - part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2))), + #print part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2) + ## Comparaison des résultats pour le scalaire des particules (part.A) + print np.max(np.abs(pscal - part_rgba[..., 3])), + print 'gain :', time_base / time_img + queue.finish() + +if __name__ == "__main__": + nb_run = 1 + nb_kernel = 2 + sizes = [ 128, 196] + t_sizes = np.array(sizes) ** 2 + times = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)] + flops = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)] + bw = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)] + # main(0, nb_run) + # main(1, nb_run) + # main(2, nb_run) + + for nb in sizes: + print "==== {0}**3 ====".format(nb) + for d in xrange(grid_dim): + print "-- sDIR = ", d + main(d, nb_run) + + pl.figure(1) + for d in xrange(grid_dim): + pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d)) + for d in xrange(grid_dim): + pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d)) + pl.legend(("base d=0", "base d=1", "base d=2", + "img d=0", "img d=1", "img d=2"), loc='center left') + #pl.axis([0, 13000000, 0, 0.08]) + pl.xlabel("Nb particles") + pl.ylabel("time (s)") + pl.title("Advection times") + pl.xscale('log') + + pl.figure(2) + for d in xrange(grid_dim): + pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d)) + for d in xrange(grid_dim): + pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d)) + pl.legend(("base d=0", "base d=1", "base d=2", + "img d=0", "img d=1", "img d=2"), loc='center left') + #pl.axis([0, 13000000, 0, 5.0]) + pl.xlabel("Nb particles") + pl.ylabel("GFlops") + pl.title("Advection flops") + pl.xscale('log') + + pl.figure(3) + for d in xrange(grid_dim): + pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d)) + for d in xrange(grid_dim): + pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d)) + pl.legend(("base d=0", "base d=1", "base d=2", + "img d=0", "img d=1", "img d=2"), loc='upper left') + pl.xlabel("Nb particles") + pl.ylabel("GB/s") + pl.title("Advection bw") + pl.xscale('log') + + signal.signal(signal.SIGINT, signal.SIG_DFL) + pl.show() + diff --git a/parmepy/kernel_benchmark/test_advec_img2d_kernel.cl b/parmepy/kernel_benchmark/test_advec_img2d_kernel.cl new file mode 100644 index 000000000..b0fcb70de --- /dev/null +++ b/parmepy/kernel_benchmark/test_advec_img2d_kernel.cl @@ -0,0 +1,82 @@ +// Integration Kernel (RK2 method) +__kernel void advection_img_v2(__read_only image2d_t grid, + __write_only image2d_t part, + __private const float t, + __private const float dt, + __private const uint dir, + __private const uint line_nb_pts, + __private const float4 min_v, + __private const float4 length_v, + __private const uint4 nb, + __private const float4 size + ) +{ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + __private int2 coord; + __private int coord_t[2]; + __private float4 part_px; + __private float4 grid_px; + __private float dx_dir, min_dir; + __private int nb_dir, indice; + __private float velo_dir; + __private size_t p; + __private float2 buffer_loc[MAX_LINE_POINTS]__attribute__((aligned)), buff; + + if(dir == 0) + { + dx_dir = size.x; + nb_dir = nb.x; + min_dir = min_v.x; + coord_t[1] = get_global_id(0); + for(p=0;p<line_nb_pts;p++) + { + //lecture vitesse + coord = (int2)(coord_t[1], p); + grid_px = read_imagef(grid, sampler, coord); //Read 1 pixel (4floats) //TODO lire directement grid_px[dir] + buffer_loc[p].x = grid_px.x; + buffer_loc[p].y = grid_px.w; + } + } + else + { + dx_dir = size.y; + nb_dir = nb.y; + min_dir = min_v.y; + coord_t[1] = get_global_id(0); + for(p=0;p<line_nb_pts;p++) + { + //lecture vitesse + coord = (int2)(p, coord_t[0]); + grid_px = read_imagef(grid, sampler, coord); //Read 1 pixel (4floats) //TODO lire directement grid_px[dir] + buffer_loc[p].x = grid_px.y; + buffer_loc[p].y = grid_px.w; + } + } + + for(p=0;p<line_nb_pts;p++) + { + buff = buffer_loc[p]; + //Lecture Vitesse + velo_dir = buff.x; + //calcul position + part_px.x = fma(velo_dir, 0.5f*dt, p*dx_dir); //4flop OpenCL : fma(a,b,c) = c+a*b + //calcul indice left grid point + indice = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; //1flop + //calcul distance (particle to left grid point) + part_px.z = fma(-indice, dx_dir, part_px.x)/dx_dir; //4flop OpenCL : fma(a,b,c) = c+a*b + //interpolation + velo_dir = mix(buffer_loc[indice].x, buffer_loc[(indice + 1) % nb_dir].x, part_px.z);// 3flop OpenCL mix(x,y,a)=x+(y-x)*a + //calcul position + part_px.x = fma(velo_dir, dt, p*dx_dir); //3flop OpenCL : fma(a,b,c) = c+a*b + //calcul indice left grid point + indice = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; //1flop + //calcul distance (particle to left grid point) + part_px.z = fma(-indice, dx_dir, part_px.x)/dx_dir; //4flop OpenCL : fma(a,b,c) = c+a*b + //scalaire + part_px.w = buff.y; + part_px.x += min_dir; // 1flop + coord_t[dir] = p; + coord = (int2)( coord_t[1], coord_t[0]); + write_imagef(part, coord, part_px); //Write 1 pixel (4floats) + } +} -- GitLab