Skip to content
Snippets Groups Projects
Commit c4d249b1 authored by Christophe Picard's avatar Christophe Picard
Browse files

Test Img2d

parent 799eba8d
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pyopencl as cl
import numpy as np
import pylab as pl
import signal
nb = 8
grid_dim = 2
dtype_integer = np.uint32
dtype_real = np.float32
grid_shape = (nb, nb, 2)
grid_size = 1. / np.array(grid_shape[0:grid_dim], dtype=dtype_real)
times = None
flops = None
bw = None
#Get platform.
platform = cl.get_platforms()[0]
#Get device.
device = platform.get_devices(cl.device_type.GPU)[0]
print "Running on", device.name, "of", platform.name, "platform."
#Creates GPU Context
ctx = cl.Context([device])
#Create CommandQueue on the GPU Context
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
def main(sdir, nb_run):
print "sDir", sdir, " nbPart", nb, "**3"
# Creation des tableaux numpy
ppos = np.empty((nb, nb), dtype=dtype_real)
pscal = np.empty((nb, nb), dtype=dtype_real)
gvelo = np.empty((nb, nb, grid_dim), dtype=dtype_real)
## Initialisation aléatoire des vitesses et scalaires
gvelo = np.random.random_sample((nb, nb, grid_dim)).astype(dtype_real) / (nb * 10.)
gscal = np.random.random_sample((nb, nb)).astype(dtype_real)
# Création des buffer pour le kernel de base
ppos_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=ppos.nbytes)
pscal_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=pscal.nbytes)
gvelo_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=gvelo.nbytes)
gscal_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=gscal.nbytes)
# Création des images3d pour le nouveau kernel
## Image grid_img : R : vitesse.x
## G : vitesse.y
## B : vitesse.z
## A : scalaire
## Image part_img : R : position (dans la direction de splitting)
## G : -
## B : distance au point de grille de gauche
## A : scalaire
grid_rgba = np.zeros((nb, nb, 4), dtype=dtype_real)
grid_rgba[..., 0:2] = gvelo
grid_rgba[..., 3] = gscal
part_rgba = np.zeros((nb, nb, 4), dtype=dtype_real)
print gvelo.shape
print grid_rgba.shape
img_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
grid_img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_fmt, (nb, nb))
part_img = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, img_fmt, (nb, nb))
# Envoi des données pour les buffer (mémoire organisée pour un acces contigu)
offset = 0
cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 0].ravel(), device_offset=np.long(offset))
offset += gvelo[..., 0].nbytes
cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 1].swapaxes(1, 0).ravel(), device_offset=np.long(offset))
offset += gvelo[..., 1].nbytes
#cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 2].swapaxes(1, 0).swapaxes(2, 0).ravel(), device_offset=np.long(offset))
cl.enqueue_copy(queue, gscal_buf, gscal)
# Envoi des données sur l'image3d
cl.enqueue_copy(queue, grid_img, grid_rgba, origin=(0, 0), region=(nb, nb))
gpu_src = ""
f = open("./utils.h", 'r')
gpu_src += "".join(f.readlines())
f.close()
f = open("./test_advec_base_kernel.cl", 'r')
gpu_src += "".join(f.readlines())
f.close()
f = open("./test_advec_img2d_kernel.cl", 'r')
gpu_src += "".join(f.readlines())
f.close()
print cl.device_info
prg = cl.Program(ctx, gpu_src).build("-D DIM=" + str(grid_dim) + " -D MAX_LINE_POINTS=" + str(nb))
## Run du kernel de base (buffers) (résultat de référence)
queue.finish()
t = 0.
for i in xrange(nb_run):
evt = prg.advection(queue, tuple((nb,)), None,
gvelo_buf, ppos_buf,
dtype_real(0.), dtype_real(1.), dtype_integer(sdir), dtype_integer(nb),
np.array([1., 2., 3., 4.], dtype=dtype_real),
np.array([1., 1., 1., 1.], dtype=dtype_real),
np.array([nb, nb, nb, 0], dtype=dtype_integer),
np.array([1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb)], dtype=dtype_real)
)
queue.finish()
t += (evt.profile.end - evt.profile.start) * 1e-9
evt = cl.enqueue_copy(queue, pscal_buf, gscal_buf)
queue.finish()
t += (evt.profile.end - evt.profile.start) * 1e-9
time_base = t / (nb_run * 1.)
times[sdir][0].append(time_base)
flops[sdir][0].append(1e-9 * (20 * nb ** grid_dim) / time_base)
bw[sdir][0].append(1e-9 * (2 * 4 * nb ** grid_dim) / time_base)
print "base\t", time_base
cl.enqueue_copy(queue, ppos, ppos_buf)
cl.enqueue_copy(queue, pscal, pscal_buf)
queue.finish()
## Run kernel avec image3d
queue.finish()
t = 0.
print "================= Direction is", sdir
for i in xrange(nb_run):
evt = prg.advection_img_v2(queue, tuple((nb,)), None,
grid_img, part_img,
dtype_real(0.), dtype_real(1.), dtype_integer(sdir), dtype_integer(nb),
np.array([1., 2., 3., 4.], dtype=dtype_real),
np.array([1., 1., 1., 1.], dtype=dtype_real),
np.array([nb, nb, nb, 0], dtype=dtype_integer),
np.array([1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb), 1. / (1. * nb)], dtype=dtype_real)
)
queue.finish()
t += (evt.profile.end - evt.profile.start) * 1e-9
time_img = t / (nb_run * 1.)
times[sdir][1].append(time_img)
flops[sdir][1].append(1e-9 * (21 * nb ** grid_dim) / time_img)
bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_img)
print "img \t", time_img, ' \t',
cl.enqueue_copy(queue, part_rgba, part_img, origin=(0, 0), region=(nb, nb))
queue.finish()
## Comparaison des résultats pour la position des particules (part.R)
if sdir == 0:
print np.max(np.abs(ppos - part_rgba[..., 0])),
#print part_rgba[..., 0]
elif sdir == 1:
print np.max(np.abs(ppos - part_rgba[..., 0].swapaxes(0, 1))),
#print part_rgba[..., 0].swapaxes(0, 1)
elif sdir == 2:
print np.max(np.abs(ppos - part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2))),
#print part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2)
## Comparaison des résultats pour le scalaire des particules (part.A)
print np.max(np.abs(pscal - part_rgba[..., 3])),
print 'gain :', time_base / time_img
queue.finish()
if __name__ == "__main__":
nb_run = 1
nb_kernel = 2
sizes = [ 128, 196]
t_sizes = np.array(sizes) ** 2
times = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
flops = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
bw = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)]
# main(0, nb_run)
# main(1, nb_run)
# main(2, nb_run)
for nb in sizes:
print "==== {0}**3 ====".format(nb)
for d in xrange(grid_dim):
print "-- sDIR = ", d
main(d, nb_run)
pl.figure(1)
for d in xrange(grid_dim):
pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d))
for d in xrange(grid_dim):
pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d))
pl.legend(("base d=0", "base d=1", "base d=2",
"img d=0", "img d=1", "img d=2"), loc='center left')
#pl.axis([0, 13000000, 0, 0.08])
pl.xlabel("Nb particles")
pl.ylabel("time (s)")
pl.title("Advection times")
pl.xscale('log')
pl.figure(2)
for d in xrange(grid_dim):
pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d))
for d in xrange(grid_dim):
pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d))
pl.legend(("base d=0", "base d=1", "base d=2",
"img d=0", "img d=1", "img d=2"), loc='center left')
#pl.axis([0, 13000000, 0, 5.0])
pl.xlabel("Nb particles")
pl.ylabel("GFlops")
pl.title("Advection flops")
pl.xscale('log')
pl.figure(3)
for d in xrange(grid_dim):
pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d))
for d in xrange(grid_dim):
pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d))
pl.legend(("base d=0", "base d=1", "base d=2",
"img d=0", "img d=1", "img d=2"), loc='upper left')
pl.xlabel("Nb particles")
pl.ylabel("GB/s")
pl.title("Advection bw")
pl.xscale('log')
signal.signal(signal.SIGINT, signal.SIG_DFL)
pl.show()
// Integration Kernel (RK2 method)
__kernel void advection_img_v2(__read_only image2d_t grid,
__write_only image2d_t part,
__private const float t,
__private const float dt,
__private const uint dir,
__private const uint line_nb_pts,
__private const float4 min_v,
__private const float4 length_v,
__private const uint4 nb,
__private const float4 size
)
{
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__private int2 coord;
__private int coord_t[2];
__private float4 part_px;
__private float4 grid_px;
__private float dx_dir, min_dir;
__private int nb_dir, indice;
__private float velo_dir;
__private size_t p;
__private float2 buffer_loc[MAX_LINE_POINTS]__attribute__((aligned)), buff;
if(dir == 0)
{
dx_dir = size.x;
nb_dir = nb.x;
min_dir = min_v.x;
coord_t[1] = get_global_id(0);
for(p=0;p<line_nb_pts;p++)
{
//lecture vitesse
coord = (int2)(coord_t[1], p);
grid_px = read_imagef(grid, sampler, coord); //Read 1 pixel (4floats) //TODO lire directement grid_px[dir]
buffer_loc[p].x = grid_px.x;
buffer_loc[p].y = grid_px.w;
}
}
else
{
dx_dir = size.y;
nb_dir = nb.y;
min_dir = min_v.y;
coord_t[1] = get_global_id(0);
for(p=0;p<line_nb_pts;p++)
{
//lecture vitesse
coord = (int2)(p, coord_t[0]);
grid_px = read_imagef(grid, sampler, coord); //Read 1 pixel (4floats) //TODO lire directement grid_px[dir]
buffer_loc[p].x = grid_px.y;
buffer_loc[p].y = grid_px.w;
}
}
for(p=0;p<line_nb_pts;p++)
{
buff = buffer_loc[p];
//Lecture Vitesse
velo_dir = buff.x;
//calcul position
part_px.x = fma(velo_dir, 0.5f*dt, p*dx_dir); //4flop OpenCL : fma(a,b,c) = c+a*b
//calcul indice left grid point
indice = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; //1flop
//calcul distance (particle to left grid point)
part_px.z = fma(-indice, dx_dir, part_px.x)/dx_dir; //4flop OpenCL : fma(a,b,c) = c+a*b
//interpolation
velo_dir = mix(buffer_loc[indice].x, buffer_loc[(indice + 1) % nb_dir].x, part_px.z);// 3flop OpenCL mix(x,y,a)=x+(y-x)*a
//calcul position
part_px.x = fma(velo_dir, dt, p*dx_dir); //3flop OpenCL : fma(a,b,c) = c+a*b
//calcul indice left grid point
indice = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; //1flop
//calcul distance (particle to left grid point)
part_px.z = fma(-indice, dx_dir, part_px.x)/dx_dir; //4flop OpenCL : fma(a,b,c) = c+a*b
//scalaire
part_px.w = buff.y;
part_px.x += min_dir; // 1flop
coord_t[dir] = p;
coord = (int2)( coord_t[1], coord_t[0]);
write_imagef(part, coord, part_px); //Write 1 pixel (4floats)
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment