From b50214768269f2ffbdc736b9551b59d0a2cc1cef Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr> Date: Fri, 11 May 2012 08:49:54 +0000 Subject: [PATCH] kernel benchmark improvements --- parmepy/kernel_benchmark/test_advec.py | 23 ++-- parmepy/kernel_benchmark/test_remesh.py | 79 +++++++------ .../test_remesh_base_kernel.cl | 13 ++- .../test_remesh_img_kernel.cl | 105 +----------------- 4 files changed, 72 insertions(+), 148 deletions(-) diff --git a/parmepy/kernel_benchmark/test_advec.py b/parmepy/kernel_benchmark/test_advec.py index 778603dfe..3bb0d4894 100644 --- a/parmepy/kernel_benchmark/test_advec.py +++ b/parmepy/kernel_benchmark/test_advec.py @@ -163,38 +163,41 @@ if __name__ == "__main__": pl.figure(1) for d in xrange(grid_dim): - pl.plot(t_sizes, times[d][0], label="base d=" + str(d)) + pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d)) for d in xrange(grid_dim): - pl.plot(t_sizes, times[d][1], label="img d=" + str(d)) + pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d)) pl.legend(("base d=0", "base d=1", "base d=2", - "img d=0", "img d=1", "img d=2"), loc='center right') - pl.axis([0, 13000000, 0, 0.08]) + "img d=0", "img d=1", "img d=2"), loc='center left') + #pl.axis([0, 13000000, 0, 0.08]) pl.xlabel("Nb particles") pl.ylabel("time (s)") pl.title("Advection times") + pl.xscale('log') pl.figure(2) for d in xrange(grid_dim): - pl.plot(t_sizes, flops[d][0], label="base d=" + str(d)) + pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d)) for d in xrange(grid_dim): - pl.plot(t_sizes, flops[d][1], label="img d=" + str(d)) + pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d)) pl.legend(("base d=0", "base d=1", "base d=2", - "img d=0", "img d=1", "img d=2"), loc='center right') - pl.axis([0, 13000000, 0, 5.0]) + "img d=0", "img d=1", "img d=2"), loc='center left') + #pl.axis([0, 13000000, 0, 5.0]) pl.xlabel("Nb particles") pl.ylabel("GFlops") pl.title("Advection flops") + pl.xscale('log') pl.figure(3) for d in xrange(grid_dim): - pl.plot(t_sizes, bw[d][0], label="base d=" + str(d)) + pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d)) for d in xrange(grid_dim): - pl.plot(t_sizes, bw[d][1], label="img d=" + str(d)) + pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d)) pl.legend(("base d=0", "base d=1", "base d=2", "img d=0", "img d=1", "img d=2"), loc='upper left') pl.xlabel("Nb particles") pl.ylabel("GB/s") pl.title("Advection bw") + pl.xscale('log') signal.signal(signal.SIGINT, signal.SIG_DFL) pl.show() diff --git a/parmepy/kernel_benchmark/test_remesh.py b/parmepy/kernel_benchmark/test_remesh.py index 3299d3a08..360c17781 100644 --- a/parmepy/kernel_benchmark/test_remesh.py +++ b/parmepy/kernel_benchmark/test_remesh.py @@ -26,6 +26,7 @@ queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_EN def main(sdir, nb_run): print "sDir", sdir, " nbPart", nb, "**3" + # Creation des tableaux numpy grid_shape = (nb, nb, nb, 3) grid_size = 1. / np.array(grid_shape[0:grid_dim], dtype=dtype_real) ppos = np.empty((nb, nb, nb), dtype=dtype_real) @@ -41,13 +42,23 @@ def main(sdir, nb_run): pscal = np.random.random_sample((nb, nb, nb)).astype(dtype_real) gscal = np.empty_like(pscal).astype(dtype_real) - avance = np.random.random_sample((nb, nb, nb)).astype(dtype_real) + avance = np.random.random_sample((nb, nb, nb)).astype(dtype_real) * 0.9 ppos += grid_size[sdir] * avance + # Création des buffer pour le kernel de base ppos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=ppos.nbytes) pscal_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=pscal.nbytes) gscal_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=gscal.nbytes) + # Création des images3d pour le nouveau kernel + ## Image grid_img : R : vitesse.x + ## G : vitesse.y + ## B : vitesse.z + ## A : scalaire + ## Image part_img : R : position (dans la direction de splitting) + ## G : - + ## B : distance au point de grille de gauche + ## A : scalaire img_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT) grid_rgba = np.zeros((nb, nb, nb, 4), dtype=dtype_real) grid_img = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, img_fmt, shape=(nb, nb, nb)) @@ -57,8 +68,15 @@ def main(sdir, nb_run): part_rgba[..., 3] = pscal part_img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_fmt, shape=(nb, nb, nb)) - cl.enqueue_copy(queue, ppos_buf, ppos) + # Envoi des données pour les buffer (mémoire organisée pour un acces contigu) + if sdir == 0: + cl.enqueue_copy(queue, ppos_buf, ppos.ravel()) + elif sdir == 1: + cl.enqueue_copy(queue, ppos_buf, ppos.swapaxes(1, 0).ravel()) + else: + cl.enqueue_copy(queue, ppos_buf, ppos.swapaxes(1, 0).swapaxes(2, 0).ravel()) cl.enqueue_copy(queue, pscal_buf, pscal) + # Envoi des données sur l'image3d cl.enqueue_copy(queue, part_img, part_rgba, origin=(0, 0, 0), region=(nb, nb, nb)) queue.finish() @@ -74,6 +92,8 @@ def main(sdir, nb_run): f.close() prg = cl.Program(ctx, gpu_src).build("-D DIM=" + str(grid_dim) + " -D MAX_LINE_POINTS=" + str(nb)) + ## Run du kernel de base (buffers) (résultat de référence) + queue.finish() t = 0. for i in xrange(nb_run): evt = prg.remeshing(queue, (nb, nb), (1, nb), @@ -96,7 +116,7 @@ def main(sdir, nb_run): queue.finish() #print gscal - + ## Run kernel avec image3d t = 0. for i in xrange(nb_run): evt = prg.remeshing_img(queue, (nb, nb), (1, nb), @@ -109,26 +129,18 @@ def main(sdir, nb_run): ) queue.finish() t += (evt.profile.end - evt.profile.start) * 1e-9 - time_base = t / (nb_run * 1.) - times[sdir][1].append(time_base) - flops[sdir][1].append(1e-9 * (71 * nb ** grid_dim) / time_base) - bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_base) - print "img\t", time_base, + time_img = t / (nb_run * 1.) + times[sdir][1].append(time_img) + flops[sdir][1].append(1e-9 * (74 * nb ** grid_dim) / time_img) + bw[sdir][1].append(1e-9 * (8 * 4 * nb ** grid_dim) / time_img) + print "img\t", time_img, temp1 = np.empty_like(gscal) cl.enqueue_copy(queue, grid_rgba, grid_img, origin=(0, 0, 0), region=(nb, nb, nb)) queue.finish() - ## Comparaison des résultats pour la position des particules (grid.A) - if sdir == 0: - print np.max(np.abs(gscal - grid_rgba[..., 3])) - #print grid_rgba[..., 3] - elif sdir == 1: - print np.max(np.abs(gscal - grid_rgba[..., 3].swapaxes(0, 1))) - #print grid_rgba[..., 3].swapaxes(0, 1) - elif sdir == 2: - print np.max(np.abs(gscal - grid_rgba[..., 3].swapaxes(0, 1).swapaxes(0, 2))) - #print grid_rgba[..., 3].swapaxes(0, 1).swapaxes(0, 2) + ## Comparaison des résultats pour le scalaire sur la grille (grid.A) #print grid_rgba[..., 3] - print np.max(np.abs(gscal - grid_rgba[..., 3])) + print np.max(np.abs(gscal - grid_rgba[..., 3])), + print 'gain :', time_base / time_img if __name__ == "__main__": @@ -139,9 +151,9 @@ if __name__ == "__main__": times = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)] flops = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)] bw = [[[] for k in xrange(nb_kernel)] for d in xrange(grid_dim)] - #main(0, nb_run) - #main(1, nb_run) - #main(2, nb_run) + # main(0, nb_run) + # main(1, nb_run) + # main(2, nb_run) for nb in sizes: print "==== {0}**3 ====".format(nb) @@ -151,38 +163,41 @@ if __name__ == "__main__": pl.figure(1) for d in xrange(grid_dim): - pl.plot(t_sizes, times[d][0], label="base d=" + str(d)) + pl.plot(t_sizes, times[d][0], '-o', label="base d=" + str(d)) for d in xrange(grid_dim): - pl.plot(t_sizes, times[d][1], label="img d=" + str(d)) + pl.plot(t_sizes, times[d][1], '-o', label="img d=" + str(d)) pl.legend(("base d=0", "base d=1", "base d=2", - "img d=0", "img d=1", "img d=2"), loc='center right') - pl.axis([0, 13000000, 0, 0.2]) + "img d=0", "img d=1", "img d=2"), loc='center left') + #pl.axis([0, 13000000, 0, 0.2]) pl.xlabel("Nb particles") pl.ylabel("time (s)") pl.title("Remeshing times") + pl.xscale('log') pl.figure(2) for d in xrange(grid_dim): - pl.plot(t_sizes, flops[d][0], label="base d=" + str(d)) + pl.plot(t_sizes, flops[d][0], '-o', label="base d=" + str(d)) for d in xrange(grid_dim): - pl.plot(t_sizes, flops[d][1], label="img d=" + str(d)) + pl.plot(t_sizes, flops[d][1], '-o', label="img d=" + str(d)) pl.legend(("base d=0", "base d=1", "base d=2", - "img d=0", "img d=1", "img d=2"), loc='center right') - pl.axis([0, 13000000, 0, 20]) + "img d=0", "img d=1", "img d=2"), loc='center left') + #pl.axis([0, 13000000, 0, 20]) pl.xlabel("Nb particles") pl.ylabel("GFlops") pl.title("Remeshing flops") + pl.xscale('log') pl.figure(3) for d in xrange(grid_dim): - pl.plot(t_sizes, bw[d][0], label="base d=" + str(d)) + pl.plot(t_sizes, bw[d][0], '-o', label="base d=" + str(d)) for d in xrange(grid_dim): - pl.plot(t_sizes, bw[d][1], label="img d=" + str(d)) + pl.plot(t_sizes, bw[d][1], '-o', label="img d=" + str(d)) pl.legend(("base d=0", "base d=1", "base d=2", "img d=0", "img d=1", "img d=2"), loc='upper left') pl.xlabel("Nb particles") pl.ylabel("GB/s") pl.title("Remeshing bw") + pl.xscale('log') signal.signal(signal.SIGINT, signal.SIG_DFL) pl.show() diff --git a/parmepy/kernel_benchmark/test_remesh_base_kernel.cl b/parmepy/kernel_benchmark/test_remesh_base_kernel.cl index 8d8d46edf..0d0bf49d0 100644 --- a/parmepy/kernel_benchmark/test_remesh_base_kernel.cl +++ b/parmepy/kernel_benchmark/test_remesh_base_kernel.cl @@ -20,7 +20,7 @@ __kernel void remeshing(__global const float* part, __private uint ind_line, ind_line_scal; // Line begin buffer index __private int ind_c; // Remeshing grid point __private float x0; // Remeshing distance from nearest left grid point - __private float dx_dir; // Space step along line + __private float dx_dir, min_dir; // Space step along line __private float ppos_c; // Particle position temp along line __private float pscal_c; // Particle scalar along line __private float weights[6]; // Remeshing weights @@ -34,18 +34,21 @@ __kernel void remeshing(__global const float* part, kernel_init_3_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0)*nb.z+get_global_id(1); stride_m = nb.y*nb.z; + min_dir = min_v.x; } else if (dir == 1) { kernel_init_3_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0)*nb.z+get_global_id(1); stride_m = nb.x*nb.z; + min_dir = min_v.y; } else { kernel_init_3_2(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0)*nb.y+get_global_id(1); stride_m = nb.x*nb.y; + min_dir = min_v.z; }} else{ if (dir == 0) @@ -53,12 +56,14 @@ __kernel void remeshing(__global const float* part, kernel_init_2_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0); stride_m = nb.y; + min_dir = min_v.x; } else { kernel_init_2_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0); stride_m = nb.x; + min_dir = min_v.y; } } @@ -73,7 +78,7 @@ __kernel void remeshing(__global const float* part, pscal_c = pscal[ind_line_scal + p*stride_along_dir_scal]; // Read : 1float // Compute nearest left grid point // Compute nearest left grid point distance - x0 = remquo_rtn(ppos_c - min_v[dir], dx_dir, &ind_c)/dx_dir; //5flop + x0 = remquo_rtn(ppos_c - min_dir, dx_dir, &ind_c)/dx_dir; //5flop // Compute remeshing weights weights[0] = x0 * (x0 * (x0 * (x0 * (13.0f - 5.0f * x0) - 9.0f) - 1.0f) + 2.0f) / 24.0f; // 10flop weights[1] = x0 * (x0 * (x0 * (x0 * (25.0f * x0 - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f; //10flop @@ -94,14 +99,10 @@ __kernel void remeshing(__global const float* part, gscal_buffer_loc[i] += weights[4] * pscal_c; // 2flop i = (i+1)%line_nb_pts; gscal_buffer_loc[i] += weights[5] * pscal_c; // 2flop - //debug - //gscal_buffer_loc[p] = ppos_c; } i = ind_line_scal; for(p=0;p<line_nb_pts;p++) gscal[i + p*stride_along_dir_scal] = gscal_buffer_loc[p]; // Write : 1float - //debug - //gscal[i + p*stride_along_dir_scal] = i + p*stride_along_dir_scal; } diff --git a/parmepy/kernel_benchmark/test_remesh_img_kernel.cl b/parmepy/kernel_benchmark/test_remesh_img_kernel.cl index 2c99ebf88..242f45ca6 100644 --- a/parmepy/kernel_benchmark/test_remesh_img_kernel.cl +++ b/parmepy/kernel_benchmark/test_remesh_img_kernel.cl @@ -15,7 +15,7 @@ __kernel void remeshing_img(__read_only image3d_t part, { const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; __private uint p, i; - __private float buffer_loc[MAX_LINE_POINTS]; + __private float buffer_loc[MAX_LINE_POINTS]__attribute__((aligned)); __private float4 part_px, grid_px; __private int4 coord; __private int coord_t[3], ind, nb_dir; @@ -53,10 +53,9 @@ __kernel void remeshing_img(__read_only image3d_t part, coord = (int4)(coord_t[2], coord_t[1], coord_t[0], 0); part_px = read_imagef(part, sampler, coord); // Read 1 pixel (4float) // Compute remeshing weights - x0 = part_px.z; - //ind = (convert_int_rtn(part_px.x/dx_dir) + nb_dir) % nb_dir; ind = convert_int_rtn(part_px.x/dx_dir); // 1flop - //x0 = (part_px.x - ind * dx_dir)/dx_dir; + //x0 = part_px.z; // lecture de la distance si déja calculée par advection + x0 = (part_px.x - ind * dx_dir)/dx_dir; // 3flop weights[0] = x0 * (x0 * (x0 * (x0 * (13.0f - 5.0f * x0) - 9.0f) - 1.0f) + 2.0f) / 24.0f; // 10flop weights[1] = x0 * (x0 * (x0 * (x0 * (25.0f * x0 - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f; //10flop weights[2] = x0 * x0 * (x0 * (x0 * (126.0f - 50.0f * x0) - 70.0f) - 30.0f) / 24.0f + 1.0f; //10flop @@ -76,111 +75,17 @@ __kernel void remeshing_img(__read_only image3d_t part, buffer_loc[i] += weights[4] * part_px.w; // 2flop i = (i+1)%line_nb_pts; buffer_loc[i] += weights[5] * part_px.w; // 2flop - //debug - //buffer_loc[p] = part_px.x; } - for(p=0;p<line_nb_pts;p++) { - //read particle informations + //write pixel coord_t[dir] = p; coord = (int4)(coord_t[2], coord_t[1], coord_t[0], 0); + // /!\ : grid_px.xyz correspond à la vitesse sur la grille qui est écrasée !! grid_px = (float4)(0.0f, 0.0f, 0.0f, buffer_loc[p]); - //debug - //grid_px.w = get_global_id(1) + get_global_id(0) * nb.z * nb.y + p*nb.z; write_imagef(grid, coord, grid_px); // write 1 pixel (4float) - } - - - - /* - __private uint i; // DIM loop index - __private uint p; // Particle line loop index - __private uint stride_along_dir, stride_along_dir_scal; // Buffer stride between two particle of the same line - __private uint ind_line, ind_line_scal; // Line begin buffer index - __private int ind_c; // Remeshing grid point - __private float x0; // Remeshing distance from nearest left grid point - __private float dx_dir; // Space step along line - __private float ppos_c; // Particle position temp along line - __private float pscal_c; // Particle scalar along line - __private float weights[6]; // Remeshing weights - __private float gscal_buffer_loc[MAX_LINE_POINTS]; // Local buffer to store gscal for the current line - __private uint ind_m, stride_m; - // Compute strides and index. Depend on array order (C order here) - dx_dir = size[dir]; - if(DIM == 3){ - if (dir == 0) - { - kernel_init_3_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0)*nb.z+get_global_id(1); - stride_m = nb.y*nb.z; - } - else if (dir == 1) - { - kernel_init_3_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0)*nb.z+get_global_id(1); - stride_m = nb.x*nb.z; } - else - { - kernel_init_3_2(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0)*nb.y+get_global_id(1); - stride_m = nb.x*nb.y; - }} - else{ - if (dir == 0) - { - kernel_init_2_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0); - stride_m = nb.y; - } - else - { - kernel_init_2_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0); - stride_m = nb.x; - } - } - - // Initialise grid scalar - for(p=0;p<line_nb_pts;p++) - gscal_buffer_loc[p] = 0.0f; - for(p=0;p<line_nb_pts;p++) - { - // read particle position - ppos_c = part[ind_m + p*stride_m]; // Read : 1float (4bytes) - // read particle scalar - pscal_c = pscal[ind_line_scal + p*stride_along_dir_scal]; // Read : 1float - // Compute nearest left grid point - // Compute nearest left grid point distance - x0 = remquo_rtn(ppos_c - min_v[dir], dx_dir, &ind_c)/dx_dir; //5flop - // Compute remeshing weights - weights[0] = x0 * (x0 * (x0 * (x0 * (13.0f - 5.0f * x0) - 9.0f) - 1.0f) + 2.0f) / 24.0f; // 10flop - weights[1] = x0 * (x0 * (x0 * (x0 * (25.0f * x0 - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f; //10flop - weights[2] = x0 * x0 * (x0 * (x0 * (126.0f - 50.0f * x0) - 70.0f) - 30.0f) / 24.0f + 1.0f; //10flop - weights[3] = x0 * (x0 * (x0 * (x0 * (50.0f * x0 - 124.0f) + 66.0f) + 16.0f) + 16.0f) / 24.0f; //10flop - weights[4] = x0 * (x0 * (x0 * (x0 * (61.0f - 25.0f * x0) - 33.0f) - 1.0f) - 2.0f) / 24.0f; //10flop - weights[5] = x0 * x0 * x0 * (x0 * (5.0f * x0 - 12.0f) + 7.0f) / 24.0f; // 8flop - // Apply remeshing weights - i = (ind_c-2+line_nb_pts)%line_nb_pts; - gscal_buffer_loc[i] += weights[0] * pscal_c; // 2flop - i = (i+1)%line_nb_pts; - gscal_buffer_loc[i] += weights[1] * pscal_c; // 2flop - i = (i+1)%line_nb_pts; - gscal_buffer_loc[i] += weights[2] * pscal_c; // 2flop - i = (i+1)%line_nb_pts; - gscal_buffer_loc[i] += weights[3] * pscal_c; // 2flop - i = (i+1)%line_nb_pts; - gscal_buffer_loc[i] += weights[4] * pscal_c; // 2flop - i = (i+1)%line_nb_pts; - gscal_buffer_loc[i] += weights[5] * pscal_c; // 2flop - } - i = ind_line_scal; - for(p=0;p<line_nb_pts;p++) - gscal[i + p*stride_along_dir_scal] = p; - //gscal[i + p*stride_along_dir_scal] = gscal_buffer_loc[p]; // Write : 1float - */ } -- GitLab