From 05c0f5e2e835abedbd4e2b2b650adbcad83155de Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@imag.fr> Date: Wed, 14 Aug 2013 16:50:53 +0000 Subject: [PATCH] GPU configuration improvements --- HySoP/CMake/ParmesTests.cmake | 3 +- HySoP/CMakeLists.txt | 3 +- HySoP/hysop/fields/continuous.py | 2 +- HySoP/hysop/gpu/cl_src/advection/basic.cl | 56 +++ .../hysop/gpu/cl_src/advection/basic_noVec.cl | 51 +++ HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl | 93 +++++ .../gpu/cl_src/advection/basic_rk4_noVec.cl | 82 +++++ HySoP/hysop/gpu/cl_src/advection/builtin.cl | 27 +- .../gpu/cl_src/advection/builtin_noVec.cl | 40 ++- .../hysop/gpu/cl_src/advection/builtin_rk4.cl | 87 +++++ .../gpu/cl_src/advection/builtin_rk4_noVec.cl | 78 +++++ HySoP/hysop/gpu/cl_src/common.cl | 6 +- .../kernels/advection_and_remeshing_noVec.cl | 9 +- .../gpu/cl_src/kernels/advection_noVec.cl | 14 +- HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl | 2 +- .../gpu/cl_src/kernels/remeshing_noVec.cl | 66 ++++ .../hysop/gpu/cl_src/kernels/transpose_xy.cl | 14 +- .../gpu/cl_src/kernels/transpose_xy_noVec.cl | 79 +++++ .../hysop/gpu/cl_src/kernels/transpose_xz.cl | 25 +- .../gpu/cl_src/kernels/transpose_xz_noVec.cl | 77 +++++ .../gpu/cl_src/kernels/transpose_xz_slice.cl | 74 ++++ .../kernels/transpose_xz_slice_noVec.cl | 70 ++++ .../hysop/gpu/cl_src/remeshing/basic_noVec.cl | 2 + .../gpu/cl_src/remeshing/private_noVec.cl | 95 ++++++ HySoP/hysop/gpu/cl_src/remeshing/weights.cl | 2 +- .../gpu/cl_src/remeshing/weights_builtin.cl | 2 +- .../gpu/cl_src/remeshing/weights_noVec.cl | 2 +- .../cl_src/remeshing/weights_noVec_builtin.cl | 2 +- HySoP/hysop/gpu/config_cayman.py | 155 +++++++++ HySoP/hysop/gpu/config_k20m.py | 161 +++++++++ HySoP/hysop/gpu/gpu_discrete.py | 14 +- HySoP/hysop/gpu/gpu_particle_advection.py | 117 ++----- HySoP/hysop/gpu/gpu_particle_advection_1k.py | 37 +- HySoP/hysop/gpu/gpu_particle_advection_2k.py | 87 ++--- HySoP/hysop/gpu/kernel_benchmark.py | 320 ++++++++---------- HySoP/hysop/gpu/tools.py | 86 ++--- HySoP/hysop/numerics/tests/__init__.py | 0 HySoP/hysop/tools/timers.py | 46 ++- HySoP/setup.py.in | 1 + 39 files changed, 1602 insertions(+), 485 deletions(-) create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic.cl create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl create mode 100644 HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl create mode 100644 HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl create mode 100644 HySoP/hysop/gpu/config_cayman.py create mode 100644 HySoP/hysop/gpu/config_k20m.py create mode 100644 HySoP/hysop/numerics/tests/__init__.py diff --git a/HySoP/CMake/ParmesTests.cmake b/HySoP/CMake/ParmesTests.cmake index af9db544a..87d3f8874 100644 --- a/HySoP/CMake/ParmesTests.cmake +++ b/HySoP/CMake/ParmesTests.cmake @@ -9,6 +9,7 @@ set(py_src_dirs operator problem tools + numerics ) if(USE_MPI) @@ -61,7 +62,7 @@ endforeach() ## that names are not __init__ or test_ and that contains '>>>' 
set(py_doctest_files) foreach(testdir ${py_src_dirs}) - file(GLOB testfiles parmepy/${testdir}/*.py) + file(GLOB testfiles parmepy/${testdir}/[a-zA-Z]*.py) foreach(testfile ${testfiles}) file(STRINGS ${testfile} test_doctest REGEX ">>>") if(NOT "${test_doctest}" STREQUAL "") diff --git a/HySoP/CMakeLists.txt b/HySoP/CMakeLists.txt index 6d3af4846..045fd87e2 100644 --- a/HySoP/CMakeLists.txt +++ b/HySoP/CMakeLists.txt @@ -56,7 +56,7 @@ endif() # 3 cmake -DCMAKE_INSTALL_PREFIX = some_path ==> install in some_path # Note that case 2 and 3 require a proper set of PYTHONPATH environment var for parmepy to work. if(USER_INSTALL) - # Unset CMAKE_INSTALL_PREFIX to allow user to reset it and to use python default (--user) rather than cmake default. + # Unset CMAKE_INSTALL_PREFIX to allow user to reset it and to use python default (--user) rather than cmake default. set(CMAKE_INSTALL_PREFIX "" CACHE PATH "install path") endif() @@ -131,6 +131,7 @@ string(STRIP ${${PROJECT_NAME}_PYTHON_BUILD_DIR} ${PROJECT_NAME}_PYTHON_BUILD_DI # --- OpenCL --- if(WITH_GPU) find_python_module(pyopencl REQUIRED) + find_python_module(sympy REQUIRED) endif() # --- MPI --- diff --git a/HySoP/hysop/fields/continuous.py b/HySoP/hysop/fields/continuous.py index c1ffa113a..e3913378a 100644 --- a/HySoP/hysop/fields/continuous.py +++ b/HySoP/hysop/fields/continuous.py @@ -150,7 +150,7 @@ class Field(object): s += 'scalar field ' s += 'with the following (local) discretisations :\n' for f in self.discreteFields.values(): - s += f.__str__(True) + s += f.__str__() return s def norm(self, topo=None): diff --git a/HySoP/hysop/gpu/cl_src/advection/basic.cl b/HySoP/hysop/gpu/cl_src/advection/basic.cl new file mode 100644 index 000000000..2bc92e4f0 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/basic.cl @@ -0,0 +1,56 @@ +/** + * @file basic.cl + * Advection function, vectorized version, no use of builtins functions. + */ + +float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param gvelo_loc Local velocity cache. + * @return Particle position. + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
+ * @see parmepy.gpu.tools.parse_file + */ +float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) +{ + float__N__ v, /* Velocity at point */ + vp, /* Velocity at right point */ + p, /* Intermediary position */ + c, /* initial coordinate */ + hdt = (float__N__)(0.5*dt); /* half time step */ + int__N__ i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + + c = (float__N__)((i+__NN__)*dx, + ); + + v = (float__N__)(gvelo_loc[noBC_id(i+__NN__)], + ); + + p = (c + hdt * v) * invdx; + + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + v = (p*(vp-v) + v); + + return c + dt * v; +} diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl new file mode 100644 index 000000000..2be4a657f --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/basic_noVec.cl @@ -0,0 +1,51 @@ +/** + * @file basic_noVec.cl + * Advection function, basic version + */ + +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param gvelo Global velocity field. + * @return Particle position + * + * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. + */ +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) +{ + float v, /* Velocity at point */ + vp, /* Velocity at right point */ + p, /* Normalized intermediary position */ + c = i * dx, /* initial coordinate */ + hdt = 0.5 * dt; /* half time step */ + int i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + + v = gvelo_loc[noBC_id(i)]; + + p = (c + hdt*v) * invdx; + + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + + v = gvelo_loc[noBC_id(i_ind)]; + vp = gvelo_loc[noBC_id(i_ind_p)]; + v = (p*(vp-v) + v); + + return c + dt * v; +} +/* Operations number : */ +/* - 2 positions = 2 * 2 */ +/* - 1 iterpolation = 9 */ +/* Total = 13 */ diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl new file mode 100644 index 000000000..90b7592d5 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl @@ -0,0 +1,93 @@ +/** + * @file basic_rk4.cl + * Advection function (RK4 scheme), vectorized version, no use of builtins functions. + */ + +float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param gvelo_loc Local velocity cache. + * @return Particle position. + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. 
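For reference, the RK2 step implemented by basic.cl and basic_noVec.cl above reduces to the following plain-Python sketch (outside the patch) of one periodic 1D line; the names (velocity for the cached velocity line, dx for the grid spacing) are illustrative:

import math

def advect_rk2(i, dt, dx, velocity):
    """Return the new position of the particle initially sitting on grid point i."""
    nb = len(velocity)
    c = i * dx                                   # initial coordinate
    p = (c + 0.5 * dt * velocity[i]) / dx        # midpoint position, in grid units
    left = int(math.floor(p))                    # equivalent of convert_int_rtn
    y = p - left                                 # normalized distance to left point
    v_mid = ((1.0 - y) * velocity[left % nb]     # linear interpolation, as mix()
             + y * velocity[(left + 1) % nb])    # periodic wrap of both neighbours
    return c + dt * v_mid                        # full step with the midpoint velocity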
+ * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. + * @see parmepy.gpu.tools.parse_file + */ +float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) +{ + float__N__ v, /* Velocity at point */ + vp, /* Velocity at right point */ + p, /* Intermediary position */ + k, /* rk averaged velocity */ + kn, /* rk intermediate velocity */ + c, /* initial coordinate */ + hdt = (float__N__)(0.5*dt); /* half time step */ + int__N__ i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + + c = (float__N__)((i+__NN__)*dx, + ); + + //k1 = f(t,y) + //k2 = f(t + dt/2, y + dt/2 * k1) + //k3 = f(t + dt/2, y + dt/2 * k2) + //k4 = f(t + dt, y + dt * k3) + //result = y + dt/6( k1 + 2 * k2 + 2 * k3 + k4) + + k = (float__N__)(gvelo_loc[noBC_id(i+__NN__)], + ); + + p = (c + hdt * k) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + kn = p*(vp-v) + v; + + k += 2.0 * kn; + + p = (c + hdt * kn) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + kn = p*(vp-v) + v; + + k += 2.0 * kn; + + p = (c + dt * kn) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + kn = p*(vp-v) + v; + + k += kn; + + + return c + (float__N__)(dt / 6.0) * k; +} diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl new file mode 100644 index 000000000..b571c4bc4 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl @@ -0,0 +1,82 @@ +/** + * @file basic_rk4_noVec.cl + * Advection function (RK4 scheme), basic version + */ + +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param gvelo Global velocity field. + * @return Particle position + * + * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
+ */ +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) +{ + float v, /* Velocity at point */ + vp, /* Velocity at right point */ + p, /* Intermediary position */ + k, /* rk averaged velocity */ + kn, /* rk intermediate velocity */ + c = i * dx, /* initial coordinate */ + hdt = 0.5 * dt; /* half time step */ + int i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + + //k1 = f(t,y) + //k2 = f(t + dt/2, y + dt/2 * k1) + //k3 = f(t + dt/2, y + dt/2 * k2) + //k4 = f(t + dt, y + dt * k3) + //result = y + dt/6( k1 + 2 * k2 + 2 * k3 + k4) + + k = gvelo_loc[noBC_id(i)]; /* k = k1 */ + + p = (c + hdt * k) * invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = gvelo_loc[noBC_id(i_ind)]; + vp = gvelo_loc[noBC_id(i_ind_p)]; + kn = p*(vp-v) + v; /* kn = k2 */ + + k += 2.0 * kn; /* k = k1 + 2*k2 */ + + p = (c + hdt * kn) * invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = gvelo_loc[noBC_id(i_ind)]; + vp = gvelo_loc[noBC_id(i_ind_p)]; + kn = p*(vp-v) + v; /* kn = k3 */ + + k += 2.0 * kn; /* k = k1 + 2*k2 + 2*k3 */ + + p = (c + dt * kn) * invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = gvelo_loc[noBC_id(i_ind)]; + vp = gvelo_loc[noBC_id(i_ind_p)]; + kn = p*(vp-v) + v; /* kn = k4 */ + + k += kn; /* k = k1 + 2*k2 + 2*k3 + k4 */ + + return c + dt * k / 6.0; +} +/* Operations number : */ +/* - 4 positions = 4 * 2 + 3 */ +/* - 3 iterpolation = 3 * 9 */ +/* - velocity weights = 5*/ +/* Total = 41 */ + diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin.cl b/HySoP/hysop/gpu/cl_src/advection/builtin.cl index cadc6085b..80757b014 100644 --- a/HySoP/hysop/gpu/cl_src/advection/builtin.cl +++ b/HySoP/hysop/gpu/cl_src/advection/builtin.cl @@ -25,25 +25,28 @@ float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gve float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) { float__N__ v, /* Velocity at point */ - p, /* Intermediary position */ vp, /* Velocity at right point */ - y, /* Normalized distance to left point */ - coord; /* Grid point coordinate */ - int__N__ ind, /* Integer coordinate (cell id) */ - i_ind, /* Interpolation left point */ + p, /* Intermediary position */ + c, /* initial coordinate */ + hdt = (float__N__)(0.5*dt); /* half time step */ + int__N__ i_ind, /* Interpolation left point */ i_ind_p; /* Interpolation right point */ + + c = (float__N__)((i+__NN__)*dx, + ); + v = (float__N__)(gvelo_loc[noBC_id(i+__NN__)], ); - coord = (float__N__)((i+__NN__)*dx, - ); - p = fma((float__N__)(0.5*dt),v,coord); - ind = convert_int__N___rtn(p * invdx); - y = (fma(- convert_float__N__(ind),dx,p))* invdx ; - i_ind = ((ind + NB_I) % NB_I); + + p = fma(hdt, v, c) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); i_ind_p = ((i_ind + 1) % NB_I); v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], ); vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], ); - return fma(mix(v,vp,y),dt,coord); + return fma(mix(v,vp,p),dt,c); } diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl index 5460cd3fe..a6ba8b975 100644 --- a/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl +++ 
b/HySoP/hysop/gpu/cl_src/advection/builtin_noVec.cl @@ -3,14 +3,14 @@ * Advection function, basic version */ -float advection(uint i, float dt, float dx, float invdx, __global float* gvelo); +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); /** * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. * Use of builtin OpenCL functions fma and mix. * - * @param i Velocity. + * @param i Particle index. * @param dt Time step. * @param dx Space step. * @param invdx 1/dx. @@ -19,23 +19,31 @@ float advection(uint i, float dt, float dx, float invdx, __global float* gvelo); * * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. */ -float advection(uint i, float dt, float dx, float invdx, __global float* gvelo) +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) { float v, /* Velocity at point */ p, /* Intermediary position */ - vp, /* Velocity at right point */ - y; /* Normalized distance to left point */ - int ind, /* Integer coordinate (cell id) */ - i_ind, /* Interpolation left point */ + c = i * dx, /* initial coordinate */ + hdt = 0.5 * dt; /* half time step */ + int i_ind, /* Interpolation left point */ i_ind_p; /* Interpolation right point */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ - v = gvelo[i+line_index]; - p = fma((float)(0.5*dt),v,i*dx); - ind = convert_int_rtn(p * invdx); - y = (fma(- convert_float(ind),dx,p))* invdx ; - i_ind = ((ind + NB_I) % NB_I); + + v = gvelo_loc[noBC_id(i)]; + + p = fma(hdt, v, c) * invdx; + + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); i_ind_p = ((i_ind + 1) % NB_I); - return fma(mix(gvelo[i_ind+line_index], - gvelo[i_ind_p+line_index],y), - dt,i*dx); + v = mix(gvelo_loc[noBC_id(i_ind)], + gvelo_loc[noBC_id(i_ind_p)],p); + + return fma(dt, v, c); } +/* Operations number : */ +/* - 2 positions = 2 * fma */ +/* - 1 iterpolation = 6 + 1 * mix */ +/* Total = 2 fma + 1 mix + 6 */ + diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl new file mode 100644 index 000000000..a67c69c16 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl @@ -0,0 +1,87 @@ +/** + * @file builtin_rk4.cl + * Advection function, vectorized version. + */ + +float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param gvelo_loc Local velocity cache. + * @return Particle position. + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
+ * @see parmepy.gpu.tools.parse_file + */ +float__N__ advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) +{ + float__N__ v, /* Velocity at point */ + vp, /* Velocity at right point */ + p, /* Intermediary position */ + k, /* rk averaged velocity */ + kn, /* rk intermediate velocity */ + c, /* initial coordinate */ + hdt = (float__N__)(0.5*dt); /* half time step */ + int__N__ i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + + c = (float__N__)((i+__NN__)*dx, + ); + + k = (float__N__)(gvelo_loc[noBC_id(i+__NN__)], + ); + + p = fma(hdt, k, c) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + kn = mix(v,vp,p); + + k += 2.0 * kn; + + p = fma(hdt, kn, c) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + kn = mix(v,vp,p); + + k += 2.0 * kn; + + p = fma((float__N__)(dt), kn, c) * invdx; + i_ind = convert_int__N___rtn(p); + p = p - convert_float__N__(i_ind); + + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + v = (float__N__)(gvelo_loc[noBC_id(i_ind.s__NN__)], + ); + vp = (float__N__)(gvelo_loc[noBC_id(i_ind_p.s__NN__)], + ); + kn = mix(v,vp,p); + + k += kn; + + + return fma(k,(float__N__)(dt/6.0),c); +} diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl new file mode 100644 index 000000000..da79c8950 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl @@ -0,0 +1,78 @@ +/** + * @file builtin_rk4_noVec.cl + * Advection function, basic version + */ + +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param gvelo Global velocity field. + * @return Particle position + * + * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
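The RK4 variants above accumulate k = k1 + 2*k2 + 2*k3 + k4 with the same linear interpolation at each stage. A short Python sketch (outside the patch, illustrative names) of the scheme these kernels implement on one periodic line:

import math

def interp(p, velocity):
    """Linear interpolation of the periodic velocity line at position p (in grid units)."""
    nb = len(velocity)
    left = int(math.floor(p))
    y = p - left
    return (1.0 - y) * velocity[left % nb] + y * velocity[(left + 1) % nb]

def advect_rk4(i, dt, dx, velocity):
    """RK4 advection of the particle initially on grid point i."""
    c = i * dx
    k1 = velocity[i]
    k2 = interp((c + 0.5 * dt * k1) / dx, velocity)
    k3 = interp((c + 0.5 * dt * k2) / dx, velocity)
    k4 = interp((c + dt * k3) / dx, velocity)
    return c + dt * (k1 + 2.0 * k2 + 2.0 * k3 + k4) / 6.0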
+ */ +float advection(uint i, float dt, float dx, float invdx, __local float* gvelo_loc) +{ + float p, /* Intermediary position */ + k, /* rk averaged velocity */ + kn, /* rk intermediate velocity */ + c = i * dx, /* initial coordinate */ + hdt = 0.5 * dt; /* half time step */ + int i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + + //k1 = f(t,y) + //k2 = f(t + dt/2, y + dt/2 * k1) + //k3 = f(t + dt/2, y + dt/2 * k2) + //k4 = f(t + dt, y + dt * k3) + //result = y + dt/6( k1 + 2 * k2 + 2 * k3 + k4) + + k = gvelo_loc[noBC_id(i)]; /* k = k1 */ + + + p = fma(hdt, k, c) * invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + kn = mix(gvelo_loc[noBC_id(i_ind)], + gvelo_loc[noBC_id(i_ind_p)],p); /* kn = k2 */ + + k += 2.0 * kn; /* k = k1 + 2*k2 */ + + p = fma(hdt, kn, c) * invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + kn = mix(gvelo_loc[noBC_id(i_ind)], + gvelo_loc[noBC_id(i_ind_p)],p); /* kn = k3 */ + + k += 2.0 * kn; /* k = k1 + 2*k2 + 2*k3 */ + + p = fma(dt, kn, c) * invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = ((i_ind + NB_I) % NB_I); + i_ind_p = ((i_ind + 1) % NB_I); + kn = mix(gvelo_loc[noBC_id(i_ind)], + gvelo_loc[noBC_id(i_ind_p)],p); /* kn = k4 */ + + k += kn; /* k = k1 + 2*k2 + 2*k3 + k4 */ + + return fma(k, dt/6.0, c); +} + +/* Operations number : */ +/* - 4 positions = 4 * fma + 3 */ +/* - 3 iterpolation = 3 * (1 * mix + 6) */ +/* - velocity weights = 5*/ +/* Total = 4 fma + 3 mix + 19 */ diff --git a/HySoP/hysop/gpu/cl_src/common.cl b/HySoP/hysop/gpu/cl_src/common.cl index ba88801f0..db36ee0c8 100644 --- a/HySoP/hysop/gpu/cl_src/common.cl +++ b/HySoP/hysop/gpu/cl_src/common.cl @@ -32,7 +32,7 @@ inline uint noBC_id(int id){ /** * Constants for remeshing formulas: - * - M4PRIME 1 + * - L2STAR 1 * - L2STAR_C2 2 * - L2STAR_C3 3 * - L2STAR_C4 4 @@ -46,7 +46,7 @@ inline uint noBC_id(int id){ * - M8PRIME 12 * - L8STAR 13 */ -#define M4PRIME 1 +#define L2STAR 1 #define L2STAR_C2 2 #define L2STAR_C3 3 #define L2STAR_C4 4 @@ -63,7 +63,7 @@ inline uint noBC_id(int id){ /** * Shift to left grid point */ -#if FORMULA == M4PRIME +#if FORMULA == L2STAR #define REMESH_SHIFT 1 #elif FORMULA == L2STAR_C2 #define REMESH_SHIFT 1 diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl index e147fb18f..225476a85 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl @@ -34,17 +34,16 @@ __kernel void advection_and_remeshing(__global const float* gvelo, float invdx = 1.0/dx; /* Space step inverse */ uint i; /* Particle index in 1D problem */ float p, /* Particle position */ - s, /* Particle scalar */ - v; /* Particle velocity */ + s; /* Particle scalar */ uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ __local float gscal_loc[NB_I]; /* Local buffer for result */ + __local float gvelo_loc[NB_I]; /* Velocity cache */ for(i=gidX; i<NB_I; i+=(WI_NB)) { /* Read velocity */ - v = gvelo[i+line_index]; /* Fill velocity cache */ - gvelo_loc[noBC_id(i)] = v; + gvelo_loc[noBC_id(i)] = gvelo[i+line_index]; /* Initialize result buffer */ gscal_loc[noBC_id(i)] = 0.0; } @@ -57,7 +56,7 @@ __kernel void advection_and_remeshing(__global const float* gvelo, /* Read Particle 
scalar */ s = pscal[i + line_index]; /* Compute particle position */ - p = advection(i, dt, dx, invdx, gvelo); + p = advection(i, dt, dx, invdx, gvelo_loc); /* Remesh particle */ remesh(i, dx, invdx, s, p, gscal_loc); } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl index ef2371559..6a27cdd91 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl @@ -26,9 +26,21 @@ __kernel void advection_kernel(__global const float* gvelo, uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ float invdx = 1.0/dx; /* Space step inverse */ uint i; /* Particle index in 1D problem */ + uint line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + + __local float velocity_cache[NB_I]; /* Velocity cache */ + + for(i=gidX; i<NB_I; i+=WI_NB) + { + /* Fill the cache */ + velocity_cache[noBC_id(i)] = gvelo[i+line_index]; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); for(i=gidX; i<NB_I; i+=WI_NB) { - ppos[i+gidY*NB_I+gidZ*NB_I*NB_II] = advection(i, dt, dx, invdx, gvelo); + ppos[i+line_index] = advection(i, dt, dx, invdx, velocity_cache) + min_position; } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl index 869ce36a7..e9d79c51b 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/copy_noVec.cl @@ -13,7 +13,7 @@ __kernel void copy(__global const float* in, __global float* out) { - uint xIndex = (get_group_id(0) * TILE_DIM_COPY + get_local_id(0)*__N__); + uint xIndex = get_group_id(0) * TILE_DIM_COPY + get_local_id(0); uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1); uint zIndex = get_global_id(2); uint index = xIndex + yIndex * NB_I + zIndex*NB_I*NB_II; diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl new file mode 100644 index 000000000..188fc0fa9 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl @@ -0,0 +1,66 @@ +/** + * @file remeshing.cl + * Remeshing kernel. + */ + +/** + * Performs remeshing of the particles' scalar. + * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. + * Each work-item computes <code>NB_I/WI_NB</code> particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles. + * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. + * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer. + * + * @param ppos Particle position + * @param pscal Particle scalar + * @param gscal Grid scalar + * @param min_position Domain lower coordinate + * @param dx Space step + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>WI_NB</code> corresponds to the work-item number. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
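The remeshing kernel below uses two loop shapes: a strided loop (i = gidX; i < NB_I; i += WI_NB) to initialise and write back the local buffer, and contiguous blocks of PART_NB_PER_WI particles per work-item for the remeshing itself, so that neighbouring particles, which scatter to overlapping grid points, are handled by the same work-item. A small Python sketch with assumed, illustrative sizes:

NB_I, WI_NB = 256, 64                 # illustrative line length and work-item count
PART_NB_PER_WI = NB_I // WI_NB        # here 4 particles per work-item

def strided_indices(gidX):
    """Indices handled by work-item gidX in the buffer-init and write-back loops."""
    return range(gidX, NB_I, WI_NB)

def contiguous_indices(gidX):
    """Indices handled by work-item gidX in the remeshing loop: one contiguous block."""
    return range(gidX * PART_NB_PER_WI, (gidX + 1) * PART_NB_PER_WI)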
+ * @see parmepy.gpu.tools.parse_file + */ +__kernel void remeshing_kernel(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ + uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ + uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + float invdx = 1.0/dx; /* Space step inverse */ + uint i; /* Particle index in 1D problem */ + float p, /* Particle position */ + s; /* Particle scalar */ + uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + __local float gscal_loc[NB_I]; /* Local buffer for result */ + + for(i=gidX; i<NB_I; i+=WI_NB) + { + /* Initialize result buffer */ + gscal_loc[i] = 0.0; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1) + { + /* Read particle position */ + p = ppos[i + line_index] - min_position; + /* Read particle scalar */ + s = pscal[i + line_index]; + /* Remesh particle */ + remesh(i, dx, invdx, s, p, gscal_loc); + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX; i<NB_I; i+=WI_NB) + { + /* Store result */ + gscal[i + line_index] = gscal_loc[noBC_id(i)]; + } +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl index acbd8405f..418a9d1ec 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy.cl @@ -37,6 +37,8 @@ __kernel void transpose_xy(__global const float* in, float__N__ temp; /* Temporary variable */ uint group_id_x; /* Work-group coordinate in global space index X */ uint group_id_y; /* Work-group coordinate in global space index Y */ + uint lid_x = get_local_id(0); + uint lid_y = get_local_id(1); /* Use of diagonal coordinates */ #if NB_II == NB_I group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); @@ -48,14 +50,14 @@ __kernel void transpose_xy(__global const float* in, #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*__N__; - uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1); + uint xIndex = group_id_x * TILE_DIM_XY + lid_x*__N__; + uint yIndex = group_id_y * TILE_DIM_XY + lid_y; uint zIndex = get_global_id(2); uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; /* Global output index */ - xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*__N__; - yIndex = group_id_x * TILE_DIM_XY + get_local_id(1); + xIndex = group_id_y * TILE_DIM_XY + lid_x*__N__; + yIndex = group_id_x * TILE_DIM_XY + lid_y; uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ @@ -64,7 +66,7 @@ __kernel void transpose_xy(__global const float* in, { /* Fill the tile */ temp = vload__N__((index_in + i * NB_II)/__N__, in); - tile[get_local_id(1) + i][get_local_id(0)*__N__+__NN__] = temp.s__NN__; + tile[lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__; } /* Synchronize work-group */ @@ -73,7 +75,7 @@ __kernel void transpose_xy(__global const float* in, for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) { /* Write transposed data */ - temp = (float__N__)(tile[get_local_id(0)*__N__+__NN__][get_local_id(1) + i], + temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y + i], ); vstore__N__(temp, (index_out + i*NB_I)/__N__, out); } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl 
b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl new file mode 100644 index 000000000..87e72c1e8 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xy_noVec.cl @@ -0,0 +1,79 @@ +/** + * @file transpose_xy.cl + * Transposition in XY plane, coalesced, diagonal coordinates, vectorized version. + */ + +/** + * Performs a transposition in xy plane. + * Optimizations used are: + * - Coalesced reads and writes by means of local memory buffer (tile), + * - Local memory padding to avoir banck conflicts (optional), + * - Work groups are mapped to diagonal coordinates in global memory, + * - Reads and writes are performed by OpenCL vector types. + * + * A work group handle transposition for a tile. Transposition is done when reading data in tile. + * Work-group layout: \code + * ________________________ + * |0,0 | 1,0 | ... + * |N,0 | 0,1 | 1,2 | ... + * | . . | 0,2 | ... + * | . . + * | . . + * | + * \endcode + * + * @param in Input data + * @param out Output data + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>. + * @remark <code>PADDING_XY</code> : local memory padding width. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. + * @see parmepy.gpu.tools.parse_file + */ +__kernel void transpose_xy(__global const float* in, + __global float* out) +{ + uint group_id_x; /* Work-group coordinate in global space index X */ + uint group_id_y; /* Work-group coordinate in global space index Y */ + uint lid_x = get_local_id(0); + uint lid_y = get_local_id(1); + /* Use of diagonal coordinates */ +#if NB_II == NB_I + group_id_x = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + group_id_y = get_group_id(0); +#else + uint bid = get_group_id(0) + get_group_id(1) * get_num_groups(0); + group_id_y = bid%get_num_groups(1); + group_id_x = ((bid/get_num_groups(1)) + group_id_y)%get_num_groups(0); +#endif + + /* Global input index for work-item */ + uint xIndex = group_id_x * TILE_DIM_XY + lid_x; + uint yIndex = group_id_y * TILE_DIM_XY + lid_y; + uint zIndex = get_global_id(2); + uint index_in = xIndex + yIndex * NB_II + zIndex * NB_II * NB_I; + + /* Global output index */ + xIndex = group_id_y * TILE_DIM_XY + lid_x; + yIndex = group_id_x * TILE_DIM_XY + lid_y; + uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+PADDING_XY]; /* Tile with padding */ + + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + /* Fill the tile */ + tile[lid_y + i][lid_x] = in[index_in + i * NB_II]; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + /* Write transposed data */ + out[index_out + i*NB_I] = tile[lid_x][lid_y + i]; + } +} + diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl index 743b4db43..b116a9551 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz.cl @@ -25,8 +25,12 @@ __kernel void transpose_xz(__global const float* in, __global float* out) { + float__N__ temp; /* Temporary variable */ uint group_id_x; /* Work-group coordinate in global space index X */ uint group_id_z; /* Work-group coordinate in global space index Y */ + uint lid_x = 
get_local_id(0); + uint lid_y = get_local_id(1); + uint lid_z = get_local_id(2); /* Use of diagonal coordinates */ #if NB_III == NB_I @@ -39,36 +43,39 @@ __kernel void transpose_xz(__global const float* in, #endif /* Global input index for work-item */ - uint xIndex = group_id_x * TILE_DIM_XZ + get_local_id(0); - uint yIndex = get_group_id(1) * TILE_DIM_XZ + get_local_id(1); - uint zIndex = group_id_z * TILE_DIM_XZ + get_local_id(2); + uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__; + uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y; + uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; /* Global output index */ - xIndex = group_id_z * TILE_DIM_XZ + get_local_id(0); - zIndex = group_id_x * TILE_DIM_XZ + get_local_id(2); + xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ) + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) { /* Fill the tile */ - tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*NB_III + j*NB_III*NB_II]; + temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in); + tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__; } } /* Synchronize work-group */ barrier(CLK_LOCAL_MEM_FENCE); - for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ) + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) { for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) { /* Write transposed data */ - out[index_out + i*NB_I + j*NB_I*NB_II] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j]; + temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_y+i][lid_z + j], + ); + vstore__N__(temp, (index_out + i*NB_I + j*NB_I*NB_II)/__N__, out); } } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl new file mode 100644 index 000000000..28181ea0f --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_noVec.cl @@ -0,0 +1,77 @@ +/** + * @file transpose_xz.cl + * Transposition in XZ plane, coalesced, diagonal coordinates, 3D tiles. + */ + +/** + * Perfoms a transposition in XZ plane. As data have to be contiguously read an write in global memory, we use a 3D tile. + * Optimizations used are: + * - Coalesced reads and writes by means of local memory buffer (tile), + * - Local memory padding to avoir banck conflicts (optional), + * - Work groups are mapped to diagonal coordinates in global memory, + * - Reads and writes are performed by OpenCL vector types. + * + * + * @param in Input data + * @param out Output data + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>. + * @remark <code>PADDING_XZ</code> : local memory padding width. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
+ * @see parmepy.gpu.tools.parse_file + * @see transpose_xy.cl + */ +__kernel void transpose_xz(__global const float* in, + __global float* out) +{ + uint group_id_x; /* Work-group coordinate in global space index X */ + uint group_id_z; /* Work-group coordinate in global space index Y */ + uint lid_x = get_local_id(0); + uint lid_y = get_local_id(1); + uint lid_z = get_local_id(2); + + /* Use of diagonal coordinates */ +#if NB_III == NB_I + group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); + group_id_z = get_group_id(0); +#else + uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); + group_id_z = bid%get_num_groups(2); + group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); +#endif + + /* Global input index for work-item */ + uint xIndex = group_id_x * TILE_DIM_XZ + lid_x; + uint yIndex = get_group_id(1) * TILE_DIM_XZ + lid_y; + uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; + uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) + { + for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) + { + /* Fill the tile */ + tile[lid_z + j][lid_y + i][lid_x] = in[index_in + i*NB_III + j*NB_III*NB_II]; + } + + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) + { + for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) + { + /* Write transposed data */ + out[index_out + i*NB_I + j*NB_I*NB_II] = tile[lid_x][lid_y+i][lid_z + j]; + } + } +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl new file mode 100644 index 000000000..611aa465b --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice.cl @@ -0,0 +1,74 @@ +/** + * @file transpose_xz.cl + * Transposition in XZ plane, coalesced, diagonal coordinates, 3D tiles. + */ + +/** + * Perfoms a transposition in XZ plane. As data have to be contiguously read an write in global memory, we use a 3D tile. + * Optimizations used are: + * - Coalesced reads and writes by means of local memory buffer (tile), + * - Local memory padding to avoir banck conflicts (optional), + * - Work groups are mapped to diagonal coordinates in global memory, + * - Reads and writes are performed by OpenCL vector types. + * + * + * @param in Input data + * @param out Output data + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>. + * @remark <code>PADDING_XZ</code> : local memory padding width. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
+ * @see parmepy.gpu.tools.parse_file + * @see transpose_xy.cl + */ +__kernel void transpose_xz(__global const float* in, + __global float* out) +{ + float__N__ temp; /* Temporary variable */ + uint group_id_x; /* Work-group coordinate in global space index X */ + uint group_id_z; /* Work-group coordinate in global space index Y */ + uint lid_x = get_local_id(0); + uint lid_z = get_local_id(2); + + /* Use of diagonal coordinates */ +#if NB_III == NB_I + group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); + group_id_z = get_group_id(0); +#else + uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); + group_id_z = bid%get_num_groups(2); + group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); +#endif + + /* Global input index for work-item */ + uint xIndex = group_id_x * TILE_DIM_XZ + lid_x*__N__; + uint yIndex = get_global_id(1); + uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; + uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x*__N__; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) + { + /* Fill the tile */ + temp = vload__N__((index_in + j*NB_III*NB_II)/__N__, in); + tile[lid_z + j][lid_x*__N__+__NN__] = temp.s__NN__; + + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) + { + /* Write transposed data */ + temp = (float__N__)(tile[lid_x*__N__+__NN__][lid_z + j], + ); + vstore__N__(temp, (index_out + j*NB_I*NB_II)/__N__, out); + } +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl new file mode 100644 index 000000000..fc4b1322a --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/transpose_xz_slice_noVec.cl @@ -0,0 +1,70 @@ +/** + * @file transpose_xz.cl + * Transposition in XZ plane, coalesced, diagonal coordinates, 3D tiles. + */ + +/** + * Perfoms a transposition in XZ plane. As data have to be contiguously read an write in global memory, we use a 3D tile. + * Optimizations used are: + * - Coalesced reads and writes by means of local memory buffer (tile), + * - Local memory padding to avoir banck conflicts (optional), + * - Work groups are mapped to diagonal coordinates in global memory, + * - Reads and writes are performed by OpenCL vector types. + * + * + * @param in Input data + * @param out Output data + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. Output layout is <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code>. + * @remark <code>PADDING_XZ</code> : local memory padding width. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
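All the transposition kernels above remap work-group ids to diagonal coordinates, so that consecutive work-groups read and write tiles living in different global-memory partitions. The mapping, extracted into a Python sketch for clarity (gx and gy stand for the Cartesian ids returned by get_group_id; the square branch corresponds to NB_I == NB_II, and the XZ kernels apply the same formula to dimensions 0 and 2):

def diagonal_group_ids(gx, gy, num_groups_x, num_groups_y):
    """Return (group_id_x, group_id_y) as computed in transpose_xy*.cl / transpose_xz*.cl."""
    if num_groups_x == num_groups_y:             # square grid: simple diagonal shift
        return (gx + gy) % num_groups_x, gx
    bid = gx + gy * num_groups_x                 # rectangular grid: flatten, then re-split
    gid_y = bid % num_groups_y
    gid_x = (bid // num_groups_y + gid_y) % num_groups_x
    return gid_x, gid_y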
+ * @see parmepy.gpu.tools.parse_file + * @see transpose_xy.cl + */ +__kernel void transpose_xz(__global const float* in, + __global float* out) +{ + uint group_id_x; /* Work-group coordinate in global space index X */ + uint group_id_z; /* Work-group coordinate in global space index Y */ + uint lid_x = get_local_id(0); + uint lid_z = get_local_id(2); + + /* Use of diagonal coordinates */ +#if NB_III == NB_I + group_id_x = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); + group_id_z = get_group_id(0); +#else + uint bid = get_group_id(0) + get_group_id(2) * get_num_groups(0); + group_id_z = bid%get_num_groups(2); + group_id_x = ((bid/get_num_groups(2)) + group_id_z)%get_num_groups(0); +#endif + + /* Global input index for work-item */ + uint xIndex = group_id_x * TILE_DIM_XZ + lid_x; + uint yIndex = get_global_id(1); + uint zIndex = group_id_z * TILE_DIM_XZ + lid_z; + uint index_in = xIndex + yIndex * NB_III + zIndex * NB_III * NB_II; + + /* Global output index */ + xIndex = group_id_z * TILE_DIM_XZ + lid_x; + zIndex = group_id_x * TILE_DIM_XZ + lid_z; + uint index_out = xIndex + yIndex * NB_I + zIndex * NB_I * NB_II; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ+PADDING_XZ]; /* Tile with padding */ + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) + { + /* Fill the tile */ + tile[lid_z + j][lid_x] = in[index_in + j*NB_III*NB_II]; + + } + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_DEPH_XZ) + { + /* Write transposed data */ + out[index_out + j*NB_I*NB_II] = tile[lid_x][lid_z + j]; + } +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl index b43a4adb7..a9d67638d 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl @@ -31,6 +31,8 @@ void remesh(uint i, float dx, float invdx, float s, float p, __local float* gsca int ind; /* Integer coordinate */ uint index; /* Remeshing index */ + + ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl new file mode 100644 index 000000000..df1538d86 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl @@ -0,0 +1,95 @@ +/** + * @file private.cl + * Remeshing function, vectorized, private variable. + */ + +void remesh(uint i, float dx, float invdx, float s, float p, __local float* gscal_loc); + + +/** + * Remesh particles in local buffer. + * + * Remeshing formula is given a compiling time. + * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. + * Use of a private temporary variable for remeshing weights. + * + * @param i Particle index + * @param dx Space step + * @param invdx 1/dx + * @param s Particle scalar + * @param p Particle position + * @param gscal_loc Local buffer for result + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
+ * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} + * @see parmepy.gpu.tools.parse_file + * @see parmepy.gpu.cl_src.common + */ +void remesh(uint i, float dx, float invdx, float s, float p, __local float* gscal_loc){ + float y, /* Normalized distance to nearest left grid point */ + temp; /* Temporary remeshing weights */ + int ind; /* Integer coordinate */ + uint index; /* Remeshing index */ + + ind = convert_int_rtn(p * invdx); + y = (p - convert_float(ind) * dx) * invdx; + + index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I); + + temp = alpha(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = beta(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = gamma(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = delta(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = (index + 1) % NB_I; + temp = eta(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = zeta(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = (index + 1) % NB_I; + temp = theta(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = iota(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + temp = kappa(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = mu(y) * s; + gscal_loc[noBC_id(index)] += temp; + barrier(CLK_LOCAL_MEM_FENCE); +#endif +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl index fec13ff85..4e352a7e5 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl @@ -4,7 +4,7 @@ * Polynomials under Horner form. */ -#if FORMULA == M4PRIME +#if FORMULA == L2STAR inline float__N__ alpha(float__N__ y){ return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl index 910aafda3..7f8bf2a80 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl @@ -4,7 +4,7 @@ * Polynomials under Horner form. */ -#if FORMULA == M4PRIME +#if FORMULA == L2STAR inline float__N__ alpha(float__N__ y){ return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl index 70782b26a..f3e2e950f 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl @@ -4,7 +4,7 @@ * Polynomials under Horner form. 
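The M4PRIME flag is renamed L2STAR here; the corresponding Lambda-2* (M4') formula spreads each particle over the four grid points ind-1 .. ind+2 (REMESH_SHIFT = 1). A Python sketch of that remeshing step: the alpha polynomial is the one visible in weights.cl above, while the other three weights are the standard M4' polynomials, assumed rather than copied from the patch:

import math

def m4prime_weights(y):
    """Weights for grid points ind-1, ind, ind+1, ind+2 (y = normalized distance to ind)."""
    return (y * (y * (-y + 2.0) - 1.0) / 2.0,        # alpha  (ind - 1), as in weights.cl
            1.0 + y * y * (3.0 * y - 5.0) / 2.0,     # beta   (ind)      [assumed]
            y * (y * (-3.0 * y + 4.0) + 1.0) / 2.0,  # gamma  (ind + 1)  [assumed]
            y * y * (y - 1.0) / 2.0)                 # delta  (ind + 2)  [assumed]

def remesh_particle(p, s, dx, gscal):
    """Scatter the scalar s of a particle at position p onto the periodic grid gscal."""
    nb = len(gscal)
    ind = int(math.floor(p / dx))
    y = p / dx - ind
    for shift, w in enumerate(m4prime_weights(y)):
        gscal[(ind - 1 + shift) % nb] += w * s       # REMESH_SHIFT = 1 for this formula

The four weights sum to 1 for any y, so the scatter conserves the total scalar quantity on the line.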
*/ -#if FORMULA == M4PRIME +#if FORMULA == L2STAR inline float alpha(float y){ return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl index 69de73a5e..0a703d65a 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl @@ -4,7 +4,7 @@ * Polynomials under Horner form. */ -#if FORMULA == M4PRIME +#if FORMULA == L2STAR inline float alpha(float y){ return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);} diff --git a/HySoP/hysop/gpu/config_cayman.py b/HySoP/hysop/gpu/config_cayman.py new file mode 100644 index 000000000..d537bec97 --- /dev/null +++ b/HySoP/hysop/gpu/config_cayman.py @@ -0,0 +1,155 @@ +""" +@file config_cayman.py + +OpenCL kernels configurations. +""" +from parmepy.gpu import PARMES_REAL_GPU, PARMES_DOUBLE_GPU + +#build empty dictionaries +kernels_config = {} +kernels_config[2] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}} +kernels_config[3] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}} + +# Copy kernel: +def copy_space_index_2d(size, t_dim, b_rows, vec): + gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +def copy_space_index_3d(size, t_dim, b_rows, vec): + gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2])) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +# Configs : sources, tile size, block rows, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['copy'] = \ + ('kernels/copy.cl', 16, 8, 4, copy_space_index_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['copy'] = \ + ('kernels/copy_locMem.cl', 32, 8, 1, copy_space_index_3d) +kernels_config[2][PARMES_REAL_GPU]['copy'] = \ + ('kernels/copy.cl', 16, 8, 2, copy_space_index_2d) +kernels_config[2][PARMES_DOUBLE_GPU]['copy'] = \ + ('kernels/copy.cl', 32, 2, 2, copy_space_index_2d) + +# Transpositions kernels: +# XY transposition +# Settings are taken from destination layout as current layout. +# gwi is computed form input layout (appears as transposed layout) +def xy_space_index_2d(size, t_dim, b_rows, vec): + gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +def xy_space_index_3d(size, t_dim, b_rows, vec): + gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2])) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +# Configs : sources, tile size, block rows, is padding, vector size, +# index space function +kernels_config[3][PARMES_REAL_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy.cl', 16, 8, True, 2, xy_space_index_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy.cl', 32, 4, True, 4, xy_space_index_3d) +kernels_config[2][PARMES_REAL_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy.cl', 32, 8, True, 4, xy_space_index_2d) +kernels_config[2][PARMES_DOUBLE_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy.cl', 32, 2, True, 4, xy_space_index_2d) + +# XZ transposition +# Settings are taken from destination layout as current layout. 
+# gwi is computed form input layout (appears as transposed layout) +def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec): + gwi = (int(size[2] / vec), int(b_rows * size[1] / t_dim), int(b_deph * size[0] / t_dim)) + lwi = (t_dim / vec, b_rows, b_deph) + return gwi, lwi +# Configs : sources, tile size, block rows, block depth, is padding, +# vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['transpose_xz'] = \ + ('kernels/transpose_xz.cl', 16, 4, 4, True, 1, xy_space_index_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xz'] = \ + ('kernels/transpose_xz.cl', 8, 2, 2, False, 1, xy_space_index_3d) + + +def computational_kernels_index_space(size, vec): + dim = len(size) + if dim == 3: + wi = 64 + if dim == 2: + wi = 256 + # Change work-item regarding problem size + if size[0] % wi > 0: + if dim == 3: + print "Warning : GPU best performances obtained for", + print "problem sizes multiples of 64" + else: + print "Warning : GPU best performances obtained for", + print "problem sizes multiples of 256" + while(size[0] % wi > 0): + wi = wi / 2 + # Change work-item regarding vector_width + if wi * vec > size[0]: + if size[0] % vec > 0: + raise ValueError( + "Resolution ({0}) must be a multiple of {1}".format( + size[0], vec)) + wi = size[0] // vec + if dim == 3: + gwi = (int(wi), int(size[1]), int(size[2])) + lwi = (int(wi), 1, 1) + else: + gwi = (int(wi), int(size[1])) + lwi = (int(wi), 1) + return gwi, lwi + +# Advection kernel +# Configs sources, is noBC, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['advec'] = \ + (["common.cl", "advection/builtin.cl", "kernels/advection.cl"], + False, 4, computational_kernels_index_space) +kernels_config[3][PARMES_DOUBLE_GPU]['advec'] = \ + (["common.cl", "advection/builtin.cl", "kernels/advection.cl"], + False, 2, computational_kernels_index_space) +kernels_config[2][PARMES_REAL_GPU]['advec'] = \ + (["common.cl", "advection/builtin.cl", "kernels/advection.cl"], + False, 4, computational_kernels_index_space) +kernels_config[2][PARMES_DOUBLE_GPU]['advec'] = \ + (["common.cl", "advection/builtin_noVec.cl", "kernels/advection_noVec.cl"], + False, 1, computational_kernels_index_space) + +# Remeshing kernel +# Configs sources, is noBC, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl", + "kernels/remeshing.cl"], + False, 4, computational_kernels_index_space) +kernels_config[3][PARMES_DOUBLE_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl", + "kernels/remeshing.cl"], + False, 4, computational_kernels_index_space) +kernels_config[2][PARMES_REAL_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl", + "kernels/remeshing.cl"], + True, 4, computational_kernels_index_space) +kernels_config[2][PARMES_DOUBLE_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl", + "kernels/remeshing.cl"], + True, 4, computational_kernels_index_space) + +# Advection and remeshing kernel +# Configs sources, is noBC, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_builtin.cl", "remeshing/private.cl", + "advection/builtin.cl", "kernels/advection_and_remeshing.cl"], + False, 4, computational_kernels_index_space) +kernels_config[3][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_builtin.cl", 
"remeshing/private.cl", + "advection/builtin.cl", "kernels/advection_and_remeshing.cl"], + True, 4, computational_kernels_index_space) +kernels_config[2][PARMES_REAL_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl", + "advection/builtin.cl", "kernels/advection_and_remeshing.cl"], + True, 8, computational_kernels_index_space) +kernels_config[2][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl", + "advection/builtin.cl", "kernels/advection_and_remeshing.cl"], + True, 4, computational_kernels_index_space) + + + diff --git a/HySoP/hysop/gpu/config_k20m.py b/HySoP/hysop/gpu/config_k20m.py new file mode 100644 index 000000000..726a79b12 --- /dev/null +++ b/HySoP/hysop/gpu/config_k20m.py @@ -0,0 +1,161 @@ +""" +@file config_k20m.py + +OpenCL kernels configurations. +""" +from parmepy.gpu import PARMES_REAL_GPU, PARMES_DOUBLE_GPU + +#build empty dictionaries +kernels_config = {} +kernels_config[2] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}} +kernels_config[3] = {PARMES_REAL_GPU: {}, PARMES_DOUBLE_GPU: {}} + +# Copy kernel: +def copy_space_index_2d(size, t_dim, b_rows, vec): + gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), 1) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +def copy_space_index_3d(size, t_dim, b_rows, vec): + gwi = (int(size[0] / vec), int(b_rows * size[1] / t_dim), int(size[2])) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +# Configs : sources, tile size, block rows, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['copy'] = \ + ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['copy'] = \ + ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_3d) +kernels_config[2][PARMES_REAL_GPU]['copy'] = \ + ('kernels/copy_noVec.cl', 32, 8, 1, copy_space_index_2d) +kernels_config[2][PARMES_DOUBLE_GPU]['copy'] = \ + ('kernels/copy_noVec.cl', 16, 16, 1, copy_space_index_2d) + +# Transpositions kernels: +# XY transposition +# Settings are taken from destination layout as current layout. +# gwi is computed form input layout (appears as transposed layout) +def xy_space_index_2d(size, t_dim, b_rows, vec): + gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), 1) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +def xy_space_index_3d(size, t_dim, b_rows, vec): + gwi = (int(size[1] / vec), int(b_rows * size[0] / t_dim), int(size[2])) + lwi = (t_dim / vec, b_rows, 1) + return gwi, lwi +# Configs : sources, tile size, block rows, is padding, vector size, +# index space function +kernels_config[3][PARMES_REAL_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy_noVec.cl', 32, 4, True, 1, xy_space_index_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy_noVec.cl', 32, 16, True, 1, xy_space_index_3d) +kernels_config[2][PARMES_REAL_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy_noVec.cl', 32, 2, True, 1, xy_space_index_2d) +kernels_config[2][PARMES_DOUBLE_GPU]['transpose_xy'] = \ + ('kernels/transpose_xy_noVec.cl', 32, 8, True, 1, xy_space_index_2d) + +# XZ transposition +# Settings are taken from destination layout as current layout. 
+# gwi is computed form input layout (appears as transposed layout) +def xz_space_index_3d(size, t_dim, b_rows, b_deph, vec): + gwi = (int(size[2] / vec), int(size[1]), int(b_deph * size[0] / t_dim)) + lwi = (t_dim / vec, 1, b_deph) + return gwi, lwi +# Configs : sources, tile size, block rows, is padding, vector size, +# index space function +kernels_config[3][PARMES_REAL_GPU]['transpose_xz'] = \ + ('kernels/transpose_xz_slice_noVec.cl', 32, 0, 8, True, 1, xz_space_index_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['transpose_xz'] = \ + ('kernels/transpose_xz_slice_noVec.cl', 32, 0, 4, True, 1, xz_space_index_3d) + +def computational_kernels_index_space(wi, size, vec): + # Change work-item regarding vector_width + if wi * vec > size[0]: + if size[0] % vec > 0: + raise ValueError( + "Resolution ({0}) must be a multiple of {1}".format( + size[0], vec)) + wi = size[0] // vec + + if len(size) == 3: + gwi = (int(wi), int(size[1]), int(size[2])) + lwi = (int(wi), 1, 1) + else: + gwi = (int(wi), int(size[1])) + lwi = (int(wi), 1) + return gwi, lwi + +def advection_index_space_3d(size, vec): + wi = min(max(64, size[0] / 4), 1024) + return computational_kernels_index_space(wi, size, vec) +def advection_index_space_2d_SP(size, vec): + wi = min(size[0] / 8, 1024) + return computational_kernels_index_space(wi, size, vec) +def advection_index_space_2d_DP(size, vec): + wi = min(size[0] / 4, 1024) + return computational_kernels_index_space(wi, size, vec) + +def remeshing_index_space_3d(size, vec): + wi = min(size[0] / 2, 1024) + return computational_kernels_index_space(wi, size, vec) +def remeshing_index_space_2d(size, vec): + wi = min(size[0] / 4, 1024) + return computational_kernels_index_space(wi, size, vec) + +def advection_and_remeshing_index_space(size, vec): + wi = min(size[0] / 2, 1024) + return computational_kernels_index_space(wi, size, vec) + + +# Advection kernel +# Configs sources, is noBC, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['advec'] = \ + (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"], + False, 1, advection_index_space_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['advec'] = \ + (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"], + False, 1, advection_index_space_3d) +kernels_config[2][PARMES_REAL_GPU]['advec'] = \ + (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"], + False, 1, advection_index_space_2d_SP) +kernels_config[2][PARMES_DOUBLE_GPU]['advec'] = \ + (["common.cl", "advection/builtin_rk4_noVec.cl", "kernels/advection_noVec.cl"], + False, 1, advection_index_space_2d_DP) + +# Remeshing kernel +# Configs sources, is noBC, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic_noVec.cl", + "kernels/remeshing_noVec.cl"], + False, 1, remeshing_index_space_3d) +kernels_config[3][PARMES_DOUBLE_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic_noVec.cl", + "kernels/remeshing_noVec.cl"], + False, 1, remeshing_index_space_3d) +kernels_config[2][PARMES_REAL_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl", + "kernels/remeshing.cl"], + True, 2, remeshing_index_space_2d) +kernels_config[2][PARMES_DOUBLE_GPU]['remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/basic.cl", + "kernels/remeshing.cl"], + True, 2, remeshing_index_space_2d) + + +# Advection and remeshing 
kernel +# Configs sources, is noBC, vector size, index space function +kernels_config[3][PARMES_REAL_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl", + "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"], + False, 1, advection_and_remeshing_index_space) +kernels_config[3][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl", + "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"], + False, 1, advection_and_remeshing_index_space) +kernels_config[2][PARMES_REAL_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl", + "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"], + False, 1, advection_and_remeshing_index_space) +kernels_config[2][PARMES_DOUBLE_GPU]['advec_and_remesh'] = \ + (["common.cl", "remeshing/weights_noVec_builtin.cl", "remeshing/private_noVec.cl", + "advection/builtin_rk4_noVec.cl", "kernels/advection_and_remeshing_noVec.cl"], + False, 1, advection_and_remeshing_index_space) + diff --git a/HySoP/hysop/gpu/gpu_discrete.py b/HySoP/hysop/gpu/gpu_discrete.py index 08e007365..9c6ad5071 100644 --- a/HySoP/hysop/gpu/gpu_discrete.py +++ b/HySoP/hysop/gpu/gpu_discrete.py @@ -51,7 +51,7 @@ class GPUDiscreteField(DiscreteField): self.gpu_data[d] = cl.Buffer(self.queue.context, cl.mem_flags.READ_WRITE, size=self.data[d].nbytes) - self.mem_size += self.gpu_data[d].size + self.mem_size += self.data[d].nbytes if __VERBOSE__: print self.name, self.mem_size, "Bytes (", print self.mem_size / (1024 ** 2), "MB)" @@ -74,18 +74,6 @@ class GPUDiscreteField(DiscreteField): vfield, queue, vfield.topology, vfield.isVector, vfield.name, precision) - vfield.gpu_data = [None] * vfield.nbComponents - vfield.layout = layout - for d in xrange(vfield.nbComponents): - vfield.data[d] = np.asarray(vfield.data[d], - dtype=precision, order=ORDER) - vfield.gpu_data[d] = cl.Buffer(vfield.queue.context, - cl.mem_flags.READ_WRITE, - size=vfield.data[d].nbytes) - vfield.mem_size += vfield.gpu_data[d].size - if __VERBOSE__: - print vfield.name, vfield.mem_size, "Bytes (", - print vfield.mem_size / (1024 ** 2), "MB)" def setInitializationKernel(self, kernel): """ diff --git a/HySoP/hysop/gpu/gpu_particle_advection.py b/HySoP/hysop/gpu/gpu_particle_advection.py index 2d218b7e5..be278dc25 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection.py +++ b/HySoP/hysop/gpu/gpu_particle_advection.py @@ -10,6 +10,7 @@ from parmepy.operator.discrete.particle_advection import ParticleAdvection from parmepy.gpu import PARMES_REAL_GPU, cl from parmepy.gpu.tools import get_opencl_environment from parmepy.gpu.gpu_kernel import KernelLauncher +from parmepy.tools.timers import Timer class GPUParticleAdvection(ParticleAdvection): @@ -50,23 +51,27 @@ class GPUParticleAdvection(ParticleAdvection): ParticleAdvection.__init__(self, velocity, advectedFields, d, part_position, part_advectedFields, method) self.available_methods = [ - 'l2star_c4', 'l2star_c3', 'l2star_c2', 'm4prime', + 'l2star_c4', 'l2star_c3', 'l2star_c2', 'l2star', 'l4star_c4', 'l4star_c3', 'l4star', 'l6star_c6', 'l6star_c5', 'l6star_c4', 'l6star', 'm8prime', 'l8star'] - self.method = 'l4star' + self.method = None for m in self.available_methods: if method.find(m) >= 0: self.method = m break + if self.method is None: + print "Unknown remeshing method : use default l4star" + self.method = 'l4star' self.name += ' 
GPU ' + self.method self.user_gpu_src = src self.num_method = None + self.dim = self.advectedFields[0].dimension # Compute resolutions for kernels for each direction. resol = self.advectedFields[0].topology.mesh.resolution self.resolution = resol.copy() - if len(resol) == 2: + if self.dim == 2: if self.dir == 1: self.resolution[0] = resol[1] self.resolution[1] = resol[0] @@ -85,6 +90,8 @@ class GPUParticleAdvection(ParticleAdvection): resol) self.warning = True self.prg = None + ## Object to store computational times of OpenCL kernels + self.kernels_timer = Timer(self) @debug def setUp(self): @@ -103,13 +110,12 @@ class GPUParticleAdvection(ParticleAdvection): if __VERBOSE__: print "Work-items basic config : ", print self.workItemNumber, self.gwi, self.lwi - dim = self.advectedFields[0].dimension self.coord_min = np.ones(4, dtype=self.gpu_precision) self.mesh_size = np.ones(4, dtype=self.gpu_precision) - self.coord_min[0:dim] = np.asarray( + self.coord_min[0:self.dim] = np.asarray( self.advectedFields[0].topology.mesh.origin, dtype=self.gpu_precision) - self.mesh_size[0:dim] = np.asarray( + self.mesh_size[0:self.dim] = np.asarray( self.advectedFields[0].topology.mesh.space_step, dtype=self.gpu_precision) self.build_options = "" @@ -199,40 +205,15 @@ class GPUParticleAdvection(ParticleAdvection): """ build_options = self.build_options # copy settings - if len(self.resolution) == 3: - if self.gpu_precision is PARMES_REAL_GPU: - vec = 4 - src_copy = 'kernels/copy.cl' - t_dim = 16 - b_rows = 8 - else: - vec = 1 - src_copy = 'kernels/copy_locMem.cl' - t_dim = 32 - b_rows = 8 - else: - vec = 2 - if self.gpu_precision is PARMES_REAL_GPU: - src_copy = 'kernels/copy.cl' - t_dim = 16 - b_rows = 8 - else: - src_copy = 'kernels/copy.cl' - t_dim = 32 - b_rows = 2 + src, t_dim, b_rows, vec, f_space = \ + self.cl_env.kernels_config[self.dim][self.gpu_precision]['copy'] + gwi, lwi = f_space(self.resolution, t_dim, b_rows, vec) + # Build code build_options += " -D TILE_DIM_COPY={0}".format(t_dim) build_options += " -D BLOCK_ROWS_COPY={0}".format(b_rows) - # Direction specific settings - # prg = len(self.resolution) * [None] - # gwi = len(self.resolution) * [None] - # lwi = len(self.resolution) * [None] - gwi = (int(self.resolution[0] / vec), - int(b_rows * self.resolution[1] / t_dim), - int(1 if len(self.resolution) == 2 else self.resolution[2])) - lwi = (t_dim / vec, b_rows, 1) prg = self.cl_env.build_src( - src_copy, + src, build_options + self.cl_env.default_sizes_constants[self.dir], vec) self.init_copy = KernelLauncher(prg.copy, @@ -313,63 +294,36 @@ class GPUParticleAdvection(ParticleAdvection): """ # XY transposition settings build_options = self.build_options - src_transpose_xy = 'kernels/transpose_xy.cl' - build_options += " -D PADDING_XY=1" - if len(self.resolution) == 3: - if self.gpu_precision is PARMES_REAL_GPU: - vec = 2 - t_dim = 16 - b_rows = 8 - else: - vec = 4 - t_dim = 32 - b_rows = 4 + src, t_dim, b_rows, is_padding, vec, f_space = \ + self.cl_env.kernels_config[self.dim][self.gpu_precision]['transpose_xy'] + gwi, lwi = f_space(self.resolution, t_dim, b_rows, vec) + + if is_padding: + build_options += " -D PADDING_XY=1" else: - vec = 4 - if self.gpu_precision is PARMES_REAL_GPU: - t_dim = 32 - b_rows = 8 - else: - t_dim = 32 - b_rows = 2 + build_options += " -D PADDING_XY=0" build_options += " -D TILE_DIM_XY={0}".format(t_dim) build_options += " -D BLOCK_ROWS_XY={0}".format(b_rows) - # Settings are taken from destination layout as current layout. 
- # gwi is computed form input layout (appears as transposed layout) - gwi = (int(self.resolution[1] / vec), - int(b_rows * self.resolution[0] / t_dim), - int(1 if len(self.resolution) == 2 else self.resolution[2])) - lwi = (t_dim / vec, b_rows, 1) prg = self.cl_env.build_src( - src_transpose_xy, + src, build_options + self.cl_env.default_sizes_constants[self.dir], vec) self.init_transpose_xy = KernelLauncher( prg.transpose_xy, self.cl_env.queue, gwi, lwi) - if len(self.resolution) == 3: + if self.dim == 3: # XY transposition settings build_options = self.build_options - src_transpose_xz = 'kernels/transpose_xz.cl' - vec = 1 - if self.gpu_precision is PARMES_REAL_GPU: + src, t_dim, b_rows, b_deph, is_padding, vec, f_space = \ + self.cl_env.kernels_config[self.dim][self.gpu_precision]['transpose_xz'] + gwi, lwi = f_space(self.resolution, t_dim, b_rows, b_deph, vec) + + if is_padding: build_options += " -D PADDING_XZ=1" - t_dim = 16 - b_rows = 4 else: build_options += " -D PADDING_XZ=0" - t_dim = 8 - b_rows = 2 - build_options += " -D TILE_DIM_XZ={0}".format(t_dim) - build_options += " -D BLOCK_ROWS_XZ={0}".format(b_rows) - # Settings are taken from destination layout as current layout. - # gwi is computed form input layout (appears as transposed layout) - gwi = (int(self.resolution[2] / vec), - int(b_rows * self.resolution[1] / t_dim), - int(b_rows * self.resolution[0] / t_dim)) - lwi = (t_dim / vec, b_rows, b_rows) prg = self.cl_env.build_src( - src_transpose_xz, + src, build_options + self.cl_env.default_sizes_constants[self.dir], vec) self.init_transpose_xz = KernelLauncher( @@ -429,13 +383,14 @@ class GPUParticleAdvection(ParticleAdvection): gpudf.release() if self.init_copy.f_timer is not None: for f_timer in self.init_copy.f_timer: - self.timer.addFunctionTimer(f_timer) + self.kernels_timer.addFunctionTimer(f_timer) for f_timer in self.init_transpose_xy.f_timer: - self.timer.addFunctionTimer(f_timer) + self.kernels_timer.addFunctionTimer(f_timer) if len(self.resolution) == 3: for f_timer in self.init_transpose_xz.f_timer: - self.timer.addFunctionTimer(f_timer) + self.kernels_timer.addFunctionTimer(f_timer) ParticleAdvection.finalize(self) + self.timer.addSubTimer(self.kernels_timer, ' Details:') def __str__(self): s = "GPUParticleAdvection (DiscreteOperator). 
" diff --git a/HySoP/hysop/gpu/gpu_particle_advection_1k.py b/HySoP/hysop/gpu/gpu_particle_advection_1k.py index 68b045655..c7b4708db 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection_1k.py +++ b/HySoP/hysop/gpu/gpu_particle_advection_1k.py @@ -138,43 +138,20 @@ class GPUParticleAdvection1k(GPUParticleAdvection): </table> """ build_options = self.build_options - file_list = ["common.cl"] - with_noBC = True - vec = 4 - if len(self.resolution) == 3: - file_list.append("remeshing/weights_builtin.cl") - if self.advectedFields[0].isVector: - file_list.append("remeshing/private_vector_3d.cl") - else: - file_list.append("remeshing/private.cl") - if self.gpu_precision is PARMES_REAL_GPU: - with_noBC = False - else: - file_list.append("remeshing/weights_noVec_builtin.cl") - if self.advectedFields[0].isVector: - file_list.append("remeshing/basic_vector_2d.cl") - else: - file_list.append("remeshing/basic.cl") - if self.gpu_precision is PARMES_REAL_GPU: - vec = 8 - file_list.append("advection/builtin.cl") - if self.advectedFields[0].nbComponents == 3: - file_list.append("kernels/advection_and_remeshing_vector_3d.cl") - elif self.advectedFields[0].nbComponents == 2: - file_list.append("kernels/advection_and_remeshing_vector_2d.cl") - else: - file_list.append("kernels/advection_and_remeshing.cl") + src, is_noBC, vec, f_space = \ + self.cl_env.kernels_config[self.dim][self.gpu_precision]['advec_and_remesh'] + gwi, lwi = f_space(self.resolution, vec) + WINb = lwi[0] build_options += " -D FORMULA=" + self.method.upper() - if with_noBC: + if is_noBC: build_options += " -D WITH_NOBC=1" - WINb, gwi, lwi = self.cl_env.get_WorkItems(self.resolution, vec) build_options_d = " -D WI_NB=" + str(WINb) build_options_d += " -D PART_NB_PER_WI=" build_options_d += str(self.resolution[0] / WINb) build_options_d += self.cl_env.default_sizes_constants[self.dir] ## Build code prg = self.cl_env.build_src( - file_list, build_options + build_options_d, vec) + src, build_options + build_options_d, vec) self.num_advec_and_remesh = KernelLauncher( prg.advection_and_remeshing, self.cl_env.queue, gwi, lwi) @@ -240,7 +217,7 @@ class GPUParticleAdvection1k(GPUParticleAdvection): def finalize(self): if self.num_advec_and_remesh.f_timer is not None: for f_timer in self.num_advec_and_remesh.f_timer: - self.timer.addFunctionTimer(f_timer) + self.kernels_timer.addFunctionTimer(f_timer) GPUParticleAdvection.finalize(self) def __str__(self): diff --git a/HySoP/hysop/gpu/gpu_particle_advection_2k.py b/HySoP/hysop/gpu/gpu_particle_advection_2k.py index f3ec47922..02037b03b 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection_2k.py +++ b/HySoP/hysop/gpu/gpu_particle_advection_2k.py @@ -62,13 +62,12 @@ class GPUParticleAdvection2k(GPUParticleAdvection): Allocates a buffer for storing particle positions. """ ## Velocity. - GPUDiscreteField.fromField(self.cl_env.queue, self.velocity, - self.gpu_precision) + GPUDiscreteField.fromField( self.cl_env.queue,self.velocity, + self.gpu_precision) ## Transported field. 
- GPUDiscreteField.fromField(self.cl_env.queue, - self.advectedFields[0], - self.gpu_precision, - layout=False) + GPUDiscreteField.fromField( self.cl_env.queue,self.advectedFields[0], + self.gpu_precision, + layout=False) ## Particle position if self.part_position is None: self.part_position = \ @@ -76,9 +75,8 @@ class GPUParticleAdvection2k(GPUParticleAdvection): name="Particle_Position", isVector=False ).discretize(self.advectedFields[0].topology) - GPUDiscreteField.fromField( - self.cl_env.queue, self.part_position, - self.gpu_precision) + GPUDiscreteField.fromField( self.cl_env.queue,self.part_position, + self.gpu_precision) ## Result scalar if self.part_advectedFields is None: self.part_advectedFields = [ @@ -86,10 +84,8 @@ class GPUParticleAdvection2k(GPUParticleAdvection): name="Particle_AdvectedFields", isVector=self.advectedFields[0].isVector ).discretize(self.advectedFields[0].topology)] - GPUDiscreteField.fromField( - self.cl_env.queue, self.part_advectedFields[0], - self.gpu_precision, - layout=False) + GPUDiscreteField.fromField( self.cl_env.queue,self.part_advectedFields[0], + self.gpu_precision, layout=False) self.variables = [self.advectedFields[0], self.velocity, self.part_position, self.part_advectedFields[0]] @@ -183,31 +179,17 @@ class GPUParticleAdvection2k(GPUParticleAdvection): </table> """ build_options = self.build_options - file_list = ["common.cl"] - with_noBC = False - if len(self.resolution) == 3: - file_list.append("advection/builtin.cl") - file_list.append("kernels/advection.cl") - if self.gpu_precision is PARMES_REAL_GPU: - vec = 4 - else: - vec = 2 - else: - if self.gpu_precision is PARMES_REAL_GPU: - file_list.append("advection/builtin.cl") - file_list.append("kernels/advection.cl") - vec = 4 - else: - file_list.append("advection/builtin_noVec.cl") - file_list.append("kernels/advection_noVec.cl") - vec = 1 - if with_noBC: + src, is_noBC, vec, f_space = \ + self.cl_env.kernels_config[self.dim][self.gpu_precision]['advec'] + gwi, lwi = f_space(self.resolution, vec) + WINb = lwi[0] + + if is_noBC: build_options += " -D WITH_NOBC=1" - WINb, gwi, lwi = self.cl_env.get_WorkItems(self.resolution, vec) build_options_d = " -D WI_NB=" + str(WINb) ## Build code prg = self.cl_env.build_src( - file_list, + src, build_options_d + self.cl_env.default_sizes_constants[self.dir], vec) self.num_advec = KernelLauncher( @@ -215,40 +197,21 @@ class GPUParticleAdvection2k(GPUParticleAdvection): ## remeshing build_options = self.build_options - file_list = ["common.cl"] - vec = 4 - if len(self.resolution) == 3: - file_list.append("remeshing/weights_builtin.cl") - if self.advectedFields[0].isVector: - file_list.append("remeshing/private_vector_3d.cl") - else: - file_list.append("remeshing/private.cl") - with_noBC = False - else: - file_list.append("remeshing/weights_noVec_builtin.cl") - if self.advectedFields[0].isVector: - file_list.append("remeshing/basic_vector_2d.cl") - else: - file_list.append("remeshing/basic.cl") - with_noBC = True - if self.advectedFields[0].nbComponents == 3: - file_list.append("kernels/remeshing_vector_3d.cl") - elif self.advectedFields[0].nbComponents == 2: - file_list.append("kernels/remeshing_vector_2d.cl") - else: - file_list.append("kernels/remeshing.cl") + src, is_noBC, vec, f_space = \ + self.cl_env.kernels_config[self.dim][self.gpu_precision]['remesh'] + gwi, lwi = f_space(self.resolution, vec) + WINb = lwi[0] + build_options += " -D FORMULA=" + self.method.upper() - if with_noBC: + if is_noBC: build_options += " -D WITH_NOBC=1" - WINb, gwi, lwi 
= self.cl_env.get_WorkItems( - self.resolution, vec) build_options_d = " -D WI_NB=" + str(WINb) build_options_d += " -D PART_NB_PER_WI=" build_options_d += str(self.resolution[0] / WINb) build_options_d += self.cl_env.default_sizes_constants[self.dir] ## Build code prg = self.cl_env.build_src( - file_list, build_options + build_options_d, vec) + src, build_options + build_options_d, vec) self.num_remesh = KernelLauncher( prg.remeshing_kernel, self.cl_env.queue, gwi, lwi) @@ -315,9 +278,9 @@ class GPUParticleAdvection2k(GPUParticleAdvection): def finalize(self): if self.num_advec.f_timer is not None: for f_timer in self.num_advec.f_timer: - self.timer.addFunctionTimer(f_timer) + self.kernels_timer.addFunctionTimer(f_timer) for f_timer in self.num_remesh.f_timer: - self.timer.addFunctionTimer(f_timer) + self.kernels_timer.addFunctionTimer(f_timer) GPUParticleAdvection.finalize(self) diff --git a/HySoP/hysop/gpu/kernel_benchmark.py b/HySoP/hysop/gpu/kernel_benchmark.py index 93333bf55..4f2212695 100644 --- a/HySoP/hysop/gpu/kernel_benchmark.py +++ b/HySoP/hysop/gpu/kernel_benchmark.py @@ -3,7 +3,8 @@ Package for benchmarking OpenCL kernels. """ -import pyopencl as cl +from parmepy.gpu import PARMES_DOUBLE_GPU, cl +from parmepy.gpu.tools import get_opencl_environment import numpy as np import pickle @@ -11,21 +12,26 @@ import pickle class BenchmarkSuite: """Benchark suite management""" - def __init__(self, sizes, kernel_name, - versions, configs, test=False, true_res=None, arg_to_test=0, + def __init__(self, sizes, name, + kernels, configs, versions, setupFunction, + test=False, true_res=None, arg_to_test=0, inputs={}, file_name="Benchmarks_data", - precision='SP', nb_run=20): + precision=PARMES_DOUBLE_GPU, nb_run=20): """ Creates a benchmak suite, that consists in a list of Benchmark. @param sizes : list of different problem sizes to benchmark. - @param kernel_name : name of the kernel to benchmark. - @param versions : list of tuples containing kernel versions - (kernel sources, kernel OpenCL name ). + @param name : name of the kernel to benchmark. + @param kernels : list of tuples containing kernel versions + (kernel sources file, OpenCL kernel name ). @param configs : dictionary of configurations. keys are kernel OpenCL name, - values are tuples containing (compilation flags function, kernel - arguments settings, config name, condition related to problem size) + values are tuples containing (kernel parameters vectorization + and identifier in last position). + @param versions : kernel versions to bench (used as dictionaries keys) + @param setupFunction : Function that returns building options and + kernel arguments (assuming arrays are numpy arrays) depending on a + given config, size and input dictionary. @param test : by default no results tests are performed @param true_res : function to compute true results @param arg_to_test : index of kernel arguments that contains result @@ -39,20 +45,22 @@ class BenchmarkSuite: If no such file, a new database is created. 
""" self.pickle_file_name = file_name - if precision == 'SP': - self.pickle_file_name += '_SP' - else: + if precision == PARMES_DOUBLE_GPU: self.pickle_file_name += '_DP' + else: + self.pickle_file_name += '_SP' self.sizes = sizes self.versions = versions + self.kernels_files = kernels self.configs = configs self.inputs = inputs self.test = test self.compute_true_res = true_res self.arg_to_test = arg_to_test - self.kernel_name = kernel_name + self.kernel_name = name self.precision = precision self.nb_run = nb_run + self.setupFunction = setupFunction if not self.test: try: @@ -65,6 +73,8 @@ class BenchmarkSuite: print 'start new database' self.timings = {} else: + assert not true_res is None + assert arg_to_test >= 2 self.timings = {} self.complete_timings() @@ -77,16 +87,11 @@ class BenchmarkSuite: if self.kernel_name not in self.timings.keys(): self.timings[self.kernel_name] = {} for v in self.versions: - if not v[1] in self.timings[self.kernel_name].keys(): - self.timings[self.kernel_name][v[1]] = {} - if isinstance(self.configs, dict): - for c in self.configs[v[1]]: - if not c[2] in self.timings[self.kernel_name][v[1]].keys(): - self.timings[self.kernel_name][v[1]][c[2]] = {} - else: - for c in self.configs: - if not c[2] in self.timings[self.kernel_name][v[1]].keys(): - self.timings[self.kernel_name][v[1]][c[2]] = {} + if not v in self.timings[self.kernel_name].keys(): + self.timings[self.kernel_name][v] = {} + for c in self.configs[v]: + if not c[-1] in self.timings[self.kernel_name][v].keys(): + self.timings[self.kernel_name][v][c[-1]] = {} def launch(self): """ @@ -101,34 +106,19 @@ class BenchmarkSuite: self.true_res = {} self.compute_true_res(self.sizes, self.true_res, self.inputs) for v in self.versions: - if isinstance(self.configs, dict): - conf_list = self.configs[v[1]] - else: - conf_list = self.configs + conf_list = self.configs[v] for conf in conf_list: - try: - allowed_size = conf[3] - except IndexError: - allowed_size = None - if callable(conf[0]): - b = Benchmark( - v[0], v[1], self.sizes, - lambda s: "-D WIDTH=" + str(s[0]) + conf[0](s), - inputs=self.inputs, allowed_size=allowed_size, - precision=self.precision, nb_run=self.nb_run) - else: - b = Benchmark( - v[0], v[1], self.sizes, - lambda s: "-D WIDTH=" + str(s[0]) + conf[0], - inputs=self.inputs, allowed_size=allowed_size, - precision=self.precision, nb_run=self.nb_run) - b.kernelSetup = conf[1] - print self.kernel_name, v[1], conf[2] + b = Benchmark( + self.kernels_files[v][0], self.kernels_files[v][1], + self.sizes, conf, self.setupFunction, + inputs=self.inputs, + precision=self.precision, nb_run=self.nb_run) + print self.kernel_name, v, conf[-1] if self.test: b.test(self.true_res, self.arg_to_test) else: b.launch() - [self.timings[self.kernel_name][v[1]][conf[2]].__setitem__( + [self.timings[self.kernel_name][v][conf[-1]].__setitem__( t[0], t[1]) for t in b.timings.items()] pickle.dump( self.timings, open( @@ -208,9 +198,16 @@ class BenchmarkSuite: f.write(v + " ") f.write(c + " ") #print c - cs = c.replace( - 'Nx', str(s[0])).replace('x', '*').split('_') - cse = cs[0] + '/' + cs[1][1] if len(cs) == 2 else cs[0] + # Compute work-item number from configuration string: + # If config, start with 'wi=N', work-item number is set to N + # Else, it assume a configuration matching 'A[xB]+[_fn]?' + # It replace 'x' by '*' and divide by n. 
String is evaluated as python instruction + if c[0:3] == 'wi=': + cse = c.split('=')[1].split('_')[0] + else: + cs = c.replace( + 'Nx', str(s[0])).replace('x', '*').split('_') + cse = cs[0] + '/' + cs[1][1] if len(cs) == 2 else cs[0] #print cse f.write(str(eval(cse)) + ' ') try: @@ -243,92 +240,30 @@ def find_min(filename, kernel=None, version=None, config=None, size=None): class Benchmark: """Benchmark management""" - def __init__(self, code, kernel_name, sizes, build_opt, nb_run=20, - inputs=None, allowed_size=None, - precision='SP'): + def __init__(self, kernel_file, kernel_name, sizes, config, setupFunction, + nb_run=20, inputs=None, precision=PARMES_DOUBLE_GPU): """ - Creates a benchmark for a given source code, kernel for differnet - problem sizes. + Creates a benchmark for a given source kernel_file, kernel for + different problem sizes. - @param code : kernels source code as a string + @param kernel_file : kernels source file @param kernel_name : name of the kernel to benchmark as a string @param sizes : list of different problem sizes to launch kernel - @param build_opt : OpenCL compiler options + @param config : list of kernel parameters + @param setupFunction : Function that returns building options and + kernel arguments (assuming arrays are numpy arrays) depending on a + given config, size and input dictionary. @param nb_run : number of launches to average time (default = 20) @param inputs : input data - @param allowed_size : boolean function that allows benchmarks - regarding problem size (depends on configuration) @param precision : Floating point precision for kernels """ - #Get platform. - try: - ## OpenCL platform - self.platform = cl.get_platforms()[0] - except IndexError: - print " Incorrect platform_id :", platform_id, ".", - print " Only ", len(cl.get_platforms()), " available.", - print " Getting defalut platform. " - self.platform = cl.get_platforms()[0] - #Get device. - try: - ## OpenCL device - self.device = self.platform.get_devices(cl.device_type.GPU)[0] - except cl.RuntimeError as e: - print "RuntimeError:", e - self.device = cl.create_some_context().devices[0] - except AttributeError as e: - print "AttributeError:", e - self.device = cl.create_some_context().devices[0] - print "\n Benchmark on : ", self.platform.name, - print " with ", self.platform.version, - print " over : ", self.device.name, - print " with ", self.device.opencl_c_version - print " Precision capability ", - self.correctlyroundeddividesqrt = True - if precision == 'SP': - print "for Single Precision: ", - for v in ['DENORM', 'INF_NAN', - 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', - 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: - try: - if eval('(self.device.single_fp_config' + - ' & cl.device_fp_config.' + - v + ') == cl.device_fp_config.' + v): - print v, - except AttributeError as ae: - if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': - self.correctlyroundeddividesqrt = False - print '\n', v, 'is not supported in OpenCL C 1.2. ', - print 'Exception catched : ', ae - else: - print "for Double Precision: ", - if self.device.double_fp_config > 0: - for v in ['DENORM', 'INF_NAN', - 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', - 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', - 'SOFT_FLOAT']: - try: - if eval('(self.device.double_fp_config' + - ' & cl.device_fp_config.' + - v + ') == cl.device_fp_config.' + v): - print v, - except AttributeError as ae: - if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': - self.correctlyroundeddividesqrt = False - print '\n', v, 'is supported in OpenCL C 1.2. 
', - print 'Exception catched : ', ae - else: - raise ValueError("Double Precision is not supported by device") - print "" - #Creates GPU Context - ## OpenCL context - self.ctx = cl.Context([self.device]) - #Create CommandQueue on the GPU Context - ## OpenCL command queue - self.queue = cl.CommandQueue( - self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - ## OpenCL Source code - self.code = code + self.cl_env = get_opencl_environment(0, 0, 'gpu', precision, sizes[0]) + self.platform = self.cl_env.platform + self.device = self.cl_env.device + self.ctx = self.cl_env.ctx + self.queue = self.cl_env.queue + ## OpenCL Source kernel_file + self.kernel_file = kernel_file ## Kernel name self.kernel = kernel_name ## Compiled programs @@ -344,28 +279,39 @@ class Benchmark: ## Problems inputs self.inputs = inputs ## Function to test size - self.is_size_allowed = allowed_size self.precision = precision - if self.code is not None: + self.setupFunction = setupFunction + + self.setup = {} + if self.kernel_file is not None: for size in self.sizes: - if self.is_size_allowed is None or self.is_size_allowed(size): - build_options = "" # " -cl-opt-disable" - if self.correctlyroundeddividesqrt: - build_options += \ - " -cl-fp32-correctly-rounded-divide-sqrt " - if self.precision == 'SP': - prg = cl.Program( - self.ctx, self.code.replace('.0', '.0f')) - build_options += " -cl-single-precision-constant " - else: - prg = cl.Program( - self.ctx, self.code.replace('float', 'double')) - prg.build(build_options + build_opt(size)) - print prg.get_build_info( - self.device, cl.program_build_info.OPTIONS) - self.prg[size] = prg - ## Function to setup kernels arguments - self.kernelSetup = None + self.setup[size] = self.setupFunction(config, size, self.inputs) + if self.setup[size] is not None: + toDelete = False + #print np.prod(self.setup[size][1][1]), "WI (",self.device.max_work_group_size ," max )" + if np.prod(self.setup[size][1][1]) > self.device.max_work_group_size: + toDelete = True + global_mem_used=0 + for arg in self.setup[size][1]: + if isinstance(arg, np.ndarray) and \ + len(arg.shape) > 1: + #print "Alloc : ", arg.nbytes, "Bytes (", self.device.max_mem_alloc_size, "max)" + if arg.nbytes > self.device.max_mem_alloc_size: + toDelete = True + global_mem_used += arg.nbytes + #print "Total Alloc : ", global_mem_used, "Bytes (", self.device.global_mem_size, "max)" + if global_mem_used > self.device.global_mem_size: + toDelete = True + #print "Local Alloc : ", self.setup[size][2], "Bytes (", self.device.local_mem_size, "max)" + if self.setup[size][2] > self.device.local_mem_size: + toDelete = True + if toDelete: + self.setup[size] = None + if self.kernel_file is not None: + for size in self.sizes: + if self.setup[size] is not None: + self.prg[size] = self.cl_env.build_src( + kernel_file, self.setup[size][0], config[-2]) def test(self, true_res, ind_res): """ @@ -376,9 +322,9 @@ class Benchmark: """ print "Testing : " for size in self.sizes: - if self.is_size_allowed is None or self.is_size_allowed(size): + if self.setup[size] is not None: kernel = eval('self.prg[size].' 
+ self.kernel) - kernelArgs = self.kernelSetup(size, self.inputs) + kernelArgs = self.setup[size][1] res = np.empty_like(kernelArgs[ind_res]) mem_used = 0 for i in xrange(len(kernelArgs)): @@ -408,10 +354,10 @@ class Benchmark: res = res[:size[0], :size[1], :size[2]] else: res = res[:size[0], :size[1]] - if self.precision == 'SP': - exp = 6 - else: + if self.precision == PARMES_DOUBLE_GPU: exp = 15 + else: + exp = 6 np.testing.assert_array_almost_equal( res, true_res[size], decimal=exp) print 'Ok' @@ -442,41 +388,41 @@ class Benchmark: """ print "\nRunning : " for size in self.sizes: - if self.is_size_allowed is None or self.is_size_allowed(size): - print size, + print size, + if self.setup[size] is not None: kernel = eval('self.prg[size].' + self.kernel) - if not self.kernelSetup is None: - kernelArgs = self.kernelSetup(size, self.inputs) - mem_used = 0 - for i in xrange(len(kernelArgs)): - if isinstance(kernelArgs[i], np.ndarray) and \ - len(kernelArgs[i].shape) > 1: - buff = cl.Buffer( - self.ctx, cl.mem_flags.READ_WRITE, - size=kernelArgs[i].nbytes) - cl.enqueue_copy(self.queue, buff, kernelArgs[i]) - mem_used += kernelArgs[i].nbytes - kernelArgs[i] = buff - print "Memory used : {0:.5f} GiB ({1:.2f}%)".format( - mem_used / (1024. ** 3), - 100. * mem_used / (self.device.global_mem_size * 1.), - mem_used / (1024. ** 3)) - self.queue.finish() - print kernelArgs[0:2] + kernelArgs = self.setup[size][1] + mem_used = 0 + for i in xrange(len(kernelArgs)): + if isinstance(kernelArgs[i], np.ndarray) and \ + len(kernelArgs[i].shape) > 1: + buff = cl.Buffer( + self.ctx, cl.mem_flags.READ_WRITE, + size=kernelArgs[i].nbytes) + cl.enqueue_copy(self.queue, buff, kernelArgs[i]) + mem_used += kernelArgs[i].nbytes + kernelArgs[i] = buff + print "Memory used : {0:.5f} GiB ({1:.2f}%)".format( + mem_used / (1024. ** 3), + 100. * mem_used / (self.device.global_mem_size * 1.), + mem_used / (1024. ** 3)), + self.queue.finish() + print kernelArgs[0:2], + evt = kernel(self.queue, *tuple(kernelArgs)) + self.queue.finish() + evts = [] + for i in xrange(self.nb_run): evt = kernel(self.queue, *tuple(kernelArgs)) + evts.append(evt) self.queue.finish() - evts = [] - for i in xrange(self.nb_run): - evt = kernel(self.queue, *tuple(kernelArgs)) - evts.append(evt) - self.queue.finish() - time = 0. - for evt in evts: - time += (evt.profile.end - evt.profile.start) * 1e-9 - self.timings[size] = time / self.nb_run - self.kernel_args[size] = kernelArgs[0:2] - print self.timings[size], "args : ", kernelArgs[0:2] - for i in xrange(len(kernelArgs)): - if isinstance(kernelArgs[i], cl.MemoryObject): - kernelArgs[i].release() - + time = 0. + for evt in evts: + time += (evt.profile.end - evt.profile.start) * 1e-9 + self.timings[size] = time / self.nb_run + self.kernel_args[size] = kernelArgs[0:2] + print self.timings[size] + for i in xrange(len(kernelArgs)): + if isinstance(kernelArgs[i], cl.MemoryObject): + kernelArgs[i].release() + else: + print "Incompatible sizes" diff --git a/HySoP/hysop/gpu/tools.py b/HySoP/hysop/gpu/tools.py index 674010ea1..affed92bf 100644 --- a/HySoP/hysop/gpu/tools.py +++ b/HySoP/hysop/gpu/tools.py @@ -5,7 +5,8 @@ Tools for gpu management. 
""" from parmepy import __VERBOSE__ from parmepy.constants import S_DIR -from parmepy.gpu import cl, PARMES_REAL_GPU, GPU_SRC, CL_PROFILE +from parmepy.gpu import cl, PARMES_REAL_GPU, PARMES_DOUBLE_GPU, \ + GPU_SRC, CL_PROFILE ## Global variable handling an OpenCL Environment instance __cl_env = None @@ -40,7 +41,7 @@ class OpenCLEnvironment(object): ## Floating point precision self.precision = precision self.macros = {} - self.default_build_opts = "" + self.default_build_opts = "-Werror" if not self.precision is None: self.default_build_opts += self._get_precision_opts() if resolution is not None: @@ -61,6 +62,19 @@ class OpenCLEnvironment(object): else: self.default_sizes_constants = None + print self.device.name + ## Kernels configuration dictionary + if self.device.name == "Cayman": + from config_cayman import kernels_config as kernel_cfg + elif self.device.name == "Tesla K20m": + from config_k20m import kernels_config as kernel_cfg + else: + # from config_default import kernels_config as kernel_cfg + raise ValueError( + "Unknown device. Default configuration is not yet implemented") + self.kernels_config = kernel_cfg + + def modify(self, platform_id, device_id, device_type, precision, resolution, gl_sharing=False): """ @@ -129,7 +143,7 @@ class OpenCLEnvironment(object): >>> clenv = OpenCLEnvironment(0,0,'gpu', None, None) >>> clenv._get_platform(0) == cl.get_platforms()[0] True - >>> clenv._get_platform(-1) == cl.get_platforms()[0] + >>> clenv._get_platform(-1) == cl.get_platforms()[-1] True >>> \endcode @@ -282,48 +296,35 @@ class OpenCLEnvironment(object): # Precision supported if __VERBOSE__: print " Precision capability ", - capable = True + fp32_rounding_flag = True if self.precision is PARMES_REAL_GPU: opts += " -cl-single-precision-constant" - if __VERBOSE__: - print "for Single Precision: " - for v in ['DENORM', 'INF_NAN', - 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', - 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: - try: - if eval('(self.device.single_fp_config &' + - ' cl.device_fp_config.' + - v + ') == cl.device_fp_config.' + v): - if __VERBOSE__: - print v - except AttributeError as ae: - if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': - capable = False - if __VERBOSE__: - print v, 'is not supported in OpenCL C 1.2.\n', - print ' Exception catched : ', ae + prec = "single" else: - if __VERBOSE__: - print "for Double Precision: " - if self.device.double_fp_config > 0: - for v in ['DENORM', 'INF_NAN', 'FMA', - 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', - 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: - try: - if eval('(self.device.double_fp_config &' + - ' cl.device_fp_config.' + - v + ') == cl.device_fp_config.' + v): - if __VERBOSE__: - print v - except AttributeError as ae: - if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': - capable = False - if __VERBOSE__: - print v, 'is supported in OpenCL C 1.2.\n', - print ' Exception catched : ', ae - else: + if self.device.double_fp_config <= 0: raise ValueError("Double Precision is not supported by device") - if capable: + prec = "double" + if __VERBOSE__: + print "for " + prec + " Precision: " + for v in ['DENORM', 'INF_NAN', + 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', + 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: + try: + if eval('(self.device.' + prec + '_fp_config &' + + ' cl.device_fp_config.' + + v + ') == cl.device_fp_config.' 
+ v): + if __VERBOSE__: + print v + else: + if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': + fp32_rounding_flag = False + except AttributeError as ae: + if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': + fp32_rounding_flag = False + if __VERBOSE__: + print v, 'is not supported in OpenCL C 1.2.\n', + print ' Exception catched : ', ae + if fp32_rounding_flag: opts += " -cl-fp32-correctly-rounded-divide-sqrt" return opts @@ -338,6 +339,8 @@ class OpenCLEnvironment(object): Parse the sources to handle single and double precision. """ gpu_src = "" + if self.precision is PARMES_DOUBLE_GPU: + gpu_src += "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n" if isinstance(files, list): file_list = files else: @@ -370,6 +373,7 @@ class OpenCLEnvironment(object): for sf in file_list: print " - ", sf print "Build options : ", self.default_build_opts + options + print "Vectorization : ", vector_width raise e if __VERBOSE__: #print options diff --git a/HySoP/hysop/numerics/tests/__init__.py b/HySoP/hysop/numerics/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/HySoP/hysop/tools/timers.py b/HySoP/hysop/tools/timers.py index a3861eb69..418bfb2ee 100644 --- a/HySoP/hysop/tools/timers.py +++ b/HySoP/hysop/tools/timers.py @@ -6,6 +6,7 @@ thanks to decorator. """ from parmepy.mpi import MPI, main_rank from parmepy import __VERBOSE__ +ftime = MPI.Wtime def timed_function(f): @@ -57,9 +58,9 @@ class FunctionTimer(object): @return wrapped function. """ def f(*args, **kwargs): - t0 = MPI.Wtime() + t0 = ftime() res = func(*args, **kwargs) - t = MPI.Wtime() - t0 + t = ftime() - t0 self.t += t self.ncalls += 1 #self.times.append(t) @@ -121,9 +122,12 @@ class Timer(object): t._compute_summary() self.t += t.t for t_k in t.f_timers.keys(): - t.f_timers[t_k].func_name = \ - t._obj.name + "_" + t.f_timers[t_k].func_name - self.f_timers[t._obj.name + "_" + t_k] = t.f_timers[t_k] + if isinstance(t.f_timers[t_k], Timer): + self.f_timers[t_k] = t.f_timers[t_k].f_timers + else: + t.f_timers[t_k].func_name = \ + t._obj.name + "_" + t.f_timers[t_k].func_name + self.f_timers[t._obj.name + "_" + t_k] = t.f_timers[t_k] return self def getFunctionTimer(self, func): @@ -148,20 +152,35 @@ class Timer(object): """ self.f_timers[ft.func_name] = ft + def addSubTimer(self, t, prefix): + if len(t.f_timers.keys()) > 0: + self.f_timers[t._obj.name + prefix] = t + + def _compute_summary(self): """ - Copute a summary of the different functions referenced herein. + Compute a summary of the different functions referenced herein. """ self.t = 0. for f in self.f_timers.keys(): - self.t += self.f_timers[f].t + try: + self.t += self.f_timers[f].t + except AttributeError: + # self.f_timers[f] is a sub-Timer times are details of + # functions times. these times are already in sum. 
+ pass def __str__(self): self._compute_summary() s = "[" + str(main_rank) + "] Time spent in " s += self._obj.name + ' = ' + str(self.t) + ' s' for f in sorted(self.f_timers.keys()): - s += "\n" + str(self.f_timers[f]) + if isinstance(self.f_timers[f], dict): + s += "\n[" + str(main_rank) + "] |-- " + f + for sf in sorted(self.f_timers[f].keys()): + s += "\n" + str(self.f_timers[f][sf]).replace('|--', ' |--') + else: + s += "\n" + str(self.f_timers[f]) return s def toString(self): @@ -171,7 +190,12 @@ class Timer(object): """ h = self._obj.name.replace(' ', '_') + ' ' t = str(self.t) + ' ' - l = sorted(self.f_timers.keys()) - h += ' '.join(l) + ' ' - t += ' '.join([str(self.f_timers[k].t) for k in l]) + ' ' + for f in sorted(self.f_timers.keys()): + if isinstance(self.f_timers[f], dict): + for sf in sorted(self.f_timers[f].keys()): + h += (f + sf).replace(' ', '_') + ' ' + t += str(self.f_timers[f][sf].t) + ' ' + else: + h += f.replace(' ', '_') + ' ' + t += str(self.f_timers[f].t) + ' ' return h, t diff --git a/HySoP/setup.py.in b/HySoP/setup.py.in index 436340c16..2db7c5451 100644 --- a/HySoP/setup.py.in +++ b/HySoP/setup.py.in @@ -30,6 +30,7 @@ packages_for_tests = ['parmepy.domain.tests', 'parmepy.operator.tests', 'parmepy.tools.tests', 'parmepy.problem.tests', + 'parmepy.numerics.tests', ] if("@USE_MPI@" is "ON"): -- GitLab