From 75018cdbea82f21774ecbc9126c6efa30e3a24ad Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr> Date: Fri, 25 Jul 2014 15:25:19 +0200 Subject: [PATCH] Bring some optimisations for single GPU --- .gitignore | 3 +- HySoP/hysop/.pyflymakercc | Bin 267 -> 0 bytes HySoP/hysop/gpu/cl_src/advection/basic_rk2.cl | 5 +- .../gpu/cl_src/advection/basic_rk2_noVec.cl | 6 +- HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl | 5 +- .../gpu/cl_src/advection/basic_rk4_noVec.cl | 7 +- .../hysop/gpu/cl_src/advection/builtin_rk2.cl | 5 +- .../gpu/cl_src/advection/builtin_rk2_noVec.cl | 7 +- .../hysop/gpu/cl_src/advection/builtin_rk4.cl | 5 +- .../gpu/cl_src/advection/builtin_rk4_noVec.cl | 6 +- .../cl_src/advection/comm_basic_rk2_noVec.cl | 74 ++++ .../advection/comm_builtin_rk2_noVec.cl | 73 ++++ .../gpu/cl_src/advection/velocity_cache.cl | 388 +++++------------- .../cl_src/advection/velocity_cache_noVec.cl | 7 +- HySoP/hysop/gpu/cl_src/common.cl | 19 + HySoP/hysop/gpu/cl_src/kernels/advection.cl | 2 +- .../cl_src/kernels/advection_and_remeshing.cl | 6 +- .../kernels/advection_and_remeshing_noVec.cl | 6 +- .../advection_and_remeshing_vector_2d.cl | 83 ---- .../advection_and_remeshing_vector_3d.cl | 90 ---- .../gpu/cl_src/kernels/advection_noVec.cl | 2 +- .../cl_src/kernels/comm_MS_advection_noVec.cl | 261 ++++++++++++ .../cl_src/kernels/comm_advection_noVec.cl | 161 ++++++++ .../gpu/cl_src/kernels/comm_diffusion.cl | 150 +++++++ .../cl_src/kernels/comm_remeshing_noVec.cl | 253 ++++++++++++ HySoP/hysop/gpu/cl_src/kernels/diffusion.cl | 119 ++++++ .../gpu/cl_src/kernels/minmax_buffers.cl | 84 ++++ HySoP/hysop/gpu/cl_src/kernels/remeshing.cl | 4 +- .../gpu/cl_src/kernels/remeshing_noVec.cl | 4 +- HySoP/hysop/gpu/cl_src/remeshing/basic.cl | 5 +- .../hysop/gpu/cl_src/remeshing/basic_noVec.cl | 5 +- .../cl_src/remeshing/basic_noVec_vector_2d.cl | 111 ----- .../cl_src/remeshing/basic_noVec_vector_3d.cl | 121 ------ 
.../gpu/cl_src/remeshing/basic_vector_2d.cl | 111 ----- .../gpu/cl_src/remeshing/basic_vector_3d.cl | 121 ------ .../gpu/cl_src/remeshing/comm_basic_noVec.cl | 124 ++++++ HySoP/hysop/gpu/cl_src/remeshing/private.cl | 5 +- .../gpu/cl_src/remeshing/private_noVec.cl | 5 +- .../gpu/cl_src/remeshing/private_vector_2d.cl | 112 ----- .../gpu/cl_src/remeshing/private_vector_3d.cl | 122 ------ HySoP/hysop/gpu/cl_src/remeshing/weights.cl | 168 ++++---- .../gpu/cl_src/remeshing/weights_builtin.cl | 171 ++++---- .../gpu/cl_src/remeshing/weights_noVec.cl | 171 ++++---- .../cl_src/remeshing/weights_noVec_builtin.cl | 169 ++++---- HySoP/hysop/gpu/config_default.py | 5 + HySoP/hysop/gpu/config_k20m.py | 5 + HySoP/hysop/gpu/gpu_operator.py | 2 +- HySoP/hysop/gpu/tools.py | 78 +++- HySoP/hysop/operator/monitors/printer.py | 2 +- 49 files changed, 1880 insertions(+), 1568 deletions(-) delete mode 100644 HySoP/hysop/.pyflymakercc create mode 100644 HySoP/hysop/gpu/cl_src/advection/comm_basic_rk2_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/advection/comm_builtin_rk2_noVec.cl delete mode 100644 HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_2d.cl delete mode 100644 HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_3d.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/comm_advection_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/diffusion.cl create mode 100644 HySoP/hysop/gpu/cl_src/kernels/minmax_buffers.cl delete mode 100644 HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl delete mode 100644 HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl delete mode 100644 HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl delete mode 100644 HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl create mode 100644 
HySoP/hysop/gpu/cl_src/remeshing/comm_basic_noVec.cl delete mode 100644 HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl delete mode 100644 HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl diff --git a/.gitignore b/.gitignore index 2af93fbc7..97b084e5e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.pyc -parmepy/__init__.py \ No newline at end of file +parmepy/__init__.py +parmepy/.pyflymakercc diff --git a/HySoP/hysop/.pyflymakercc b/HySoP/hysop/.pyflymakercc deleted file mode 100644 index e53a15c69251dff9ade2f58f5c149de02b3225f2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 267 zcmZSn%**v=^Tgm}1}I<#(hfjeECVD`7#PwS8KQs;Murq7h7@LobY_MqAe)6DSc4U) zpo9fTxEh(6mVkIc29}1#FqVN4jAdl#r@;hLEC?h#-TnN7T;rYnU0j3X16%_vN`x^) zBYiymLNtKF3<#pw70A^OElw>e)^{sP%uCMJ56(|3Dc1K*Ey>7FNi7aYEGo%NF3m~I yEJ`ib2l8@(Q~`)8sMOahs7%YL%uUQrElMuo0Geczo1apelWGTYb}`6toQwb$-8%*V diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk2.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk2.cl index 2068ac88c..39b7639b7 100644 --- a/HySoP/hysop/gpu/cl_src/advection/basic_rk2.cl +++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk2.cl @@ -3,7 +3,7 @@ * Advection function, vectorized version, no use of builtins functions. */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -22,7 +22,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
* @see parmepy.gpu.tools.parse_file */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float__N__ v, /* Velocity at point */ vp, /* Velocity at right point */ @@ -34,6 +34,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l c = (float__N__)((i+__NN__)*dx, ); + c = c + min_position; #if V_NB_I == NB_I // single-scale: diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk2_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk2_noVec.cl index 9f19c4c74..d805563e2 100644 --- a/HySoP/hysop/gpu/cl_src/advection/basic_rk2_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk2_noVec.cl @@ -3,7 +3,7 @@ * Advection function, basic version */ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -19,12 +19,12 @@ float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local * * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
*/ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float v, /* Velocity at point */ vp, /* Velocity at right point */ p, /* Normalized intermediary position */ - c = i * dx, /* initial coordinate */ + c = i * dx + min_position, /* initial coordinate */ hdt = 0.5 * dt; /* half time step */ int i_ind, /* Interpolation left point */ i_ind_p; /* Interpolation right point */ diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl index 10a19921d..cc50ed3ce 100644 --- a/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl +++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk4.cl @@ -3,7 +3,7 @@ * Advection function (RK4 scheme), vectorized version, no use of builtins functions. */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -22,7 +22,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
* @see parmepy.gpu.tools.parse_file */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float__N__ v, /* Velocity at point */ vp, /* Velocity at right point */ @@ -36,6 +36,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l c = (float__N__)((i+__NN__)*dx, ); + c = c + min_position; //k1 = f(t,y) //k2 = f(t + dt/2, y + dt/2 * k1) diff --git a/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl index 1314370e8..dbbe1f395 100644 --- a/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/advection/basic_rk4_noVec.cl @@ -3,7 +3,7 @@ * Advection function (RK4 scheme), basic version */ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -19,14 +19,14 @@ float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local * * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
*/ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float v, /* Velocity at point */ vp, /* Velocity at right point */ p, /* Intermediary position */ k, /* rk averaged velocity */ kn, /* rk intermediate velocity */ - c = i * dx, /* initial coordinate */ + c = i * dx + min_position, /* initial coordinate */ hdt = 0.5 * dt; /* half time step */ int i_ind, /* Interpolation left point */ i_ind_p; /* Interpolation right point */ @@ -91,4 +91,3 @@ float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local /* - 3 iterpolation = 3 * 9 */ /* - velocity weights = 5*/ /* Total = 41 */ - diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk2.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk2.cl index 958cdd3d8..0a7ac4ac5 100644 --- a/HySoP/hysop/gpu/cl_src/advection/builtin_rk2.cl +++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk2.cl @@ -3,7 +3,7 @@ * Advection function, vectorized version. */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -22,7 +22,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
* @see parmepy.gpu.tools.parse_file */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float__N__ v, /* Velocity at point */ vp, /* Velocity at right point */ @@ -34,6 +34,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l c = (float__N__)((i+__NN__)*dx, ); + c = c + min_position; #if V_NB_I == NB_I // single-scale: diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk2_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk2_noVec.cl index 76712a015..5e4e8dcd1 100644 --- a/HySoP/hysop/gpu/cl_src/advection/builtin_rk2_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk2_noVec.cl @@ -3,7 +3,7 @@ * Advection function, basic version */ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -19,11 +19,11 @@ float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local * * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
*/ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float v, /* Velocity at point */ p, /* Intermediary position */ - c = i * dx, /* initial coordinate */ + c = i * dx + min_position, /* initial coordinate */ hdt = 0.5 * dt; /* half time step */ int i_ind, /* Interpolation left point */ i_ind_p; /* Interpolation right point */ @@ -58,4 +58,3 @@ float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local /* - 2 positions = 2 * fma */ /* - 1 iterpolation = 6 + 1 * mix */ /* Total = 2 fma + 1 mix + 6 */ - diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl index 0725c4148..18b5366cb 100644 --- a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl +++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4.cl @@ -3,7 +3,7 @@ * Advection function, vectorized version. */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -22,7 +22,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
* @see parmepy.gpu.tools.parse_file */ -float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float__N__ v, /* Velocity at point */ vp, /* Velocity at right point */ @@ -36,6 +36,7 @@ float__N__ advection(uint i, float dt, float dx, float invdx, float v_invdx, __l c = (float__N__)((i+__NN__)*dx, ); + c = c + min_position; #if V_NB_I == NB_I // single-scale: diff --git a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl index c5b786c55..bd92b34a5 100644 --- a/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/advection/builtin_rk4_noVec.cl @@ -3,7 +3,7 @@ * Advection function, basic version */ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache); +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); /** @@ -19,12 +19,12 @@ float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local * * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
*/ -float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache) +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) { float p, /* Intermediary position */ k, /* rk averaged velocity */ kn, /* rk intermediate velocity */ - c = i * dx, /* initial coordinate */ + c = i * dx + min_position, /* initial coordinate */ hdt = 0.5 * dt; /* half time step */ int i_ind, /* Interpolation left point */ i_ind_p; /* Interpolation right point */ diff --git a/HySoP/hysop/gpu/cl_src/advection/comm_basic_rk2_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/comm_basic_rk2_noVec.cl new file mode 100644 index 000000000..fcb2dff72 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/comm_basic_rk2_noVec.cl @@ -0,0 +1,74 @@ +/** + * @file advection/comm_basic_noVec.cl + * Advection function, basic version, mpi communications on the host side + */ + +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. + * + * @param i Particle index. + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param velocity_cache Local velocity field. + * @return Particle position + * + * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. 
+ * @remark T_NB_I: global points number in the 1st direction (mpi cutted direction) + * @remark START_INDEX Global staring index for computational points + * @remark STOP_INDEX Global stop index for computational points + */ +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) +{ + float v, /* Velocity at point */ + vp, /* Velocity at right point */ + p, /* Normalized intermediary position */ + c = i * dx + min_position, /* initial coordinate */ + hdt = 0.5 * dt; /* half time step */ + int i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + +#if (V_NB_I-2*V_GHOSTS_NB) == NB_I + // single-scale: + v = velocity_cache[noBC_id(i + V_GHOSTS_NB)]; /* k = k1 */ +#else + // multi-scale : interpolate v from velocity buffer (of length V_NB_I) + p = c * v_invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB); + i_ind_p = i_ind + 1; + v = mix(velocity_cache[noBC_id(i_ind)], + velocity_cache[noBC_id(i_ind_p)],p); +#endif + p = (c + hdt*v) * v_invdx; + + i_ind = convert_int_rtn(p); + if( i_ind>=(V_START_INDEX-MS_INTERPOL_SHIFT) && i_ind < (V_STOP_INDEX-V_GHOSTS_NB)) + { + p = p - convert_float(i_ind); + + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB); + i_ind_p = i_ind + 1; + + v = velocity_cache[noBC_id(i_ind)]; + vp = velocity_cache[noBC_id(i_ind_p)]; + v = (p*(vp-v) + v); + + p = c + dt * v; + } + else + { + p = (1000*T_NB_I)*1.0 + p; + } + + return p; +} +/* Operations number : */ +/* - 2 positions = 2 * 2 */ +/* - 1 iterpolation = 9 */ +/* Total = 13 */ diff --git a/HySoP/hysop/gpu/cl_src/advection/comm_builtin_rk2_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/comm_builtin_rk2_noVec.cl new file mode 100644 index 000000000..66e1eff53 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/advection/comm_builtin_rk2_noVec.cl @@ -0,0 +1,73 @@ +/** + * @file comm_builtin_noVec.cl + * Advection function, basic version, mpi 
communications on the host side + */ + +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position); + + +/** + * Compute the position of a particle with a RK2 integration scheme. Velocity is linearly interpolated from the global field. + * Use of builtin OpenCL functions fma and mix. + * + * @param i Particle index (without velocity ghosts considering). + * @param dt Time step. + * @param dx Space step. + * @param invdx 1/dx. + * @param velocity_cache Local velocity field. + * @return Particle position + * + * @remark NB_I, NB_II, NB_III : points number in directions from 1st varying index to last. + * @remark T_NB_I: global points number in the 1st direction (mpi cutted direction) + * @remark START_INDEX Global staring index for computational points + * @remark STOP_INDEX Global stop index for computational points + */ +float advection(uint i, float dt, float dx, float invdx, float v_invdx, __local float* velocity_cache, float min_position) +{ + float v, /* Velocity at point */ + p, /* Intermediary position */ + c = i * dx + min_position, /* initial coordinate */ + hdt = 0.5 * dt; /* half time step */ + int i_ind, /* Interpolation left point */ + i_ind_p; /* Interpolation right point */ + +#if (V_NB_I-2*V_GHOSTS_NB) == NB_I + // single scale: + v = velocity_cache[noBC_id(i + V_GHOSTS_NB)]; +#else + // multi-scale : interpolate v from velocity buffer (of length V_NB_I) + p = c * v_invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB); + i_ind_p = i_ind + 1; + v = mix(velocity_cache[noBC_id(i_ind)], + velocity_cache[noBC_id(i_ind_p)],p); +#endif + + p = fma(hdt, v, c) * v_invdx; + i_ind = convert_int_rtn(p); + if( i_ind>=(V_START_INDEX-MS_INTERPOL_SHIFT) && i_ind < (V_STOP_INDEX-V_GHOSTS_NB)) + { + p = p - convert_float(i_ind); + + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB); + i_ind_p = i_ind + 1; + + v = mix(velocity_cache[noBC_id(i_ind)], + 
velocity_cache[noBC_id(i_ind_p)],p); + + p = fma(dt, v, c); + } + else + { + p = (1000*T_NB_I)*1.0 + p; + } + + return p; +} +/* Operations number : */ +/* - 2 positions = 2 * fma */ +/* - 1 iterpolation = 6 + 1 * mix */ +/* Total = 2 fma + 1 mix + 6 */ + diff --git a/HySoP/hysop/gpu/cl_src/advection/velocity_cache.cl b/HySoP/hysop/gpu/cl_src/advection/velocity_cache.cl index 9ecd87b49..bfe728c28 100644 --- a/HySoP/hysop/gpu/cl_src/advection/velocity_cache.cl +++ b/HySoP/hysop/gpu/cl_src/advection/velocity_cache.cl @@ -3,46 +3,6 @@ void fill_velocity_cache(__global const float* gvelo, float4 dx, float4 v_dx, __local float* gvelo_loc); - -inline float alpha_l2_1(float y){ - return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} -inline float beta_l2_1(float y){ - return ((y * y * (3.0 * y - 5.0) + 2.0) / 2.0);} -inline float gamma_l2_1(float y){ - return ((y * (y * (-3.0 * y + 4.0) + 1.0)) / 2.0);} -inline float delta_l2_1(float y){ - return ((y * y * (y - 1.0)) / 2.0);} - - -inline float alpha_l4_2(float y){ - return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) / 24.0);} -inline float beta_l4_2(float y){ - return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) / 24.0);} -inline float gamma_l4_2(float y){ - return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) / 24.0);} -inline float delta_l4_2(float y){ - return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) / 24.0);} -inline float eta_l4_2(float y){ - return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) / 24.0);} -inline float zeta_l4_2(float y){ - return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) / 24.0);} - - -inline float alpha_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) / 24.0);} -inline float beta_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) 
/ 24.0);} -inline float gamma_l4_4(float y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) / 24.0);} -inline float delta_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) / 24.0);} -inline float eta_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) / 24.0);} -inline float zeta_l4_4(float y){ - return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) / 24.0);} - -/*** TODO: correct this file to work properly with vector enable remehsing weights ***/ - void fill_velocity_cache(__global const float* gvelo, uint gidX, uint gidY, uint gidZ, float4 dx, float4 v_dx, @@ -50,10 +10,10 @@ void fill_velocity_cache(__global const float* gvelo, { uint i; float__N__ v; -#if V_NB_I == NB_I +#if (V_NB_I-2*V_GHOSTS_NB) == NB_I // Single scale : Velocity and scalar grids are identical : cache is just read from global - uint line_index = gidY*NB_I + gidZ*NB_I*NB_II; /* Current 1D problem index */ - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) + uint line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; /* Current 1D problem index */ + for(i=gidX*__N__; i<V_NB_I; i+=(WI_NB*__N__)) { /* Read velocity */ v = vload__N__((i+line_index)/__N__, gvelo); @@ -88,76 +48,41 @@ void fill_velocity_cache(__global const float* gvelo, #if MS_FORMULA == MS_LINEAR wY.s1 = hY; wY.s0 = 1.0 - wY.s1; -#elif MS_FORMULA == MS_L2_1 - wY.s0 = alpha_l2_1(hY); - wY.s1 = beta_l2_1(hY); - wY.s2 = gamma_l2_1(hY); - wY.s3 = 1.0 - wY.s0 - wY.s1 - wY.s2; -#elif MS_FORMULA == MS_L4_2 - wY.s0 = alpha_l4_2(hY); - wY.s1 = beta_l4_2(hY); - wY.s2 = gamma_l4_2(hY); - wY.s3 = delta_l4_2(hY); - wY.s4 = eta_l4_2(hY); - wY.s5 = 1.0 - wY.s0 - wY.s1 - wY.s2 - wY.s3 - wY.s4; -#elif MS_FORMULA == MS_l4_4 - wY.s0 = alpha_l4_4(hY); - wY.s1 = 
beta_l4_4(hY); - wY.s2 = gamma_l4_4(hY); - wY.s3 = delta_l4_4(hY); - wY.s4 = eta_l4_4(hY); +#else + wY.s0 = MS_INTERPOL(alpha)(hY); + wY.s1 = MS_INTERPOL(beta)(hY); + wY.s2 = MS_INTERPOL(gamma)(hY); +#if MS_INTERPOL_SHIFT > 1 + wY.s3 = MS_INTERPOL(delta)(hY); + wY.s4 = MS_INTERPOL(eta)(hY); wY.s5 = 1.0 - wY.s0 - wY.s1 - wY.s2 - wY.s3 - wY.s4; +#else + wY.s3 = 1.0 - wY.s0 - wY.s1 - wY.s2; +#endif #endif - indY = indY + V_GHOSTS_NB; + indY = indY + V_GHOSTS_NB - MS_INTERPOL_SHIFT; -#if MS_FORMULA == MS_LINEAR v_line_index.s0 = indY * V_NB_I; v_line_index.s1 = (indY + 1) * V_NB_I; -#elif MS_FORMULA == MS_L2_1 - v_line_index.s0 = (indY - 1) * V_NB_I; - v_line_index.s1 = (indY) * V_NB_I; - v_line_index.s2 = (indY + 1) * V_NB_I; - v_line_index.s3 = (indY + 2) * V_NB_I; -#elif MS_FORMULA == MS_L4_2 || MS_FORMULA == MS_L4_4 - v_line_index.s0 = (indY - 2) * V_NB_I; - v_line_index.s1 = (indY - 1) * V_NB_I; - v_line_index.s2 = (indY) * V_NB_I; - v_line_index.s3 = (indY + 1) * V_NB_I; - v_line_index.s4 = (indY + 2) * V_NB_I; - v_line_index.s5 = (indY + 3) * V_NB_I; +#if MS_INTERPOL_SHIFT > 0 + v_line_index.s2 = (indY + 2) * V_NB_I; + v_line_index.s3 = (indY + 3) * V_NB_I; +#elif MS_INTERPOL_SHIFT > 1 + v_line_index.s4 = (indY + 4) * V_NB_I; + v_line_index.s5 = (indY + 5) * V_NB_I; #endif - for(i=gidX*__N__; i<V_NB_I; i+=(WI_NB*__N__)) { -#if MS_FORMULA == MS_LINEAR - v = vload__N__((i+v_line_index.s0)/__N__, gvelo) * wY.s0; - velocity_cache[noBC_id(i+__NN__)] = v.s__NN__; - v = vload__N__((i+v_line_index.s1)/__N__, gvelo) * wY.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; -#elif MS_FORMULA == MS_L2_1 - v = vload__N__((i+v_line_index.s0)/__N__, gvelo) * wY.s0; - velocity_cache[noBC_id(i+__NN__)] = v.s__NN__; - v = vload__N__((i+v_line_index.s1)/__N__, gvelo) * wY.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i+v_line_index.s2)/__N__, gvelo) * wY.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i+v_line_index.s3)/__N__, 
gvelo) * wY.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; -#elif MS_FORMULA == MS_L4_2 || MS_FORMULA == MS_L4_4 - v = vload__N__((i+v_line_index.s0)/__N__, gvelo) * wY.s0; - velocity_cache[noBC_id(i+__NN__)] = v.s__NN__; - v = vload__N__((i+v_line_index.s1)/__N__, gvelo) * wY.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i+v_line_index.s2)/__N__, gvelo) * wY.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i+v_line_index.s3)/__N__, gvelo) * wY.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i+v_line_index.s4)/__N__, gvelo) * wY.s4; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i+v_line_index.s5)/__N__, gvelo) * wY.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; + gvelo_loc[noBC_id(i)] = wY.s0 * gvelo[i + v_line_index.s0]; + gvelo_loc[noBC_id(i)] += wY.s1 * gvelo[i + v_line_index.s1]; +#if MS_INTERPOL_SHIFT > 0 + gvelo_loc[noBC_id(i)] += wY.s2 * gvelo[i + v_line_index.s2]; + gvelo_loc[noBC_id(i)] += wY.s3 * gvelo[i + v_line_index.s3]; +#elif MS_INTERPOL_SHIFT > 1 + gvelo_loc[noBC_id(i)] += wY.s4 * gvelo[i + v_line_index.s4]; + gvelo_loc[noBC_id(i)] += wY.s5 * gvelo[i + v_line_index.s5]; #endif } @@ -191,203 +116,94 @@ void fill_velocity_cache(__global const float* gvelo, wY.s0 = 1.0 - wY.s1; wZ.s1 = hZ; wZ.s0 = 1.0 - wZ.s1; -#elif MS_FORMULA == MS_L2_1 - wY.s0 = alpha_l2_1(hY); - wY.s1 = beta_l2_1(hY); - wY.s2 = gamma_l2_1(hY); - wY.s3 = 1.0 - wY.s0 - wY.s1 - wY.s2; - wZ.s0 = alpha_l2_1(hZ); - wZ.s1 = beta_l2_1(hZ); - wZ.s2 = gamma_l2_1(hZ); - wZ.s3 = 1.0 - wZ.s0 - wZ.s1 - wZ.s2; -#elif MS_FORMULA == MS_L4_2 - wY.s0 = alpha_l4_2(hY); - wY.s1 = beta_l4_2(hY); - wY.s2 = gamma_l4_2(hY); - wY.s3 = delta_l4_2(hY); - wY.s4 = eta_l4_2(hY); - wY.s5 = 1.0 - wY.s0 - wY.s1 - wY.s2 - wY.s3 - wY.s4; - wZ.s0 = alpha_l4_2(hZ); - wZ.s1 = beta_l4_2(hZ); - wZ.s2 = gamma_l4_2(hZ); - wZ.s3 = delta_l4_2(hZ); - wZ.s4 = eta_l4_2(hZ); - wZ.s5 = 1.0 - wZ.s0 - wZ.s1 - wZ.s2 - wZ.s3 - 
wZ.s4; -#elif MS_FORMULA == MS_L4_4 - wY.s0 = alpha_l4_4(hY); - wY.s1 = beta_l4_4(hY); - wY.s2 = gamma_l4_4(hY); - wY.s3 = delta_l4_4(hY); - wY.s4 = eta_l4_4(hY); +#else + wY.s0 = MS_INTERPOL(alpha)(hY); + wY.s1 = MS_INTERPOL(beta)(hY); + wY.s2 = MS_INTERPOL(gamma)(hY); + wZ.s0 = MS_INTERPOL(alpha)(hZ); + wZ.s1 = MS_INTERPOL(beta)(hZ); + wZ.s2 = MS_INTERPOL(gamma)(hZ); +#if MS_INTERPOL_SHIFT > 1 + wY.s3 = MS_INTERPOL(delta)(hY); + wY.s4 = MS_INTERPOL(eta)(hY); wY.s5 = 1.0 - wY.s0 - wY.s1 - wY.s2 - wY.s3 - wY.s4; - wZ.s0 = alpha_l4_4(hZ); - wZ.s1 = beta_l4_4(hZ); - wZ.s2 = gamma_l4_4(hZ); - wZ.s3 = delta_l4_4(hZ); - wZ.s4 = eta_l4_4(hZ); + wZ.s3 = MS_INTERPOL(delta)(hZ); + wZ.s4 = MS_INTERPOL(eta)(hZ); wZ.s5 = 1.0 - wZ.s0 - wZ.s1 - wZ.s2 - wZ.s3 - wZ.s4; +#else + wY.s3 = 1.0 - wY.s0 - wY.s1 - wY.s2; + wZ.s3 = 1.0 - wZ.s0 - wZ.s1 - wZ.s2; +#endif #endif - indY = indY + V_GHOSTS_NB; - indZ = indZ + V_GHOSTS_NB; + indY = indY + V_GHOSTS_NB - MS_INTERPOL_SHIFT; + indZ = indZ + V_GHOSTS_NB - MS_INTERPOL_SHIFT; -#if MS_FORMULA == MS_LINEAR v_line_indexY.s0 = indY * V_NB_I; v_line_indexY.s1 = (indY + 1) * V_NB_I; v_line_indexZ.s0 = indZ * V_NB_I * V_NB_II; v_line_indexZ.s1 = (indZ + 1) * V_NB_I * V_NB_II; -#elif MS_FORMULA == MS_L2_1 - v_line_indexY.s0 = (indY - 1) * V_NB_I; - v_line_indexY.s1 = (indY) * V_NB_I; - v_line_indexY.s2 = (indY + 1) * V_NB_I; - v_line_indexY.s3 = (indY + 2) * V_NB_I; - v_line_indexZ.s0 = (indZ - 1) * V_NB_I * V_NB_II; - v_line_indexZ.s1 = (indZ) * V_NB_I * V_NB_II; - v_line_indexZ.s2 = (indZ + 1) * V_NB_I * V_NB_II; - v_line_indexZ.s3 = (indZ + 2) * V_NB_I * V_NB_II; -#elif MS_FORMULA == MS_L4_2 || MS_FORMULA == MS_L4_4 - v_line_indexY.s0 = (indY - 2) * V_NB_I; - v_line_indexY.s1 = (indY - 1) * V_NB_I; - v_line_indexY.s2 = (indY) * V_NB_I; - v_line_indexY.s3 = (indY + 1) * V_NB_I; - v_line_indexY.s4 = (indY + 2) * V_NB_I; - v_line_indexY.s5 = (indY + 3) * V_NB_I; - v_line_indexZ.s0 = (indZ - 2) * V_NB_I * V_NB_II; - v_line_indexZ.s1 = (indZ - 1) 
* V_NB_I * V_NB_II; - v_line_indexZ.s2 = (indZ) * V_NB_I * V_NB_II; - v_line_indexZ.s3 = (indZ + 1) * V_NB_I * V_NB_II; - v_line_indexZ.s4 = (indZ + 2) * V_NB_I * V_NB_II; - v_line_indexZ.s5 = (indZ + 3) * V_NB_I * V_NB_II; +#if MS_INTERPOL_SHIFT > 0 + v_line_indexY.s2 = (indY + 2) * V_NB_I; + v_line_indexY.s3 = (indY + 3) * V_NB_I; + v_line_indexZ.s2 = (indZ + 2) * V_NB_I * V_NB_II; + v_line_indexZ.s3 = (indZ + 3) * V_NB_I * V_NB_II; +#elif MS_INTERPOL_SHIFT > 1 + v_line_indexY.s4 = (indY + 4) * V_NB_I; + v_line_indexY.s5 = (indY + 5) * V_NB_I; + v_line_indexZ.s4 = (indZ + 4) * V_NB_I * V_NB_II; + v_line_indexZ.s5 = (indZ + 5) * V_NB_I * V_NB_II; #endif for(i=gidX*__N__; i<V_NB_I; i+=(WI_NB*__N__)) { -#if MS_FORMULA == MS_LINEAR - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s0)/__N__, gvelo) * wY.s0 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] = v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s1)/__N__, gvelo) * wY.s0 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s0)/__N__, gvelo) * wY.s1 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s1)/__N__, gvelo) * wY.s1 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - -#elif MS_FORMULA == MS_L2_1 - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s0)/__N__, gvelo) * wY.s0 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] = v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s1)/__N__, gvelo) * wY.s0 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s2)/__N__, gvelo) * wY.s0 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s3)/__N__, gvelo) * wY.s0 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s0)/__N__, gvelo) * wY.s1 * wZ.s0; - 
velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s1)/__N__, gvelo) * wY.s1 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s2)/__N__, gvelo) * wY.s1 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s3)/__N__, gvelo) * wY.s1 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s0)/__N__, gvelo) * wY.s2 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s1)/__N__, gvelo) * wY.s2 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s2)/__N__, gvelo) * wY.s2 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s3)/__N__, gvelo) * wY.s2 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s0)/__N__, gvelo) * wY.s3 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s1)/__N__, gvelo) * wY.s3 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s2)/__N__, gvelo) * wY.s3 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s3)/__N__, gvelo) * wY.s3 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - -#elif MS_FORMULA == MS_L4_2 || MS_FORMULA == MS_L4_4 - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s0)/__N__, gvelo) * wY.s0 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] = v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s1)/__N__, gvelo) * wY.s0 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s2)/__N__, gvelo) 
* wY.s0 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s3)/__N__, gvelo) * wY.s0 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s4)/__N__, gvelo) * wY.s0 * wZ.s4; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s0 + v_line_indexZ.s5)/__N__, gvelo) * wY.s0 * wZ.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s0)/__N__, gvelo) * wY.s1 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s1)/__N__, gvelo) * wY.s1 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s2)/__N__, gvelo) * wY.s1 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s3)/__N__, gvelo) * wY.s1 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s4)/__N__, gvelo) * wY.s1 * wZ.s4; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s1 + v_line_indexZ.s5)/__N__, gvelo) * wY.s1 * wZ.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s0)/__N__, gvelo) * wY.s2 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s1)/__N__, gvelo) * wY.s2 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s2)/__N__, gvelo) * wY.s2 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s3)/__N__, gvelo) * wY.s2 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s4)/__N__, gvelo) * wY.s2 * wZ.s4; - 
velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s2 + v_line_indexZ.s5)/__N__, gvelo) * wY.s2 * wZ.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s0)/__N__, gvelo) * wY.s3 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s1)/__N__, gvelo) * wY.s3 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s2)/__N__, gvelo) * wY.s3 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s3)/__N__, gvelo) * wY.s3 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s4)/__N__, gvelo) * wY.s3 * wZ.s4; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s3 + v_line_indexZ.s5)/__N__, gvelo) * wY.s3 * wZ.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s4 + v_line_indexZ.s0)/__N__, gvelo) * wY.s4 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s4 + v_line_indexZ.s1)/__N__, gvelo) * wY.s4 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s4 + v_line_indexZ.s2)/__N__, gvelo) * wY.s4 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s4 + v_line_indexZ.s3)/__N__, gvelo) * wY.s4 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s4 + v_line_indexZ.s4)/__N__, gvelo) * wY.s4 * wZ.s4; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s4 + v_line_indexZ.s5)/__N__, gvelo) * wY.s4 * wZ.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - - v = vload__N__((i + v_line_indexY.s5 + v_line_indexZ.s0)/__N__, gvelo) * wY.s5 * wZ.s0; - velocity_cache[noBC_id(i+__NN__)] 
+= v.s__NN__; - v = vload__N__((i + v_line_indexY.s5 + v_line_indexZ.s1)/__N__, gvelo) * wY.s5 * wZ.s1; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s5 + v_line_indexZ.s2)/__N__, gvelo) * wY.s5 * wZ.s2; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s5 + v_line_indexZ.s3)/__N__, gvelo) * wY.s5 * wZ.s3; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s5 + v_line_indexZ.s4)/__N__, gvelo) * wY.s5 * wZ.s4; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; - v = vload__N__((i + v_line_indexY.s5 + v_line_indexZ.s5)/__N__, gvelo) * wY.s5 * wZ.s5; - velocity_cache[noBC_id(i+__NN__)] += v.s__NN__; + gvelo_loc[noBC_id(i)] = wY.s0 * wZ.s0 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s0 * wZ.s1 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s1]; + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s0 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s1 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s1]; +#if MS_INTERPOL_SHIFT > 0 + gvelo_loc[noBC_id(i)] += wY.s0 * wZ.s2 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s2]; + gvelo_loc[noBC_id(i)] += wY.s0 * wZ.s3 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s3]; + + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s2 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s2]; + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s3 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s3]; + + gvelo_loc[noBC_id(i)] += wY.s2 * wZ.s0 * gvelo[i + v_line_indexY.s2 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s2 * wZ.s1 * gvelo[i + v_line_indexY.s2 + v_line_indexZ.s1]; + gvelo_loc[noBC_id(i)] += wY.s2 * wZ.s2 * gvelo[i + v_line_indexY.s2 + v_line_indexZ.s2]; + gvelo_loc[noBC_id(i)] += wY.s2 * wZ.s3 * gvelo[i + v_line_indexY.s2 + v_line_indexZ.s3]; + + gvelo_loc[noBC_id(i)] += wY.s3 * wZ.s0 * gvelo[i + v_line_indexY.s3 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s3 * wZ.s1 * gvelo[i + v_line_indexY.s3 
+ v_line_indexZ.s1]; + gvelo_loc[noBC_id(i)] += wY.s3 * wZ.s2 * gvelo[i + v_line_indexY.s3 + v_line_indexZ.s2]; + gvelo_loc[noBC_id(i)] += wY.s3 * wZ.s3 * gvelo[i + v_line_indexY.s3 + v_line_indexZ.s3]; +#elif MS_INTERPOL_SHIFT > 1 + gvelo_loc[noBC_id(i)] += wY.s0 * wZ.s4 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s4]; + gvelo_loc[noBC_id(i)] += wY.s0 * wZ.s5 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s5]; + + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s4 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s4]; + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s5 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s5]; + + gvelo_loc[noBC_id(i)] += wY.s2 * wZ.s4 * gvelo[i + v_line_indexY.s2 + v_line_indexZ.s4]; + gvelo_loc[noBC_id(i)] += wY.s2 * wZ.s5 * gvelo[i + v_line_indexY.s2 + v_line_indexZ.s5]; + + gvelo_loc[noBC_id(i)] += wY.s3 * wZ.s4 * gvelo[i + v_line_indexY.s3 + v_line_indexZ.s4]; + gvelo_loc[noBC_id(i)] += wY.s3 * wZ.s5 * gvelo[i + v_line_indexY.s3 + v_line_indexZ.s5]; + + gvelo_loc[noBC_id(i)] += wY.s4 * wZ.s0 * gvelo[i + v_line_indexY.s4 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s4 * wZ.s1 * gvelo[i + v_line_indexY.s4 + v_line_indexZ.s1]; + gvelo_loc[noBC_id(i)] += wY.s4 * wZ.s2 * gvelo[i + v_line_indexY.s4 + v_line_indexZ.s2]; + gvelo_loc[noBC_id(i)] += wY.s4 * wZ.s3 * gvelo[i + v_line_indexY.s4 + v_line_indexZ.s3]; + gvelo_loc[noBC_id(i)] += wY.s4 * wZ.s4 * gvelo[i + v_line_indexY.s4 + v_line_indexZ.s4]; + gvelo_loc[noBC_id(i)] += wY.s4 * wZ.s5 * gvelo[i + v_line_indexY.s4 + v_line_indexZ.s5]; + + gvelo_loc[noBC_id(i)] += wY.s5 * wZ.s0 * gvelo[i + v_line_indexY.s5 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s5 * wZ.s1 * gvelo[i + v_line_indexY.s5 + v_line_indexZ.s1]; + gvelo_loc[noBC_id(i)] += wY.s5 * wZ.s2 * gvelo[i + v_line_indexY.s5 + v_line_indexZ.s2]; + gvelo_loc[noBC_id(i)] += wY.s5 * wZ.s3 * gvelo[i + v_line_indexY.s5 + v_line_indexZ.s3]; + gvelo_loc[noBC_id(i)] += wY.s5 * wZ.s4 * gvelo[i + v_line_indexY.s5 + v_line_indexZ.s4]; + gvelo_loc[noBC_id(i)] += 
wY.s5 * wZ.s5 * gvelo[i + v_line_indexY.s5 + v_line_indexZ.s5]; #endif } #endif diff --git a/HySoP/hysop/gpu/cl_src/advection/velocity_cache_noVec.cl b/HySoP/hysop/gpu/cl_src/advection/velocity_cache_noVec.cl index 60a19c0ca..c90cff973 100644 --- a/HySoP/hysop/gpu/cl_src/advection/velocity_cache_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/advection/velocity_cache_noVec.cl @@ -13,17 +13,16 @@ void fill_velocity_cache(__global const float* gvelo, // ******************************** // ** Single Scale // ******************************** -#if V_NB_I == NB_I +#if (V_NB_I-2*V_GHOSTS_NB) == NB_I // Single scale : Velocity and scalar grids are identical : cache is just read from global - uint line_index = gidY*NB_I + gidZ*NB_I*NB_II; /* Current 1D problem index */ - for(i=gidX; i<NB_I; i+=(WI_NB)) + uint line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; /* Current 1D problem index */ + for(i=gidX; i<V_NB_I; i+=(WI_NB)) { /* Read velocity */ /* Fill velocity cache */ gvelo_loc[noBC_id(i)] = gvelo[i+line_index]; } - // ******************************** // ** Multi-Scale // ******************************** diff --git a/HySoP/hysop/gpu/cl_src/common.cl b/HySoP/hysop/gpu/cl_src/common.cl index d482af6bb..34a7ec985 100644 --- a/HySoP/hysop/gpu/cl_src/common.cl +++ b/HySoP/hysop/gpu/cl_src/common.cl @@ -124,4 +124,23 @@ inline uint noBC_id(int id){ #elif MS_FORMULA == L4_4 #define MS_INTERPOL_SHIFT 2 #define MS_INTERPOL(greek) greek##_l4_4 +#else +//Default case for single-scale (only used in comm advection) +#define MS_INTERPOL_SHIFT 0 #endif + +/* +a minmax element is a 12 int defined as follows: +*/ +#define L_MIN_X 0 +#define L_MAX_X 1 +#define L_MIN_Y 2 +#define L_MAX_Y 3 +#define L_MIN_Z 4 +#define L_MAX_Z 5 +#define R_MIN_X 6 +#define R_MAX_X 7 +#define R_MIN_Y 8 +#define R_MAX_Y 9 +#define R_MIN_Z 10 +#define R_MAX_Z 11 diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection.cl b/HySoP/hysop/gpu/cl_src/kernels/advection.cl index 20f0dc22c..538126459 100644 --- 
a/HySoP/hysop/gpu/cl_src/kernels/advection.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection.cl @@ -45,7 +45,7 @@ __kernel void advection_kernel(__global const float* gvelo, for(i=gidX*__N__; i<NB_I; i+=WI_NB*__N__) { /* Compute position */ - p = advection(i, dt, dx.x, invdx, v_invdx, velocity_cache) + (float__N__)(min_position); + p = advection(i, dt, dx.x, invdx, v_invdx, velocity_cache, min_position); /* Store result */ vstore__N__(p, (i+line_index)/__N__, ppos); } diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl index a204fcb31..21fe73008 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing.cl @@ -31,7 +31,7 @@ __kernel void advection_and_remeshing(__global const float* gvelo, __RCOMP_P__global float* gscal__ID__, __local float* velocity_cache, __RCOMP_P__local float* gscal_loc__ID__, - float dt,float min_position, float4 dx, float4 v_dx) + float dt, float min_position, float4 dx, float4 v_dx) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ @@ -61,9 +61,9 @@ __kernel void advection_and_remeshing(__global const float* gvelo, /* Read Particle scalar */ __RCOMP_Is__ID__ = vload__N__((i + line_index)/__N__, pscal__ID__); /* Compute particle position */ - p = advection(i, dt, dx.x, invdx, v_invdx, velocity_cache); + p = advection(i, dt, dx.x, invdx, v_invdx, velocity_cache, min_position); /* Remesh particle */ - remesh(i, dx.x, invdx, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__); + remesh(i, dx.x, invdx, __RCOMP_Ps__ID__, p, min_position, __RCOMP_Pgscal_loc__ID__); } /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl index 454e8bd23..1b68b2925 100644 --- 
a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_noVec.cl @@ -31,7 +31,7 @@ __kernel void advection_and_remeshing(__global const float* gvelo, __RCOMP_P__global float* gscal__ID__, __local float* gvelo_loc, __RCOMP_P__local float* gscal_loc__ID__, - float dt,float min_position, float4 dx, float4 v_dx) + float dt, float min_position, float4 dx, float4 v_dx) { uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ @@ -61,9 +61,9 @@ __kernel void advection_and_remeshing(__global const float* gvelo, /* Read Particle scalar */ __RCOMP_Is__ID__ = pscal__ID__[i + line_index]; /* Compute particle position */ - p = advection(i, dt, dx.x, invdx, v_invdx, gvelo_loc); + p = advection(i, dt, dx.x, invdx, v_invdx, gvelo_loc, min_position); /* Remesh particle */ - remesh(i, dx.x, invdx, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__); + remesh(i, dx.x, invdx, __RCOMP_Ps__ID__, p, min_position, __RCOMP_Pgscal_loc__ID__); } /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_2d.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_2d.cl deleted file mode 100644 index 10a605498..000000000 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_2d.cl +++ /dev/null @@ -1,83 +0,0 @@ -/** - * @file advection_and_remeshing_vector_2d.cl - * Advection and remeshing kernel for 2D vector advection. - */ - -/** - * Performs advection and then remeshing of the particles' vector. - * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. - * Each work-item computes NB_I/WI_NB particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles. - * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. 
- * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer. - * - * @param gvelo Velocity field - * @param pscal Particle scalar - * @param gscal Grid scalar - * @param dt Time step - * @param min_position Domain lower coordinate - * @param dx Space step - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>WI_NB</code> corresponds to the work-item number. - * @remark <code>__N__</code> is expanded at compilation time by vector width. - * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @see parmepy.gpu.tools.parse_file - */ -__kernel void advection_and_remeshing(__global const float* gvelo, - __global const float* pvec_X, - __global const float* pvec_Y, - __global float* gvec_X, - __global float* gvec_Y, - float dt,float min_position, float dx) -{ - uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ - float invdx = 1.0/dx; /* Space step inverse */ - uint i; /* Particle index in 1D problem */ - float__N__ p, /* Particle position */ - pv_X, pv_Y, /* Particle vector */ - v; /* Particle velocity */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ - - __local float gvec_X_loc[NB_I]; /* Local buffer for result */ - __local float gvec_Y_loc[NB_I]; /* Local buffer for result */ - __local float gvelo_loc[NB_I]; /* Velocity cache */ - - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) - { - /* Read velocity */ - v = vload__N__((i+line_index)/__N__, gvelo); - /* Fill velocity cache */ - gvelo_loc[noBC_id(i+__NN__)] = v.s__NN__; - /* Initialize result buffer */ - gvec_X_loc[noBC_id(i+__NN__)] = 0.0; - gvec_Y_loc[noBC_id(i+__NN__)] = 0.0; - } - - /* Synchronize work-group */ - 
barrier(CLK_LOCAL_MEM_FENCE); - - for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__) - { - /* Read Particle scalar */ - pv_X = vload__N__((i + line_index)/__N__, pvec_X); - pv_Y = vload__N__((i + line_index)/__N__, pvec_Y); - /* Compute particle position */ - p = advection(i, dt, dx, invdx, gvelo_loc); - /* Remesh particle */ - remesh(i, dx, invdx, pv_X, pv_Y, p, gvec_X_loc, gvec_Y_loc); - } - - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); - - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) - { - /* Store result */ - vstore__N__((float__N__)(gvec_X_loc[noBC_id(i+__NN__)], - ), (i + line_index)/__N__, gvec_X); - vstore__N__((float__N__)(gvec_Y_loc[noBC_id(i+__NN__)], - ), (i + line_index)/__N__, gvec_Y); - } -} diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_3d.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_3d.cl deleted file mode 100644 index abbdbe006..000000000 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_and_remeshing_vector_3d.cl +++ /dev/null @@ -1,90 +0,0 @@ -/** - * @file advection_and_remeshing_vector_3d.cl - * Advection and remeshing kernel for 3D vector advection. - */ - -/** - * Performs advection and then remeshing of the particles' vector. - * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. - * Each work-item computes NB_I/WI_NB particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles. - * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. - * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer. 
- * - * @param gvelo Velocity field - * @param pscal Particle scalar - * @param gscal Grid scalar - * @param dt Time step - * @param min_position Domain lower coordinate - * @param dx Space step - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>WI_NB</code> corresponds to the work-item number. - * @remark <code>__N__</code> is expanded at compilation time by vector width. - * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @see parmepy.gpu.tools.parse_file - */ -__kernel void advection_and_remeshing(__global const float* gvelo, - __global const float* pvec_X, - __global const float* pvec_Y, - __global const float* pvec_Z, - __global float* gvec_X, - __global float* gvec_Y, - __global float* gvec_Z, - float dt,float min_position, float dx) -{ - uint gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ - uint gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ - uint gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ - float invdx = 1.0/dx; /* Space step inverse */ - uint i; /* Particle index in 1D problem */ - float__N__ p, /* Particle position */ - pv_X, pv_Y, pv_Z, /* Particle vector */ - v; /* Particle velocity */ - uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ - - __local float gvec_X_loc[NB_I]; /* Local buffer for result */ - __local float gvec_Y_loc[NB_I]; /* Local buffer for result */ - __local float gvec_Z_loc[NB_I]; /* Local buffer for result */ - __local float gvelo_loc[NB_I]; /* Velocity cache */ - - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) - { - /* Read velocity */ - v = vload__N__((i+line_index)/__N__, gvelo); - /* Fill velocity cache */ - gvelo_loc[noBC_id(i+__NN__)] = v.s__NN__; - /* Initialize result buffer */ - gvec_X_loc[noBC_id(i+__NN__)] = 0.0; - gvec_Y_loc[noBC_id(i+__NN__)] = 0.0; - 
gvec_Z_loc[noBC_id(i+__NN__)] = 0.0; - } - - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); - - for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__) - { - /* Read Particle scalar */ - pv_X = vload__N__((i + line_index)/__N__, pvec_X); - pv_Y = vload__N__((i + line_index)/__N__, pvec_Y); - pv_Z = vload__N__((i + line_index)/__N__, pvec_Z); - /* Compute particle position */ - p = advection(i, dt, dx, invdx, gvelo_loc); - /* Remesh particle */ - remesh(i, dx, invdx, pv_X, pv_Y, pv_Z, p, gvec_X_loc, gvec_Y_loc, gvec_Z_loc); - } - - /* Synchronize work-group */ - barrier(CLK_LOCAL_MEM_FENCE); - - for(i=gidX*__N__; i<NB_I; i+=(WI_NB*__N__)) - { - /* Store result */ - vstore__N__((float__N__)(gvec_X_loc[noBC_id(i+__NN__)], - ), (i + line_index)/__N__, gvec_X); - vstore__N__((float__N__)(gvec_Y_loc[noBC_id(i+__NN__)], - ), (i + line_index)/__N__, gvec_Y); - vstore__N__((float__N__)(gvec_Z_loc[noBC_id(i+__NN__)], - ), (i + line_index)/__N__, gvec_Z); - } -} diff --git a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl index a7f4c285d..0055af7d2 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/advection_noVec.cl @@ -39,6 +39,6 @@ __kernel void advection_kernel(__global const float* gvelo, for(i=gidX; i<NB_I; i+=WI_NB) { - ppos[i+line_index] = advection(i, dt, dx.x, invdx, v_invdx, velocity_cache) + min_position; + ppos[i+line_index] = advection(i, dt, dx.x, invdx, v_invdx, velocity_cache, min_position); } } diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl new file mode 100644 index 000000000..3398f8bc5 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_MS_advection_noVec.cl @@ -0,0 +1,261 @@ + + + +void fill_velocity_cache_reduction(__global const float* gvelo, + uint gidX, uint gidY, uint gidZ, + float4 dx, float4 v_dx, + __local float* gvelo_loc); + +void 
reduce_local(__local int* minmax, int lid); + + +__kernel void buff_advec(__global const float* gvelo, + __global float* ppos, + __global float* buffer_l, + __global float* buffer_r, + __global int* minmax_global, + __local float* velocity_cache, + __local float* buff_l_loc, + __local float* buff_r_loc, + float dt, float min_position, float4 dx, float4 v_dx, + int4 l_nb, int4 r_nb) +{ + int gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ + int gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + float invdx = 1.0/dx.x; /* Space step inverse */ + float v_invdx = 1.0/v_dx.x; /* Space step inverse */ + int i; /* Particle index in 1D problem */ + int line_index = gidY*NB_I+gidZ*NB_I*NB_II; /* Current 1D problem index */ + float p,v,c, hY, hZ; + int i_ind, i_indY, i_indZ; + + int l_start_x = minmax_global[L_MIN_X]; + int l_start_y = minmax_global[L_MIN_Y]; + int l_start_z = minmax_global[L_MIN_Z]; + int r_start_x = minmax_global[R_MIN_X]; + int r_start_y = minmax_global[R_MIN_Y]; + int r_start_z = minmax_global[R_MIN_Z]; + int4 l_nb_used = (int4)(minmax_global[L_MAX_X] - l_start_x + 1, + minmax_global[L_MAX_Y] - l_start_y + 1, + minmax_global[L_MAX_Z] - l_start_z + 1, + 0); + int4 r_nb_used = (int4)(minmax_global[R_MAX_X] - r_start_x + 1, + minmax_global[R_MAX_Y] - r_start_y + 1, + minmax_global[R_MAX_Z] - r_start_z + 1, + 0); + + __local float* loc_ptr; + + + hY = (gidY * dx.y) / v_dx.y; + hZ = (gidZ * dx.z) / v_dx.z; + i_indY = convert_int_rtn(hY); + i_indZ = convert_int_rtn(hZ); + hY = hY - convert_float(i_indY); + hZ = hZ - convert_float(i_indZ); + + for(i=gidX; i<V_NB_I; i+=(WI_NB)){ + velocity_cache[noBC_id(i)] = (1.0-hY)*(1.0-hZ) * gvelo[i + i_indY * V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += (1.0-hY)*hZ * gvelo[i + i_indY * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY*(1.0-hZ) * gvelo[i + (i_indY + 1) * 
V_NB_I + i_indZ * V_NB_I * V_NB_II]; + velocity_cache[noBC_id(i)] += hY*hZ * gvelo[i + (i_indY + 1) * V_NB_I + (i_indZ + 1) * V_NB_I * V_NB_II]; + } + + if ((i_indY>=l_start_y && i_indY<=minmax_global[L_MAX_Y]-1) && (i_indZ>=l_start_z && i_indZ<=minmax_global[L_MAX_Z]-1)){ + for(i=gidX; i<l_nb_used.x; i+=(WI_NB)){ + buff_l_loc[i] = (1.0-hY)*(1.0-hZ)*buffer_l[i+(i_indY-l_start_y)*l_nb.x + (i_indZ-l_start_z)*l_nb.x*l_nb.y]; + buff_l_loc[i] += (1.0-hY)*hZ*buffer_l[i+(i_indY-l_start_y)*l_nb.x + (i_indZ+1-l_start_z)*l_nb.x*l_nb.y]; + buff_l_loc[i] += hY*(1.0-hZ)*buffer_l[i+(i_indY+1-l_start_y)*l_nb.x + (i_indZ-l_start_z)*l_nb.x*l_nb.y]; + buff_l_loc[i] += hY*hZ*buffer_l[i+(i_indY+1-l_start_y)*l_nb.x + (i_indZ+1-l_start_z)*l_nb.x*l_nb.y]; + }} + + if ((i_indY>=r_start_y && i_indY<=minmax_global[R_MAX_Y]-1) && (i_indZ>=r_start_z && i_indZ<=minmax_global[R_MAX_Z]-1)){ + for(i=gidX; i<r_nb_used.x; i+=(WI_NB)){ + buff_r_loc[i] = (1.0-hY)*(1.0-hZ)*buffer_r[i+(i_indY-r_start_y)*r_nb.x + (i_indZ-r_start_z)*r_nb.x*r_nb.y]; + buff_r_loc[i] += (1.0-hY)*hZ*buffer_r[i+(i_indY-r_start_y)*r_nb.x + (i_indZ+1-r_start_z)*r_nb.x*r_nb.y]; + buff_r_loc[i] += hY*(1.0-hZ)*buffer_r[i+(i_indY+1-r_start_y)*r_nb.x + (i_indZ-r_start_z)*r_nb.x*r_nb.y]; + buff_r_loc[i] += hY*hZ*buffer_r[i+(i_indY+1-r_start_y)*r_nb.x + (i_indZ+1-r_start_z)*r_nb.x*r_nb.y]; + }} + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX; i<NB_I; i+=WI_NB) + { + c = i * dx.x + min_position; + // multi-scale : interpolate v from velocity buffer (of length V_NB_I) + p = c * v_invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB) - MS_INTERPOL_SHIFT; + v = mix(velocity_cache[noBC_id(i_ind)], + velocity_cache[noBC_id(i_ind+1)],p); + p = (c + 0.5*dt*v) * v_invdx; + + i_ind = convert_int_rtn(p) - MS_INTERPOL_SHIFT; + + p = p - convert_float(i_ind); + + loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB) && i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? 
velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : (i_ind<(V_START_INDEX-V_GHOSTS_NB)) ? buff_l_loc+i_ind-l_start_x : buff_r_loc+i_ind-r_start_x ; + v = (1.0-p)*(*loc_ptr); + i_ind = i_ind + 1; + loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB) && i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : (i_ind<(V_START_INDEX-V_GHOSTS_NB)) ? buff_l_loc+i_ind-l_start_x : buff_r_loc+i_ind-r_start_x ; + v += p*(*loc_ptr); + ppos[i+line_index] = c + dt * v; + } + +} + + +__kernel void reduce_stage1_advec(__global const float* gvelo, + __global int* minmax_buffer, + __local float* velocity_cache, + __local int* minmax, + float dt, float min_position, float4 dx, float4 v_dx) +{ + int lid = get_global_id(0); /* OpenCL work-itme global index (X) */ + int gidY, gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + float invdx = 1.0/dx.x; /* Space step inverse */ + float v_invdx = 1.0/v_dx.x; /* Space step inverse */ + int i; /* Particle index in 1D problem */ + int line_index; /* Current 1D problem index */ + int my_minmax[12] = {1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30}; + float p,v,c; + int i_ind, i_indY, i_indZ, ix, iy, iz; + bool is_l, is_r; + + for(gidY=0;gidY<NB_II;gidY++) + { + line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; + fill_velocity_cache_reduction(gvelo, lid, gidY, gidZ, dx, v_dx, velocity_cache); + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=lid; i<NB_I; i+=WI_NB_REDUCE) + { + c = i * dx.x + min_position; + // multi-scale : interpolate v from velocity buffer (of length V_NB_I) + p = c * v_invdx; + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind = i_ind - (V_START_INDEX-V_GHOSTS_NB) - MS_INTERPOL_SHIFT; + v = mix(velocity_cache[noBC_id(i_ind)], + velocity_cache[noBC_id(i_ind+1)],p); + p = (c + 0.5*dt*v) * v_invdx; + + i_ind = convert_int_rtn(p); + i_indY = convert_int_rtn((gidY * dx.y) / v_dx.y); + i_indZ = convert_int_rtn((gidZ * 
dx.z) / v_dx.z); + + for(ix=i_ind-MS_INTERPOL_SHIFT; ix<=i_ind+1+MS_INTERPOL_SHIFT; ix++){ + is_l = ix<(V_START_INDEX-V_GHOSTS_NB); + is_r = ix>(V_STOP_INDEX+V_GHOSTS_NB); + for (iy=i_indY-MS_INTERPOL_SHIFT; iy<=i_indY+1+MS_INTERPOL_SHIFT; iy++){ + for (iz=i_indZ-MS_INTERPOL_SHIFT; iz<=i_indZ+1+MS_INTERPOL_SHIFT; iz++){ + + my_minmax[L_MIN_X] = (is_l && my_minmax[L_MIN_X]>ix) ? ix : my_minmax[L_MIN_X]; + my_minmax[L_MAX_X] = (is_l && my_minmax[L_MAX_X]<ix) ? ix : my_minmax[L_MAX_X]; + my_minmax[L_MIN_Y] = (is_l && my_minmax[L_MIN_Y]>iy) ? iy : my_minmax[L_MIN_Y]; + my_minmax[L_MAX_Y] = (is_l && my_minmax[L_MAX_Y]<iy) ? iy : my_minmax[L_MAX_Y]; + my_minmax[L_MIN_Z] = (is_l && my_minmax[L_MIN_Z]>iz) ? iz : my_minmax[L_MIN_Z]; + my_minmax[L_MAX_Z] = (is_l && my_minmax[L_MAX_Z]<iz) ? iz : my_minmax[L_MAX_Z]; + + my_minmax[R_MIN_X] = (is_r && my_minmax[R_MIN_X]>ix) ? ix : my_minmax[R_MIN_X]; + my_minmax[R_MAX_X] = (is_r && my_minmax[R_MAX_X]<ix) ? ix : my_minmax[R_MAX_X]; + my_minmax[R_MIN_Y] = (is_r && my_minmax[R_MIN_Y]>iy) ? iy : my_minmax[R_MIN_Y]; + my_minmax[R_MAX_Y] = (is_r && my_minmax[R_MAX_Y]<iy) ? iy : my_minmax[R_MAX_Y]; + my_minmax[R_MIN_Z] = (is_r && my_minmax[R_MIN_Z]>iz) ? iz : my_minmax[R_MIN_Z]; + my_minmax[R_MAX_Z] = (is_r && my_minmax[R_MAX_Z]<iz) ? 
iz : my_minmax[R_MAX_Z]; + + } + } + } + } + } + + for(i=0;i<12;i++) + minmax[lid*12+i] = my_minmax[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + reduce_local(minmax, lid); + + if (lid == 0) + for(i=0;i<12;i++) + minmax_buffer[gidZ*12+i] = minmax[i]; +} + + + +void fill_velocity_cache_reduction(__global const float* gvelo, + uint gidX, uint gidY, uint gidZ, + float4 dx, float4 v_dx, + __local float* gvelo_loc) +{ + uint i; + +#if NB_III == 1 + // Multi-Scale (2D) + + float line_posY, hY; + int indY; + int2 v_line_index; + float2 wY; + + + line_posY = (gidY * dx.y) / v_dx.y; + indY = convert_int_rtn(line_posY); + hY = line_posY - convert_float(indY); + + wY.s1 = hY; + wY.s0 = 1.0 - wY.s1; + + indY = indY + V_GHOSTS_NB - MS_INTERPOL_SHIFT; + + v_line_index.s0 = indY * V_NB_I; + v_line_index.s1 = (indY + 1) * V_NB_I; + + for(i=gidX; i<V_NB_I; i+=(WI_NB_REDUCE)){ + gvelo_loc[noBC_id(i)] = wY.s0 * gvelo[i + v_line_index.s0]; + gvelo_loc[noBC_id(i)] += wY.s1 * gvelo[i + v_line_index.s1]; + + } + +#else + // Multi-Scale (3D) + + float line_posY, hY; + float line_posZ, hZ; + int indY, indZ; + int2 v_line_indexY, v_line_indexZ; + float2 wY, wZ; + + line_posY = (gidY * dx.y) / v_dx.y; + line_posZ = (gidZ * dx.z) / v_dx.z; + indY = convert_int_rtn(line_posY); + indZ = convert_int_rtn(line_posZ); + hY = line_posY - convert_float(indY); + hZ = line_posZ - convert_float(indZ); + + wY.s1 = hY; + wY.s0 = 1.0 - wY.s1; + wZ.s1 = hZ; + wZ.s0 = 1.0 - wZ.s1; + + + indY = indY + V_GHOSTS_NB - MS_INTERPOL_SHIFT; + indZ = indZ + V_GHOSTS_NB - MS_INTERPOL_SHIFT; + + v_line_indexY.s0 = indY * V_NB_I; + v_line_indexY.s1 = (indY + 1) * V_NB_I; + v_line_indexZ.s0 = indZ * V_NB_I * V_NB_II; + v_line_indexZ.s1 = (indZ + 1) * V_NB_I * V_NB_II; + + for(i=gidX; i<V_NB_I; i+=(WI_NB_REDUCE)){ + gvelo_loc[noBC_id(i)] = wY.s0 * wZ.s0 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s0 * wZ.s1 * gvelo[i + v_line_indexY.s0 + v_line_indexZ.s1]; + gvelo_loc[noBC_id(i)] += wY.s1 * 
wZ.s0 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s0]; + gvelo_loc[noBC_id(i)] += wY.s1 * wZ.s1 * gvelo[i + v_line_indexY.s1 + v_line_indexZ.s1]; + } +#endif +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_advection_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_noVec.cl new file mode 100644 index 000000000..ef42469dc --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_advection_noVec.cl @@ -0,0 +1,161 @@ + + + +void reduce_local(__local int* minmax, int lid); + + +__kernel void buff_advec(__global const float* gvelo, + __global float* ppos, + __global float* buffer_l, + __global float* buffer_r, + __global int* minmax_global, + __local float* velocity_cache, + __local float* buff_l_loc, + __local float* buff_r_loc, + float dt, float min_position, float4 dx, float4 v_dx, + int4 l_nb, int4 r_nb) +{ + int gidX = get_global_id(0); /* OpenCL work-itme global index (X) */ + int gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + float invdx = 1.0/dx.x; /* Space step inverse */ + float v_invdx = 1.0/v_dx.x; /* Space step inverse */ + int i; /* Particle index in 1D problem */ + int line_index ; /* Current 1D problem index */ + + float v,vp,p,c, hdt = 0.5 * dt; + int i_ind, i_ind_p; + + int l_start_x = minmax_global[L_MIN_X]; + int l_start_y = minmax_global[L_MIN_Y]; + int l_start_z = minmax_global[L_MIN_Z]; + int r_start_x = minmax_global[R_MIN_X]; + int r_start_y = minmax_global[R_MIN_Y]; + int r_start_z = minmax_global[R_MIN_Z]; + int4 l_nb_used = (int4)(minmax_global[L_MAX_X] - l_start_x + 1, + minmax_global[L_MAX_Y] - l_start_y + 1, + minmax_global[L_MAX_Z] - l_start_z + 1, + 0); + int4 r_nb_used = (int4)(minmax_global[R_MAX_X] - r_start_x + 1, + minmax_global[R_MAX_Y] - r_start_y + 1, + minmax_global[R_MAX_Z] - r_start_z + 1, + 0); + + __local float* loc_ptr; + + if ((gidY>=l_start_y && gidY<=minmax_global[L_MAX_Y]) && (gidZ>=l_start_z && 
gidZ<=minmax_global[L_MAX_Z])){ + for(i=gidX; i<l_nb_used.x; i+=(WI_NB)){ + buff_l_loc[i] = buffer_l[i+(gidY-l_start_y)*l_nb.x + (gidZ-l_start_z)*l_nb.x*l_nb.y];}} + + if ((gidY>=r_start_y && gidY<=minmax_global[R_MAX_Y]) && (gidZ>=r_start_z && gidZ<=minmax_global[R_MAX_Z])){ + for(i=gidX; i<r_nb_used.x; i+=(WI_NB)){ + buff_r_loc[i] = buffer_r[i+(gidY-r_start_y)*r_nb.x + (gidZ-r_start_z)*r_nb.x*r_nb.y];}} + + line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; + for(i=gidX; i<V_NB_I; i+=(WI_NB)) + { + /* Read velocity */ + /* Fill velocity cache */ + velocity_cache[noBC_id(i)] = gvelo[i+line_index]; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + line_index = gidY*NB_I+gidZ*NB_I*NB_II; + for(i=gidX; i<NB_I; i+=WI_NB) + { + c = i * dx.x + min_position; + v = velocity_cache[noBC_id(i + V_GHOSTS_NB)]; + p = (c + hdt*v) * v_invdx; + + i_ind = convert_int_rtn(p); + p = p - convert_float(i_ind); + i_ind_p = i_ind + 1; + loc_ptr = (i_ind>=(V_START_INDEX-V_GHOSTS_NB) && i_ind <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache + noBC_id(i_ind - (V_START_INDEX-V_GHOSTS_NB)) : (i_ind<(V_START_INDEX-V_GHOSTS_NB)) ? buff_l_loc+i_ind-l_start_x : buff_r_loc+i_ind-r_start_x ; + v = *loc_ptr; + + loc_ptr = (i_ind_p>=(V_START_INDEX-V_GHOSTS_NB) && i_ind_p <= (V_STOP_INDEX+V_GHOSTS_NB)) ? velocity_cache+noBC_id(i_ind_p - (V_START_INDEX-V_GHOSTS_NB)) : (i_ind_p<(V_START_INDEX-V_GHOSTS_NB)) ? 
buff_l_loc+i_ind_p-l_start_x : buff_r_loc+i_ind_p-r_start_x ; + vp = *loc_ptr; + + v = (p*(vp-v) + v); + p = c + dt * v; + ppos[i+line_index] = p; + } + +} + + +__kernel void reduce_stage1_advec(__global const float* gvelo, + __global int* minmax_buffer, + __local float* velocity_cache, + __local int* minmax, + float dt, float min_position, float4 dx, float4 v_dx) +{ + int gidY, gidZ = get_global_id(2); + int lid = get_global_id(0); + int my_minmax[12] = {1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30}; + int i, line_index; + int ix; + float p, c, v; + bool is_l, is_r; + + for(gidY=0;gidY<NB_II;gidY++) + { + line_index = gidY*V_NB_I + gidZ*V_NB_I*V_NB_II; + for (i=lid; i<NB_I; i+=WI_NB_REDUCE) + { + c = i * dx.x + min_position; + v = gvelo[i+line_index]; + p = (c + 0.5 * dt * v) / v_dx.x; + + ix = convert_int_rtn(p); + + is_l = ix<(V_START_INDEX-V_GHOSTS_NB); + is_r = ix>(V_STOP_INDEX+V_GHOSTS_NB); + + my_minmax[L_MIN_X] = (is_l && my_minmax[L_MIN_X]>ix) ? ix : my_minmax[L_MIN_X]; + my_minmax[L_MAX_X] = (is_l && my_minmax[L_MAX_X]<ix) ? ix : my_minmax[L_MAX_X]; + my_minmax[L_MIN_Y] = (is_l && my_minmax[L_MIN_Y]>gidY) ? gidY : my_minmax[L_MIN_Y]; + my_minmax[L_MAX_Y] = (is_l && my_minmax[L_MAX_Y]<gidY) ? gidY : my_minmax[L_MAX_Y]; + my_minmax[L_MIN_Z] = (is_l && my_minmax[L_MIN_Z]>gidZ) ? gidZ : my_minmax[L_MIN_Z]; + my_minmax[L_MAX_Z] = (is_l && my_minmax[L_MAX_Z]<gidZ) ? gidZ : my_minmax[L_MAX_Z]; + + my_minmax[R_MIN_X] = (is_r && my_minmax[R_MIN_X]>ix) ? ix : my_minmax[R_MIN_X]; + my_minmax[R_MAX_X] = (is_r && my_minmax[R_MAX_X]<ix) ? ix : my_minmax[R_MAX_X]; + my_minmax[R_MIN_Y] = (is_r && my_minmax[R_MIN_Y]>gidY) ? gidY : my_minmax[R_MIN_Y]; + my_minmax[R_MAX_Y] = (is_r && my_minmax[R_MAX_Y]<gidY) ? gidY : my_minmax[R_MAX_Y]; + my_minmax[R_MIN_Z] = (is_r && my_minmax[R_MIN_Z]>gidZ) ? gidZ : my_minmax[R_MIN_Z]; + my_minmax[R_MAX_Z] = (is_r && my_minmax[R_MAX_Z]<gidZ) ? 
gidZ : my_minmax[R_MAX_Z]; + + ix = ix + 1; + is_l = ix<(V_START_INDEX-V_GHOSTS_NB); + is_r = ix>(V_STOP_INDEX+V_GHOSTS_NB); + + my_minmax[L_MIN_X] = (is_l && my_minmax[L_MIN_X]>ix) ? ix : my_minmax[L_MIN_X]; + my_minmax[L_MAX_X] = (is_l && my_minmax[L_MAX_X]<ix) ? ix : my_minmax[L_MAX_X]; + my_minmax[L_MIN_Y] = (is_l && my_minmax[L_MIN_Y]>gidY) ? gidY : my_minmax[L_MIN_Y]; + my_minmax[L_MAX_Y] = (is_l && my_minmax[L_MAX_Y]<gidY) ? gidY : my_minmax[L_MAX_Y]; + my_minmax[L_MIN_Z] = (is_l && my_minmax[L_MIN_Z]>gidZ) ? gidZ : my_minmax[L_MIN_Z]; + my_minmax[L_MAX_Z] = (is_l && my_minmax[L_MAX_Z]<gidZ) ? gidZ : my_minmax[L_MAX_Z]; + + my_minmax[R_MIN_X] = (is_r && my_minmax[R_MIN_X]>ix) ? ix : my_minmax[R_MIN_X]; + my_minmax[R_MAX_X] = (is_r && my_minmax[R_MAX_X]<ix) ? ix : my_minmax[R_MAX_X]; + my_minmax[R_MIN_Y] = (is_r && my_minmax[R_MIN_Y]>gidY) ? gidY : my_minmax[R_MIN_Y]; + my_minmax[R_MAX_Y] = (is_r && my_minmax[R_MAX_Y]<gidY) ? gidY : my_minmax[R_MAX_Y]; + my_minmax[R_MIN_Z] = (is_r && my_minmax[R_MIN_Z]>gidZ) ? gidZ : my_minmax[R_MIN_Z]; + my_minmax[R_MAX_Z] = (is_r && my_minmax[R_MAX_Z]<gidZ) ? 
gidZ : my_minmax[R_MAX_Z]; + } + } + + for(i=0;i<12;i++) + minmax[lid*12+i] = my_minmax[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + reduce_local(minmax, lid); + + if (lid == 0) + for(i=0;i<12;i++) + minmax_buffer[gidZ*12+i] = minmax[i]; +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl new file mode 100644 index 000000000..6b503bc21 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_diffusion.cl @@ -0,0 +1,150 @@ + +/* Diffusion kernel: for each Z plane, loads an XY tile of scal_in plus a one-cell border into local memory and writes a 7-point stencil result to scal_out. Border and Z-neighbour values are read from scal_in or from the ghosts/ghostsZ buffers depending on CUT_DIR_Y and CUT_DIR_Z. */ +__kernel void diffusion(__global const float* scal_in, + __global const float* ghosts, +#if (CUT_DIR_Y + CUT_DIR_Y )== 2 /* NOTE(review): CUT_DIR_Y is summed twice; ghostsZ is only read when CUT_DIR_Y and CUT_DIR_Z are both 1, so CUT_DIR_Y + CUT_DIR_Z is presumably intended - confirm against the host-side argument list before changing */ + __global const float* ghostsZ, +#endif + __global float* scal_out, + float nudt, float4 dx) +{ + int t_gidX = get_group_id(0); + int t_gidY = get_group_id(1); + int lidX = get_local_id(0); + int lidY = get_local_id(1); + int gidX = t_gidX*TILE_SIZE + lidX; /* OpenCL work-item global index (X) */ + int gidY = t_gidY*TILE_SIZE + lidY; /* OpenCL work-item global index (Y) */ + int gidZ; + float cx = nudt/(dx.x*dx.x); + float cy = nudt/(dx.y*dx.y); + float cz = nudt/(dx.z*dx.z); + float scal_z_m[NB_PART]; + float scal_z[NB_PART]; + float scal_z_p[NB_PART]; + float s; + uint i; + + for(i=0;i<NB_PART;i++){ +#if CUT_DIR_Z == 1 +#if CUT_DIR_Y == 1 + scal_z_m[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X + NB_X*NB_Y]; +#else + scal_z_m[i] = ghosts[gidX + (gidY+i*L_WIDTH)*NB_X + NB_X*NB_Y]; +#endif +#else + scal_z_m[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (NB_Z-1)*NB_X*NB_Y]; +#endif + scal_z[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; + } + + /* scal_z[nb_parts] */ + /* for i in xrange(nb_parts) */ + /* lidY+i*L_WIDTH */ + /* gidY+i*L_WIDTH */ + + __local float tile_XY[TILE_SIZE+2][TILE_SIZE+2]; + + lidX += 1; + lidY += 1; + + // loop over Z indices but last. 
+ for (gidZ=0; gidZ<(NB_Z-1); gidZ++) + { + for(i=0;i<NB_PART;i++){ + // fill the tile + tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + + /* fill tile edges in X: slot 0 is the column before the tile, slot TILE_SIZE+1 the column after it, i.e. (t_gidX+1)*TILE_SIZE, both wrapped modulo NB_X */ + tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + } +#if CUT_DIR_Y == 1 + tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + NB_X + gidZ*NB_X*2]; + tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2]; +#else + tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; +#endif + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++){ + /* get scalar value in Z direction */ + scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (gidZ+1)*NB_X*NB_Y]; + + // Compute stencil + // central point + s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); + + s += cz*(scal_z_m[i] + scal_z_p[i]); + + s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1]; + s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1]; + s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH]; + s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH]; + + // write result + scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++){ + // Shift Z values + scal_z_m[i] = scal_z[i]; + scal_z[i] = scal_z_p[i]; + } + } + + // Compute last point (from ghosts) + gidZ = NB_Z - 1; + + for(i=0;i<NB_PART;i++){ + // fill the tile + tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + + /* 
fill tile edges in X: slot TILE_SIZE+1 is column (t_gidX+1)*TILE_SIZE, wrapped modulo NB_X */ + tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + } +#if CUT_DIR_Y == 1 + tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + NB_X + gidZ*NB_X*2]; + tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2]; +#else + tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; +#endif + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++){ + /* // get scalar value in Z direction */ +#if CUT_DIR_Z == 1 +#if CUT_DIR_Y == 1 + scal_z_p[i] = ghostsZ[gidX + (gidY+i*L_WIDTH)*NB_X]; +#else + scal_z_p[i] = ghosts[gidX + (gidY+i*L_WIDTH)*NB_X]; +#endif +#else + scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; +#endif + + // Compute stencil + /* // central point */ + s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); + + s += cz*(scal_z_m[i] + scal_z_p[i]); + + s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1]; + s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1]; + s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH]; + s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH]; + + // write result + scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; + } +} + diff --git a/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl new file mode 100644 index 000000000..2081d8235 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/comm_remeshing_noVec.cl @@ -0,0 +1,253 @@ +/** + * @file comm_remeshing_noVec.cl + * Remeshing kernel. 
+ */ + +void reduce_local(__local int* minmax, int lid); + +/** + * Kernel to reduce the particle minimums and maximums indices (minmax values) of grid points out of the local domain that are concerned by particles remeshing contributions. + * + * @param ppos : particles positions. + * @param minmax_buffer : global array to store minmax values for each XY plane. + * @param dx : mesh step size. + */ +__kernel void reduce_stage1_rmsh(__global const float* ppos, + __global int* minmax_buffer, + __local int* minmax, + float dx) +{ + int gidY, gidZ = get_global_id(2); + int lid = get_global_id(0); + int my_minmax[12] = {1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30}; + int i, s; + int ix,ix_s; + float p; + bool is_l, is_r; + + for(gidY=0;gidY<NB_II;gidY++) + { + for (i=lid; i<NB_I; i+=WI_NB_REDUCE) + { + p = (ppos[gidZ*NB_I*NB_II + gidY*NB_I + i])/dx; + ix = convert_int_rtn(p) - REMESH_SHIFT; + + for (s=0;s<2*(REMESH_SHIFT+1); s++) + { + ix_s = ix + s; + is_l = ix_s<START_INDEX; + is_r = ix_s>STOP_INDEX; + + my_minmax[L_MIN_X] = (is_l && my_minmax[L_MIN_X]>ix_s) ? ix_s : my_minmax[L_MIN_X]; + my_minmax[L_MAX_X] = (is_l && my_minmax[L_MAX_X]<ix_s) ? ix_s : my_minmax[L_MAX_X]; + my_minmax[L_MIN_Y] = (is_l && my_minmax[L_MIN_Y]>gidY) ? gidY : my_minmax[L_MIN_Y]; + my_minmax[L_MAX_Y] = (is_l && my_minmax[L_MAX_Y]<gidY) ? gidY : my_minmax[L_MAX_Y]; + my_minmax[L_MIN_Z] = (is_l && my_minmax[L_MIN_Z]>gidZ) ? gidZ : my_minmax[L_MIN_Z]; + my_minmax[L_MAX_Z] = (is_l && my_minmax[L_MAX_Z]<gidZ) ? gidZ : my_minmax[L_MAX_Z]; + + my_minmax[R_MIN_X] = (is_r && my_minmax[R_MIN_X]>ix_s) ? ix_s : my_minmax[R_MIN_X]; + my_minmax[R_MAX_X] = (is_r && my_minmax[R_MAX_X]<ix_s) ? ix_s : my_minmax[R_MAX_X]; + my_minmax[R_MIN_Y] = (is_r && my_minmax[R_MIN_Y]>gidY) ? gidY : my_minmax[R_MIN_Y]; + my_minmax[R_MAX_Y] = (is_r && my_minmax[R_MAX_Y]<gidY) ? gidY : my_minmax[R_MAX_Y]; + my_minmax[R_MIN_Z] = (is_r && my_minmax[R_MIN_Z]>gidZ) ? 
gidZ : my_minmax[R_MIN_Z]; + my_minmax[R_MAX_Z] = (is_r && my_minmax[R_MAX_Z]<gidZ) ? gidZ : my_minmax[R_MAX_Z]; + } + } + } + + for(i=0;i<12;i++) + minmax[lid*12+i] = my_minmax[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + reduce_local(minmax, lid); + + if (lid == 0) + for(i=0;i<12;i++) + minmax_buffer[gidZ*12+i] = minmax[i]; +} + +/** + * Performs remeshing of the particles' scalar. + * A work-group is handling a 1D problem. Thus, gidY and gidZ are constants among work-items of a work-group. + * Each work-item computes <code>NB_I/WI_NB</code> particles positions. To avoid concurrent witings, in case of strong velocity gradients, work-items computes contiguous particles. + * Particle are computed through OpenCL vector types of lenght 2, 4 or 8. + * Scalar results are stored in a local buffer as a cache and then copied to global memory buffer. + * + * @param ppos Particle position + * @param pscal Particle scalar + * @param gscal Grid scalar + * @param buffer_l Buffer for storing out of domain contributions (to left) + * @param buffer_r Buffer for storing out of domain contributions (to right) + * @param min_position Domain lower coordinate + * @param dx Space step + * @param l_nb buffer_l sizes + * @param r_nb buffer_r sizes + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark <code>WI_NB</code> corresponds to the work-item number. + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. + * @remark <code>__RCOMP_I</code> flag is for instruction expansion for the different remeshed components. + * @remark <code>__RCOMP_P</code> flag is for function parameter expansion for the different remeshed components. + * @remark <code>__ID__</code> is replaced by the remeshed component id in an expansion. 
+ * @see parmepy.gpu.tools.parse_file + */ +__kernel void buff_remesh(__global const float* ppos, + __global const float* pscal, + __global float* gscal, + __global float* buffer_l, + __global float* buffer_r, + __global int* minmax_global, + __local float* gscal_loc, + __local float* l_buff_loc, + __local float* r_buff_loc, + float min_position, float dx, + int4 l_nb, int4 r_nb + ) +{ + int lid = get_local_id(0); /* OpenCL work-itme global index (X) */ + int gidY = get_global_id(1); /* OpenCL work-itme global index (Y) */ + int gidZ = get_global_id(2); /* OpenCL work-itme global index (Z) */ + float invdx = 1.0/dx; /* Space step inverse */ + int i; /* Particle index in 1D problem */ + float p; /* Particle position */ + float s; /* Particle scalar */ +float y; /* Normalized distance to nearest left grid point */ + int ind; /* Integer coordinate */ + int index; /* Remeshing index */ + float w; + + + int l_start_x = minmax_global[L_MIN_X]; + int l_start_y = minmax_global[L_MIN_Y]; + int l_start_z = minmax_global[L_MIN_Z]; + int r_start_x = minmax_global[R_MIN_X]; + int r_start_y = minmax_global[R_MIN_Y]; + int r_start_z = minmax_global[R_MIN_Z]; + int4 l_nb_used = (int4)(minmax_global[L_MAX_X] - l_start_x + 1, + minmax_global[L_MAX_Y] - l_start_y + 1, + minmax_global[L_MAX_Z] - l_start_z + 1, + 0); + int4 r_nb_used = (int4)(minmax_global[R_MAX_X] - r_start_x + 1, + minmax_global[R_MAX_Y] - r_start_y + 1, + minmax_global[R_MAX_Z] - r_start_z + 1, + 0); + + uint line_index = gidY*NB_I+ gidZ*NB_I*NB_II; /* Current 1D problem index */ + + __local float* loc_ptr; + + // Initialize buffers + if((lid < l_nb_used.x)) + l_buff_loc[lid] = 0.0; + if((lid < r_nb_used.x)) + r_buff_loc[lid] = 0.0; + + for(i=lid; i<NB_I; i+=WI_NB) + { + /* Initialize result buffer */ + gscal_loc[i] = 0.0; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=lid*PART_NB_PER_WI; i<(lid + 1)*PART_NB_PER_WI; i+=1) + { + /* Read particle position */ + p = ppos[i + line_index]; 
+ /* Read particle scalar */ + s = pscal[i + line_index]; + /* Remesh particle */ + + ind = convert_int_rtn(p * invdx); + y = (p - convert_float(ind) * dx) * invdx; + + index = ind - REMESH_SHIFT; + + w = REMESH(alpha)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(beta)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(gamma)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(delta)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = index + 1; + w = REMESH(eta)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(zeta)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? 
l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = index + 1; + w = REMESH(theta)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + w = REMESH(iota)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = index + 1; + w = REMESH(kappa)(y); + loc_ptr = (index>=START_INDEX && index <= STOP_INDEX) ? gscal_loc +noBC_id(index-START_INDEX) : ( (index<START_INDEX)? l_buff_loc+index-l_start_x : r_buff_loc + index-r_start_x ); + w = w * s; + (*loc_ptr) += w; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=lid; i<NB_I; i+=WI_NB) + { + /* Store result */ + gscal[i + line_index] = gscal_loc[noBC_id(i)]; + } + + // Store buffers: each global buffer is strided by its own sizes (l_nb for buffer_l, r_nb for buffer_r) + if((lid < l_nb_used.x) && (gidY<l_nb_used.y) && (gidZ<l_nb_used.z)) + buffer_l[lid + gidY*l_nb.x + gidZ*l_nb.x*l_nb.y] = l_buff_loc[lid]; + if((lid < r_nb_used.x) && (gidY<r_nb_used.y) && (gidZ<r_nb_used.z)) + buffer_r[lid + gidY*r_nb.x + gidZ*r_nb.x*r_nb.y] = r_buff_loc[lid]; + +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl new file mode 100644 index 000000000..d764a3a9b --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/diffusion.cl @@ -0,0 +1,119 @@ + + +__kernel void diffusion(__global const float* scal_in, + __global float* scal_out, + float nudt, float4 dx) +{ + int t_gidX = get_group_id(0); + int t_gidY = get_group_id(1); + int lidX = get_local_id(0); 
+ int lidY = get_local_id(1); + int gidX = t_gidX*TILE_SIZE + lidX; /* OpenCL work-item global index (X) */ + int gidY = t_gidY*TILE_SIZE + lidY; /* OpenCL work-item global index (Y) */ + int gidZ; + float cx = nudt/(dx.x*dx.x); + float cy = nudt/(dx.y*dx.y); + float cz = nudt/(dx.z*dx.z); + float scal_z_m[NB_PART]; + float scal_z[NB_PART]; + float scal_z_p[NB_PART]; + float s; + uint i; + + for(i=0;i<NB_PART;i++){ + scal_z_m[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (NB_Z-1)*NB_X*NB_Y]; /* Z wraps around: the plane below z=0 is the last plane, NB_Z-1 */ + scal_z[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; + } + + /* scal_z[nb_parts] */ + /* for i in xrange(nb_parts) */ + /* lidY+i*L_WIDTH */ + /* gidY+i*L_WIDTH */ + + __local float tile_XY[TILE_SIZE+2][TILE_SIZE+2]; + + lidX += 1; + lidY += 1; + + // loop over Z indices but last. + for (gidZ=0; gidZ<(NB_Z-1); gidZ++) + { + for(i=0;i<NB_PART;i++){ + // fill the tile + tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + + /* fill tile edges in X: slot 0 is the column before the tile, slot TILE_SIZE+1 the column after it, i.e. (t_gidX+1)*TILE_SIZE, both wrapped modulo NB_X */ + tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + } + tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++){ + /* get scalar value in Z direction */ + scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + (gidZ+1)*NB_X*NB_Y]; + + // Compute stencil + // central point + s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); + + s += cz*(scal_z_m[i] + scal_z_p[i]); + + s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1]; + s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1]; + s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH]; + s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH]; + + // write result + 
scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; + } + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++){ + // Shift Z values + scal_z_m[i] = scal_z[i]; + scal_z[i] = scal_z_p[i]; + } + } + + // Compute last plane (its upper Z neighbour wraps around to plane 0) + gidZ = NB_Z - 1; + + for(i=0;i<NB_PART;i++){ + // fill the tile + tile_XY[lidX][lidY+i*L_WIDTH] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + + /* fill tile edges in X: slot TILE_SIZE+1 is column (t_gidX+1)*TILE_SIZE, wrapped modulo NB_X */ + tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y]; + } + tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y]; + + /* Synchronize work-group */ + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=0;i<NB_PART;i++){ + /* // get scalar value in Z direction */ + scal_z_p[i] = scal_in[gidX + (gidY+i*L_WIDTH)*NB_X]; + + // Compute stencil + /* // central point */ + s = scal_z[i] * (1.0 - 2.0 * (cx + cy + cz)); + + s += cz*(scal_z_m[i] + scal_z_p[i]); + + s += cy * tile_XY[lidX][lidY+i*L_WIDTH-1]; + s += cy * tile_XY[lidX][lidY+i*L_WIDTH+1]; + s += cx * tile_XY[lidX-1][lidY+i*L_WIDTH]; + s += cx * tile_XY[lidX+1][lidY+i*L_WIDTH]; + + // write result + scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s; + } +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/minmax_buffers.cl b/HySoP/hysop/gpu/cl_src/kernels/minmax_buffers.cl new file mode 100644 index 000000000..558fef5c9 --- /dev/null +++ b/HySoP/hysop/gpu/cl_src/kernels/minmax_buffers.cl @@ -0,0 +1,84 @@ +/** + * @file minmax_buffers.cl + * + * @brief reduction kernels. + * + * + */ + +/** + * Performs a global reduction of an array of minmax values. 
+ * + * @param minmax_buffer : minmax values to reduce + * @param minmax_global : Result of the reduction (12 integers) + * + * @remark : A single stage kernel can be written but it seems to be 5 time slower + */ +__kernel void reduce_stage2(__global const int* minmax_buffer, + __global int* minmax_global, + __local int* minmax) +{ + int lid = get_global_id(0); + int my_minmax[12] = {1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30, 1<<30, -1<<30}; + int i; + + for (i=lid; i<NB_III; i+=WI_NB_REDUCE) + { + my_minmax[L_MIN_X] = (my_minmax[L_MIN_X] < minmax_buffer[12*i + L_MIN_X] ) ? my_minmax[L_MIN_X] : minmax_buffer[12*i + L_MIN_X]; + my_minmax[L_MAX_X] = (my_minmax[L_MAX_X] > minmax_buffer[12*i + L_MAX_X]) ? my_minmax[L_MAX_X] : minmax_buffer[12*i + L_MAX_X]; + my_minmax[L_MIN_Y] = (my_minmax[L_MIN_Y] < minmax_buffer[12*i + L_MIN_Y]) ? my_minmax[L_MIN_Y] : minmax_buffer[12*i + L_MIN_Y]; + my_minmax[L_MAX_Y] = (my_minmax[L_MAX_Y] > minmax_buffer[12*i + L_MAX_Y]) ? my_minmax[L_MAX_Y] : minmax_buffer[12*i + L_MAX_Y]; + my_minmax[L_MIN_Z] = (my_minmax[L_MIN_Z] < minmax_buffer[12*i + L_MIN_Z]) ? my_minmax[L_MIN_Z] : minmax_buffer[12*i + L_MIN_Z]; + my_minmax[L_MAX_Z] = (my_minmax[L_MAX_Z] > minmax_buffer[12*i + L_MAX_Z]) ? my_minmax[L_MAX_Z] : minmax_buffer[12*i + L_MAX_Z]; + + my_minmax[R_MIN_X] = (my_minmax[R_MIN_X] < minmax_buffer[12*i + R_MIN_X]) ? my_minmax[R_MIN_X] : minmax_buffer[12*i + R_MIN_X]; + my_minmax[R_MAX_X] = (my_minmax[R_MAX_X] > minmax_buffer[12*i + R_MAX_X]) ? my_minmax[R_MAX_X] : minmax_buffer[12*i + R_MAX_X]; + my_minmax[R_MIN_Y] = (my_minmax[R_MIN_Y] < minmax_buffer[12*i + R_MIN_Y]) ? my_minmax[R_MIN_Y] : minmax_buffer[12*i + R_MIN_Y]; + my_minmax[R_MAX_Y] = (my_minmax[R_MAX_Y] > minmax_buffer[12*i + R_MAX_Y]) ? my_minmax[R_MAX_Y] : minmax_buffer[12*i + R_MAX_Y]; + my_minmax[R_MIN_Z] = (my_minmax[R_MIN_Z] < minmax_buffer[12*i + R_MIN_Z]) ? 
my_minmax[R_MIN_Z] : minmax_buffer[12*i + R_MIN_Z]; + my_minmax[R_MAX_Z] = (my_minmax[R_MAX_Z] > minmax_buffer[12*i + R_MAX_Z]) ? my_minmax[R_MAX_Z] : minmax_buffer[12*i + R_MAX_Z]; + } + + for(i=0;i<12;i++) + minmax[lid*12+i] = my_minmax[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + reduce_local(minmax, lid); + + for(i=0;i<12;i++) + minmax_global[i] = minmax[i]; +} + + +/** + * Function to reduce in local memory of minmax values. The array must contains one value (12 integers) per work-items of a single work-group. + * At each step of the reduction, the last half part of the non-reduced arrays is compaired and reduced into the first half part and the non reduced array becomes the first half part. + * + * @param minmax : array of minmax values (12 integers each). + * @param lid : local index of the work-item. + */ +void reduce_local(__local int* minmax, int lid) +{ + int offset; + for(offset=WI_NB_REDUCE/2; offset>0; offset=offset/2) + { + if(lid < offset) + { + minmax[lid*12 + L_MIN_X] = (minmax[lid*12 + L_MIN_X] < minmax[(lid+offset)*12 + L_MIN_X]) ? minmax[lid*12 + L_MIN_X] : minmax[(lid+offset)*12 + L_MIN_X]; + minmax[lid*12 + R_MIN_X] = (minmax[lid*12 + R_MIN_X] < minmax[(lid+offset)*12 + R_MIN_X]) ? minmax[lid*12 + R_MIN_X] : minmax[(lid+offset)*12 + R_MIN_X]; + minmax[lid*12 + L_MIN_Y] = (minmax[lid*12 + L_MIN_Y] < minmax[(lid+offset)*12 + L_MIN_Y]) ? minmax[lid*12 + L_MIN_Y] : minmax[(lid+offset)*12 + L_MIN_Y]; + minmax[lid*12 + R_MIN_Y] = (minmax[lid*12 + R_MIN_Y] < minmax[(lid+offset)*12 + R_MIN_Y]) ? minmax[lid*12 + R_MIN_Y] : minmax[(lid+offset)*12 + R_MIN_Y]; + minmax[lid*12 + L_MIN_Z] = (minmax[lid*12 + L_MIN_Z] < minmax[(lid+offset)*12 + L_MIN_Z]) ? minmax[lid*12 + L_MIN_Z] : minmax[(lid+offset)*12 + L_MIN_Z]; + minmax[lid*12 + R_MIN_Z] = (minmax[lid*12 + R_MIN_Z] < minmax[(lid+offset)*12 + R_MIN_Z]) ? 
minmax[lid*12 + R_MIN_Z] : minmax[(lid+offset)*12 + R_MIN_Z]; + + minmax[lid*12 + L_MAX_X] = (minmax[lid*12 + L_MAX_X] > minmax[(lid+offset)*12 + L_MAX_X]) ? minmax[lid*12 + L_MAX_X] : minmax[(lid+offset)*12 + L_MAX_X]; + minmax[lid*12 + R_MAX_X] = (minmax[lid*12 + R_MAX_X] > minmax[(lid+offset)*12 + R_MAX_X]) ? minmax[lid*12 + R_MAX_X] : minmax[(lid+offset)*12 + R_MAX_X]; + minmax[lid*12 + L_MAX_Y] = (minmax[lid*12 + L_MAX_Y] > minmax[(lid+offset)*12 + L_MAX_Y]) ? minmax[lid*12 + L_MAX_Y] : minmax[(lid+offset)*12 + L_MAX_Y]; + minmax[lid*12 + R_MAX_Y] = (minmax[lid*12 + R_MAX_Y] > minmax[(lid+offset)*12 + R_MAX_Y]) ? minmax[lid*12 + R_MAX_Y] : minmax[(lid+offset)*12 + R_MAX_Y]; + minmax[lid*12 + L_MAX_Z] = (minmax[lid*12 + L_MAX_Z] > minmax[(lid+offset)*12 + L_MAX_Z]) ? minmax[lid*12 + L_MAX_Z] : minmax[(lid+offset)*12 + L_MAX_Z]; + minmax[lid*12 + R_MAX_Z] = (minmax[lid*12 + R_MAX_Z] > minmax[(lid+offset)*12 + R_MAX_Z]) ? minmax[lid*12 + R_MAX_Z] : minmax[(lid+offset)*12 + R_MAX_Z]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl index bf2cf7d2f..1bd8fb5b8 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing.cl @@ -53,11 +53,11 @@ __kernel void remeshing_kernel(__global const float* ppos, for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=__N__) { /* Read particle position */ - p = vload__N__((i + line_index)/__N__, ppos) - (float__N__)(min_position); + p = vload__N__((i + line_index)/__N__, ppos); /* Read particle scalar */ __RCOMP_Is__ID__ = vload__N__((i + line_index)/__N__, pscal__ID__); /* Remesh particle */ - remesh(i, dx, invdx, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__); + remesh(i, dx, invdx, __RCOMP_Ps__ID__, p, min_position, __RCOMP_Pgscal_loc__ID__); } /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl 
index faa18986c..0ec701775 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/remeshing_noVec.cl @@ -53,11 +53,11 @@ __kernel void remeshing_kernel(__global const float* ppos, for(i=gidX*PART_NB_PER_WI; i<(gidX + 1)*PART_NB_PER_WI; i+=1) { /* Read particle position */ - p = ppos[i + line_index] - min_position; + p = ppos[i + line_index]; /* Read particle scalar */ __RCOMP_Is__ID__ = pscal__ID__[i + line_index]; /* Remesh particle */ - remesh(i, dx, invdx, __RCOMP_Ps__ID__, p, __RCOMP_Pgscal_loc__ID__); + remesh(i, dx, invdx, __RCOMP_Ps__ID__, p, min_position, __RCOMP_Pgscal_loc__ID__); } /* Synchronize work-group */ diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic.cl index 71cf2163d..394cb1d23 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic.cl @@ -3,7 +3,7 @@ * Remeshing function, vectorized version. */ -void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__N__ p, __RCOMP_P__local float* gscal_loc__ID__); +void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__N__ p, float min_position, __RCOMP_P__local float* gscal_loc__ID__); /** @@ -33,12 +33,15 @@ void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__ void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__N__ p, + float min_position, __RCOMP_P__local float* gscal_loc__ID__){ float__N__ y; /* Normalized distance to nearest left grid point */ int__N__ ind; /* Integer coordinate */ uint__N__ index; /* Remeshing index */ float w__NN__; + p = p - min_position; + ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl index 61b3cf0b9..02f6562e5 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl +++ 
b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl @@ -3,7 +3,7 @@ * Remeshing function, vectorized version. */ -void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, __RCOMP_P__local float* gscal_loc__ID__); +void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, float min_position, __RCOMP_P__local float* gscal_loc__ID__); /** @@ -33,13 +33,14 @@ void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, __R void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, + float min_position, __RCOMP_P__local float* gscal_loc__ID__){ float y; /* Normalized distance to nearest left grid point */ int ind; /* Integer coordinate */ uint index; /* Remeshing index */ float w; - + p = p - min_position; ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl deleted file mode 100644 index abb672668..000000000 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl +++ /dev/null @@ -1,111 +0,0 @@ -/** - * @file basic_noVec_vector_2d.cl - * Remeshing function, vectorized version for 2D vector remeshing. - */ - -void remesh(uint i, float dx, float invdx, - float v_X, float v_Y, - float p, - __local float* gvec_X_loc, __local float* gvec_Y_loc); - - -/** - * Remesh particles in local buffer. - * - * Remeshing formula is given a compiling time. - * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. - * - * @param i Particle index - * @param dx Space step - * @param invdx 1/dx - * @param s Particle scalar - * @param p Particle position - * @param gscal_loc Local buffer for result - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>__N__</code> is expanded at compilation time by vector width. 
- * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} - * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) - * @see parmepy.gpu.tools.parse_file - * @see parmepy.gpu.cl_src.common - */ -void remesh(uint i, float dx, float invdx, - float v_X, float v_Y, - float p, - __local float* gvec_X_loc, __local float* gvec_Y_loc){ - float y; /* Normalized distance to nearest left grid point */ - int ind; /* Integer coordinate */ - uint index; /* Remeshing index */ - float w; - - ind = convert_int_rtn(p * invdx); - y = (p - convert_float(ind) * dx) * invdx; - - index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I); - - w = REMESH(alpha)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(beta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(gamma)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(delta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - -#if REMESH_SHIFT > 1 - index = (index + 1) % NB_I; - w = REMESH(eta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(zeta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 2 - 
index = (index + 1) % NB_I; - w = REMESH(theta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(iota)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 3 - index = (index + 1) % NB_I; - w = REMESH(kappa)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(mu)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl deleted file mode 100644 index c912769d9..000000000 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl +++ /dev/null @@ -1,121 +0,0 @@ -/** - * @file basic_noVec_vector_3d.cl - * Remeshing function, vectorized version for 3D vector remeshing. - */ - -void remesh(uint i, float dx, float invdx, - float v_X, float v_Y, float v_Z, - float p, - __local float* gvec_X_loc, __local float* gvec_Y_loc, __local float* gvec_Z_loc); - - -/** - * Remesh particles in local buffer. - * - * Remeshing formula is given a compiling time. - * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. - * - * @param i Particle index - * @param dx Space step - * @param invdx 1/dx - * @param s Particle scalar - * @param p Particle position - * @param gscal_loc Local buffer for result - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>__N__</code> is expanded at compilation time by vector width. 
- * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} - * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) - * @see parmepy.gpu.tools.parse_file - * @see parmepy.gpu.cl_src.common - */ -void remesh(uint i, float dx, float invdx, - float v_X, float v_Y, float v_Z, - float p, - __local float* gvec_X_loc, __local float* gvec_Y_loc, __local float* gvec_Z_loc){ - float y; /* Normalized distance to nearest left grid point */ - int ind; /* Integer coordinate */ - uint index; /* Remeshing index */ - float w; - - ind = convert_int_rtn(p * invdx); - y = (p - convert_float(ind) * dx) * invdx; - - index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I); - - w = REMESH(alpha)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(beta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(gamma)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(delta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - -#if REMESH_SHIFT > 1 - index = (index + 1) % NB_I; - w = REMESH(eta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += 
(w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(zeta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 2 - index = (index + 1) % NB_I; - w = REMESH(theta)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(iota)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 3 - index = (index + 1) % NB_I; - w = REMESH(kappa)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(mu)(y); - gvec_X_loc[noBC_id(index)] += (w * v_X); - gvec_Y_loc[noBC_id(index)] += (w * v_Y); - gvec_Z_loc[noBC_id(index)] += (w * v_Z); - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl deleted file mode 100644 index da8d9234b..000000000 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl +++ /dev/null @@ -1,111 +0,0 @@ -/** - * @file basic_vector_2d.cl - * Remeshing function, vectorized version for vector remeshing in 2D. - */ - -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc); - - -/** - * Remesh particles in local buffer. - * - * Remeshing formula is given a compiling time. - * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. 
- * - * @param i Particle index - * @param dx Space step - * @param invdx 1/dx - * @param s Particle scalar - * @param p Particle position - * @param gscal_loc Local buffer for result - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} - * @remark <code>__N__</code> is expanded at compilation time by vector width. - * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) - * @see parmepy.gpu.tools.parse_file - * @see parmepy.gpu.cl_src.common - */ -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc){ - float__N__ y; /* Normalized distance to nearest left grid point */ - int__N__ ind; /* Integer coordinate */ - uint__N__ index; /* Remeshing index */ - float w__NN__; - - ind = convert_int__N___rtn(p * invdx); - y = (p - convert_float__N__(ind) * dx) * invdx; - - index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); - - w__NN__ = REMESH(alpha)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(beta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(gamma)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * 
v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(delta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - -#if REMESH_SHIFT > 1 - index = (index + 1) % NB_I; - w__NN__ = REMESH(eta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(zeta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 2 - index = (index + 1) % NB_I; - w__NN__ = REMESH(theta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(iota)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 3 - index = (index + 1) % NB_I; - w__NN__ = REMESH(kappa)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(mu)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl deleted file mode 100644 index ed3f4a397..000000000 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl +++ /dev/null 
@@ -1,121 +0,0 @@ -/** - * @file basic_vector_3d.cl - * Remeshing function, vectorized version for vector remeshing in 3D. - */ - -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y, float__N__ v_Z, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc, __local float* gvec_Z_loc); - - -/** - * Remesh particles in local buffer. - * - * Remeshing formula is given a compiling time. - * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. - * - * @param i Particle index - * @param dx Space step - * @param invdx 1/dx - * @param s Particle scalar - * @param p Particle position - * @param gscal_loc Local buffer for result - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} - * @remark <code>__N__</code> is expanded at compilation time by vector width. - * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. 
- * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) - * @see parmepy.gpu.tools.parse_file - * @see parmepy.gpu.cl_src.common - */ -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y,float__N__ v_Z, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc, __local float* gvec_Z_loc){ - float__N__ y; /* Normalized distance to nearest left grid point */ - int__N__ ind; /* Integer coordinate */ - uint__N__ index; /* Remeshing index */ - float w__NN__; - - ind = convert_int__N___rtn(p * invdx); - y = (p - convert_float__N__(ind) * dx) * invdx; - - index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); - - w__NN__ = REMESH(alpha)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(beta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(gamma)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(delta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - -#if REMESH_SHIFT > 1 - index = (index + 1) % NB_I; - w__NN__ = 
REMESH(eta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(zeta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 2 - index = (index + 1) % NB_I; - w__NN__ = REMESH(theta)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(iota)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 3 - index = (index + 1) % NB_I; - w__NN__ = REMESH(kappa)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w__NN__ = REMESH(mu)(y.s__NN__); - gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); - gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); - gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/comm_basic_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/comm_basic_noVec.cl new file mode 100644 index 000000000..3f1519f41 --- /dev/null +++ 
b/HySoP/hysop/gpu/cl_src/remeshing/comm_basic_noVec.cl @@ -0,0 +1,124 @@ +/** + * @file remeshing/comm_basic_noVec.cl + * Remeshing function, non-vectorized version. + */ + +void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, float min_position, __RCOMP_P__local float* gscal_loc__ID__); + + +/** + * Remesh particles in local buffer. + * + * Remeshing formula is given at compile time. + * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. + * + * @param i Particle index + * @param dx Space step + * @param invdx 1/dx + * @param s Particle scalar + * @param p Particle position + * @param gscal_loc Local buffer for result + * + * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. + * @remark T_NB_I: global number of points in the 1st direction (the direction split across MPI processes) + * @remark START_INDEX Global starting index for computational points + * @remark <code>__N__</code> is expanded at compilation time by vector width. + * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. + * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} + * @remark <code>__RCOMP_I</code> flag is for instruction expansion for the different remeshed components. + * @remark <code>__RCOMP_P</code> flag is for function parameter expansion for the different remeshed components. + * @remark <code>__ID__</code> is replaced by the remeshed component id in an expansion.
+ * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) + * @see parmepy.gpu.tools.parse_file + * @see parmepy.gpu.cl_src.common + */ +void remesh(uint i, float dx, float invdx, + __RCOMP_P float s__ID__, + float p, + float min_position, + __RCOMP_P__local float* gscal_loc__ID__){ + float y; /* Normalized distance to nearest left grid point */ + int ind; /* Integer coordinate */ + int index; /* Remeshing index */ + float w; + + ind = convert_int_rtn(p * invdx); + y = (p - convert_float(ind) * dx) * invdx; + + index = ((ind - REMESH_SHIFT + T_NB_I) % T_NB_I) - START_INDEX; + + if (index>=0 && index < NB_I){ + w = REMESH(alpha)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(beta)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(gamma)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(delta)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + +#if REMESH_SHIFT > 1 + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(eta)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(zeta)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 2 + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(theta)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + if (index>=0 && index < 
NB_I){ + w = REMESH(iota)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if REMESH_SHIFT > 3 + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(kappa)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); + + index = index + 1; + if (index>=0 && index < NB_I){ + w = REMESH(mu)(y); + __RCOMP_Igscal_loc__ID__[noBC_id(index)] += (w * s__ID__); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif +} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private.cl b/HySoP/hysop/gpu/cl_src/remeshing/private.cl index bce6c1790..89b4befa4 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/private.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/private.cl @@ -3,7 +3,7 @@ * Remeshing function, vectorized, private variable. */ -void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__N__ p, __RCOMP_P__local float* gscal_loc__ID__); +void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__N__ p, float min_position, __RCOMP_P__local float* gscal_loc__ID__); /** @@ -34,6 +34,7 @@ void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__ void remesh(uint i, float dx, float invdx, __RCOMP_P float__N__ s__ID__, float__N__ p, + float min_position, __RCOMP_P__local float* gscal_loc__ID__){ float__N__ y, /* Normalized distance to nearest left grid point */ w; @@ -41,6 +42,8 @@ void remesh(uint i, float dx, float invdx, int__N__ ind; /* Integer coordinate */ uint__N__ index; /* Remeshing index */ + p = p - min_position; + ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl index e1afa6a3a..3496e8823 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/private_noVec.cl @@ -3,7 +3,7 @@ * Remeshing function, vectorized, 
private variable. */ -void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, __RCOMP_P__local float* gscal_loc__ID__); +void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, float min_position, __RCOMP_P__local float* gscal_loc__ID__); /** @@ -34,6 +34,7 @@ void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, __R void remesh(uint i, float dx, float invdx, __RCOMP_P float s__ID__, float p, + float min_position, __RCOMP_P__local float* gscal_loc__ID__){ float y, /* Normalized distance to nearest left grid point */ w; /* Temporary remeshing weights */ @@ -41,6 +42,8 @@ void remesh(uint i, float dx, float invdx, int ind; /* Integer coordinate */ uint index; /* Remeshing index */ + p = p - min_position; + ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl deleted file mode 100644 index fcb1e443c..000000000 --- a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl +++ /dev/null @@ -1,112 +0,0 @@ -/** - * @file private_vector_2d.cl - * Remeshing function, vectorized, private variable for 2D vector remeshing. - */ - -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc); - - -/** - * Remesh particles in local buffer. - * - * Remeshing formula is given a compiling time. - * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. - * Use of a private temporary variable for remeshing weights. - * - * @param i Particle index - * @param dx Space step - * @param invdx 1/dx - * @param s Particle scalar - * @param p Particle position - * @param gscal_loc Local buffer for result - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. 
- * @remark <code>__N__</code> is expanded at compilation time by vector width. - * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} - * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) - * @see parmepy.gpu.tools.parse_file - * @see parmepy.gpu.cl_src.common - */ -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc){ - float__N__ y, /* Normalized distance to nearest left grid point */ - w; /* Temporary remeshing weights */ - int__N__ ind; /* Integer coordinate */ - uint__N__ index; /* Remeshing index */ - - ind = convert_int__N___rtn(p * invdx); - y = (p - convert_float__N__(ind) * dx) * invdx; - - index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); - - w = REMESH(alpha)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(beta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(gamma)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(delta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - -#if REMESH_SHIFT > 1 - index = (index + 1) % NB_I; - w = REMESH(eta)(y); 
- gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(zeta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 2 - index = (index + 1) % NB_I; - w = REMESH(theta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(iota)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 3 - index = (index + 1) % NB_I; - w = REMESH(kappa)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(mu)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl deleted file mode 100644 index dabd8e5d8..000000000 --- a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl +++ /dev/null @@ -1,122 +0,0 @@ -/** - * @file private_vector_3d.cl - * Remeshing function, vectorized, private variable for 3D vector remeshing. - */ - -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y,float__N__ v_Z, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc, __local float* gvec_Z_loc); - - -/** - * Remesh particles in local buffer. - * - * Remeshing formula is given a compiling time. 
- * Use of builtin OpenCL functions fma and mix. Computations through OpenCL vector types. - * Use of a private temporary variable for remeshing weights. - * - * @param i Particle index - * @param dx Space step - * @param invdx 1/dx - * @param s Particle scalar - * @param p Particle position - * @param gscal_loc Local buffer for result - * - * @remark <code>NB_I</code>, <code>NB_II</code>, <code>NB_III</code> : points number in directions from 1st varying index to last. - * @remark <code>__N__</code> is expanded at compilation time by vector width. - * @remark <code>__NN__</code> is expanded at compilation time by a sequence of integer for each vector component. - * @remark <code>FORMULA</code> : remeshing formula flag {<code>M4PRIME</code>, <code>M6PRIME</code>, <code>M8PRIME</code>, <code>L6STAR</code>} - * @remark <code>REMESH</code> is a function-like macro expanding to the proper remeshing formula (i.e.: <code>REMESH(alpha)</code> -> <code>alpha_l2_1</code>) - * @see parmepy.gpu.tools.parse_file - * @see parmepy.gpu.cl_src.common - */ -void remesh(uint i, float dx, float invdx, - float__N__ v_X, float__N__ v_Y,float__N__ v_Z, - float__N__ p, - __local float* gvec_X_loc, __local float* gvec_Y_loc, __local float* gvec_Z_loc){ - float__N__ y, /* Normalized distance to nearest left grid point */ - w; /* Temporary remeshing weights */ - int__N__ ind; /* Integer coordinate */ - uint__N__ index; /* Remeshing index */ - - ind = convert_int__N___rtn(p * invdx); - y = (p - convert_float__N__(ind) * dx) * invdx; - - index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); - - w = REMESH(alpha)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(beta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - 
gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(gamma)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(delta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - -#if REMESH_SHIFT > 1 - index = (index + 1) % NB_I; - w = REMESH(eta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(zeta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 2 - index = (index + 1) % NB_I; - w = REMESH(theta)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(iota)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); -#endif - -#if REMESH_SHIFT > 3 - index = (index + 1) % NB_I; - w = REMESH(kappa)(y); - 
gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); - - index = (index + 1) % NB_I; - w = REMESH(mu)(y); - gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; - gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; - gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl index 0b102c369..d101fffed 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl @@ -5,197 +5,197 @@ */ inline float__N__ alpha_l2_1(float__N__ y){ - return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} + return ((y * (y * (-y + 2.0) - 1.0)) * 0.5);} inline float__N__ beta_l2_1(float__N__ y){ - return ((y * y * (3.0 * y - 5.0) + 2.0) / 2.0);} + return ((y * y * (3.0 * y - 5.0) + 2.0) * 0.5);} inline float__N__ gamma_l2_1(float__N__ y){ - return ((y * (y * (-3.0 * y + 4.0) + 1.0)) / 2.0);} + return ((y * (y * (-3.0 * y + 4.0) + 1.0)) * 0.5);} inline float__N__ delta_l2_1(float__N__ y){ - return ((y * y * (y - 1.0)) / 2.0);} + return ((y * y * (y - 1.0)) * 0.5);} inline float__N__ alpha_l2_2(float__N__ y){ - return ((y * (y * (y * (y * (2.0 * y - 5.0) + 3.0) + 1.0) - 1.0)) / 2.0);} + return ((y * (y * (y * (y * (2.0 * y - 5.0) + 3.0) + 1.0) - 1.0)) * 0.5);} inline float__N__ beta_l2_2(float__N__ y){ - return ((y * y * (y * (y * (-6.0 * y + 15.0) - 9.0) - 2.0) + 2.0) / 2.0);} + return ((y * y * (y * (y * (-6.0 * y + 15.0) - 9.0) - 2.0) + 2.0) * 0.5);} inline float__N__ gamma_l2_2(float__N__ y){ - return ((y * (y * (y * (y * (6.0 * y - 15.0) + 9.0) + 1.0) + 1.0)) / 2.0);} + return ((y * (y * (y * (y * (6.0 * y - 15.0) + 9.0) + 1.0) + 1.0)) * 0.5);} inline float__N__ delta_l2_2(float__N__ y){ - return ((y * y * y * (y 
* (-2.0 * y + 5.0) - 3.0)) / 2.0);} + return ((y * y * y * (y * (-2.0 * y + 5.0) - 3.0)) * 0.5);} inline float__N__ alpha_l2_3(float__N__ y){ - return ((y * (y * (y * y * (y * (y * (-6.0 * y + 21.0) - 25.0) + 10.0) + 1.0) - 1.0)) / 2.0);} + return ((y * (y * (y * y * (y * (y * (-6.0 * y + 21.0) - 25.0) + 10.0) + 1.0) - 1.0)) * 0.5);} inline float__N__ beta_l2_3(float__N__ y){ - return ((y * y * (y * y * (y * (y * (18.0 * y - 63.0) + 75.0) - 30.0) - 2.0) + 2.0) / 2.0);} + return ((y * y * (y * y * (y * (y * (18.0 * y - 63.0) + 75.0) - 30.0) - 2.0) + 2.0) * 0.5);} inline float__N__ gamma_l2_3(float__N__ y){ - return ((y * (y * (y * y * (y * (y * (-18.0 * y + 63.0) - 75.0) + 30.0) + 1.0) + 1.0)) / 2.0);} + return ((y * (y * (y * y * (y * (y * (-18.0 * y + 63.0) - 75.0) + 30.0) + 1.0) + 1.0)) * 0.5);} inline float__N__ delta_l2_3(float__N__ y){ - return ((y * y * y * y * (y * (y * (6.0 * y - 21.0) + 25.0) - 10.0)) / 2.0);} + return ((y * y * y * y * (y * (y * (6.0 * y - 21.0) + 25.0) - 10.0)) * 0.5);} inline float__N__ alpha_l2_4(float__N__ y){ - return ((y * (y * (y * y * y * (y * (y * (y * (20.0 * y - 90.0) + 154.0) - 119.0) + 35.0) + 1.0) - 1.0)) / 2.0);} + return ((y * (y * (y * y * y * (y * (y * (y * (20.0 * y - 90.0) + 154.0) - 119.0) + 35.0) + 1.0) - 1.0)) * 0.5);} inline float__N__ beta_l2_4(float__N__ y){ - return ((y * y * (y * y * y * (y * (y * (y * (-60.0 * y + 270.0) - 462.0) + 357.0) - 105.0) - 2.0) + 2.0) / 2.0);} + return ((y * y * (y * y * y * (y * (y * (y * (-60.0 * y + 270.0) - 462.0) + 357.0) - 105.0) - 2.0) + 2.0) * 0.5);} inline float__N__ gamma_l2_4(float__N__ y){ - return ((y * (y * (y * y * y * (y * (y * (y * (60.0 * y - 270.0) + 462.0) - 357.0) + 105.0) + 1.0) + 1.0)) / 2.0);} + return ((y * (y * (y * y * y * (y * (y * (y * (60.0 * y - 270.0) + 462.0) - 357.0) + 105.0) + 1.0) + 1.0)) * 0.5);} inline float__N__ delta_l2_4(float__N__ y){ - return ((y * y * y * y * y * (y * (y * (y * (-20.0 * y + 90.0) - 154.0) + 119.0) - 35.0)) / 2.0);} + return 
((y * y * y * y * y * (y * (y * (y * (-20.0 * y + 90.0) - 154.0) + 119.0) - 35.0)) * 0.5);} inline float__N__ alpha_l4_2(float__N__ y){ - return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) / 24.0);} + return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) * 0.041666666666666664);} inline float__N__ beta_l4_2(float__N__ y){ - return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) / 24.0);} + return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) * 0.041666666666666664);} inline float__N__ gamma_l4_2(float__N__ y){ - return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) / 24.0);} + return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) * 0.041666666666666664);} inline float__N__ delta_l4_2(float__N__ y){ - return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) / 24.0);} + return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) * 0.041666666666666664);} inline float__N__ eta_l4_2(float__N__ y){ - return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) / 24.0);} + return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) * 0.041666666666666664);} inline float__N__ zeta_l4_2(float__N__ y){ - return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) / 24.0);} + return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) * 0.041666666666666664);} inline float__N__ alpha_l4_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (14.0 * y - 49.0) + 58.0) - 22.0) - 2.0) - 1.0) + 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (14.0 * y - 49.0) + 58.0) - 22.0) - 2.0) - 1.0) + 2.0)) * 0.041666666666666664);} inline float__N__ beta_l4_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (-70.0 * y + 245.0) - 290.0) + 111.0) + 4.0) + 16.0) - 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (-70.0 * y + 245.0) - 290.0) + 111.0) + 4.0) + 16.0) - 16.0)) * 0.041666666666666664);} inline float__N__ gamma_l4_3(float__N__ y){ - 
return ((y * y * (y * y * (y * (y * (140.0 * y - 490.0) + 580.0) - 224.0) - 30.0) + 24.0) / 24.0);} + return ((y * y * (y * y * (y * (y * (140.0 * y - 490.0) + 580.0) - 224.0) - 30.0) + 24.0) * 0.041666666666666664);} inline float__N__ delta_l4_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (-140.0 * y + 490.0) - 580.0) + 226.0) - 4.0) + 16.0) + 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (-140.0 * y + 490.0) - 580.0) + 226.0) - 4.0) + 16.0) + 16.0)) * 0.041666666666666664);} inline float__N__ eta_l4_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (70.0 * y - 245.0) + 290.0) - 114.0) + 2.0) - 1.0) - 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (70.0 * y - 245.0) + 290.0) - 114.0) + 2.0) - 1.0) - 2.0)) * 0.041666666666666664);} inline float__N__ zeta_l4_3(float__N__ y){ - return ((y * y * y * y * (y * (y * (-14.0 * y + 49.0) - 58.0) + 23.0)) / 24.0);} + return ((y * y * y * y * (y * (y * (-14.0 * y + 49.0) - 58.0) + 23.0)) * 0.041666666666666664);} inline float__N__ alpha_l4_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) * 0.041666666666666664);} inline float__N__ beta_l4_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) * 0.041666666666666664);} inline float__N__ gamma_l4_4(float__N__ y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) / 24.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) * 0.041666666666666664);} 
inline float__N__ delta_l4_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) * 0.041666666666666664);} inline float__N__ eta_l4_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) * 0.041666666666666664);} inline float__N__ zeta_l4_4(float__N__ y){ - return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) / 24.0);} + return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) * 0.041666666666666664);} inline float__N__ alpha_M8p(float__N__ y){ - return ((y*(y*(y*(y*(y*(y*(-10.0*y + 21.0) + 28.0) - 105.0) + 70.0) + 35.0) - 56.0) + 17.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(-10.0*y + 21.0) + 28.0) - 105.0) + 70.0) + 35.0) - 56.0) + 17.0) * 0.00029761904761904765);} inline float__N__ beta_M8p(float__N__ y){ - return ((y*(y*(y*(y*(y*(y*(70.0*y - 175.0) - 140.0) + 770.0) - 560.0) - 350.0) + 504.0) - 102.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(70.0*y - 175.0) - 140.0) + 770.0) - 560.0) - 350.0) + 504.0) - 102.0) * 0.00029761904761904765);} inline float__N__ gamma_M8p(float__N__ y){ - return ((y*(y*(y*(y*(y*(y*(-210.0*y + 609.0) + 224.0) - 2135.0) + 910.0) + 2765.0) - 2520.0) + 255.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(-210.0*y + 609.0) + 224.0) - 2135.0) + 910.0) + 2765.0) - 2520.0) + 255.0) * 0.00029761904761904765);} inline float__N__ delta_M8p(float__N__ y){ - return ((y*y* (y*y* (y*y* (70.0*y - 231.0) + 588.0) - 980.0) + 604.0) / 672.0);} + return ((y*y* (y*y* (y*y* (70.0*y - 231.0) + 588.0) - 980.0) + 604.0) * 
0.001488095238095238);} inline float__N__ eta_M8p(float__N__ y){ - return ((y*(y*(y*(y*(y*(y*(-70.0*y+ 259.0) - 84.0) - 427.0) - 182.0)+ 553.0) + 504.0)+ 51.0) / 672.0);} + return ((y*(y*(y*(y*(y*(y*(-70.0*y+ 259.0) - 84.0) - 427.0) - 182.0)+ 553.0) + 504.0)+ 51.0) * 0.001488095238095238);} inline float__N__ zeta_M8p(float__N__ y){ - return ((y*(y*(y*(y*(y*(y*(210.0*y- 861.0) + 532.0) + 770.0) + 560.0) - 350.0) - 504.0) - 102.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(210.0*y- 861.0) + 532.0) + 770.0) + 560.0) - 350.0) - 504.0) - 102.0) * 0.00029761904761904765);} inline float__N__ theta_M8p(float__N__ y){ - return ((y* (y* (y* (y* (y* (y* (-70.0* y+ 315.0) -280.0) -105.0) -70.0) +35.0)+ 56.0) +17.0) / 3360.0);} + return ((y* (y* (y* (y* (y* (y* (-70.0* y+ 315.0) -280.0) -105.0) -70.0) +35.0)+ 56.0) +17.0) * 0.00029761904761904765);} inline float__N__ iota_M8p(float__N__ y){ - return ((y * y * y * y * y * (y * (10.0 * y - 49.0) + 56.0)) / 3360.0);} + return ((y * y * y * y * y * (y * (10.0 * y - 49.0) + 56.0)) * 0.00029761904761904765);} inline float__N__ alpha_l6_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (-89.0 * y + 312.0) - 370.0) + 140.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-89.0 * y + 312.0) - 370.0) + 140.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (623.0 * y - 2183.0) + 2581.0) - 955.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (623.0 * y - 2183.0) + 2581.0) - 955.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (-1869.0 * y + 6546.0) - 7722.0) + 2850.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-1869.0 * y + 6546.0) - 7722.0) + 2850.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_3(float__N__ y){ - return ((y * y * 
(y * y * (y * (y * (3115.0 * y - 10905.0) + 12845.0) - 4795.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * (y * (3115.0 * y - 10905.0) + 12845.0) - 4795.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (-3115.0 * y + 10900.0) - 12830.0) + 4880.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-3115.0 * y + 10900.0) - 12830.0) + 4880.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (1869.0 * y - 6537.0) + 7695.0) - 2985.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (1869.0 * y - 6537.0) + 7695.0) - 2985.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_3(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (-623.0 * y + 2178.0) - 2566.0) + 1010.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-623.0 * y + 2178.0) - 2566.0) + 1010.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_3(float__N__ y){ - return ((y * y * y * y * (y * (y * (89.0 * y - 311.0) + 367.0) - 145.0)) / 720.0);} + return ((y * y * y * y * (y * (y * (89.0 * y - 311.0) + 367.0) - 145.0)) * 0.001388888888888889);} inline float__N__ alpha_l6_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (290.0 * y - 1305.0) + 2231.0) - 1718.0) + 500.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (290.0 * y - 1305.0) + 2231.0) - 1718.0) + 500.0) - 5.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-2030.0 * y + 9135.0) - 15617.0) + 12027.0) - 3509.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-2030.0 * y + 9135.0) - 15617.0) + 12027.0) - 3509.0) + 60.0) - 
120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (6090.0 * y - 27405.0) + 46851.0) - 36084.0) + 10548.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (6090.0 * y - 27405.0) + 46851.0) - 36084.0) + 10548.0) - 195.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_4(float__N__ y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-10150.0 * y + 45675.0) - 78085.0) + 60145.0) - 17605.0) + 280.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-10150.0 * y + 45675.0) - 78085.0) + 60145.0) - 17605.0) + 280.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (10150.0 * y - 45675.0) + 78085.0) - 60150.0) + 17620.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (10150.0 * y - 45675.0) + 78085.0) - 60150.0) + 17620.0) - 195.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-6090.0 * y + 27405.0) - 46851.0) + 36093.0) - 10575.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-6090.0 * y + 27405.0) - 46851.0) + 36093.0) - 10575.0) + 60.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (2030.0 * y - 9135.0) + 15617.0) - 12032.0) + 3524.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (2030.0 * y - 9135.0) + 15617.0) - 12032.0) + 3524.0) - 5.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_4(float__N__ y){ - return ((y * y * y * y * y * (y * (y * (y * (-290.0 * y + 1305.0) - 2231.0) + 1719.0) - 503.0)) / 720.0);} 
+ return ((y * y * y * y * y * (y * (y * (y * (-290.0 * y + 1305.0) - 2231.0) + 1719.0) - 503.0)) * 0.001388888888888889);} inline float__N__ alpha_l6_5(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-1006.0 * y + 5533.0) - 12285.0) + 13785.0) - 7829.0) + 1803.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-1006.0 * y + 5533.0) - 12285.0) + 13785.0) - 7829.0) + 1803.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_5(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (7042.0 * y - 38731.0) + 85995.0) - 96495.0) + 54803.0) - 12620.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (7042.0 * y - 38731.0) + 85995.0) - 96495.0) + 54803.0) - 12620.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_5(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-21126.0 * y + 116193.0) - 257985.0) + 289485.0) - 164409.0) + 37857.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-21126.0 * y + 116193.0) - 257985.0) + 289485.0) - 164409.0) + 37857.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_5(float__N__ y){ - return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (35210.0 * y - 193655.0) + 429975.0) - 482475.0) + 274015.0) - 63090.0) + 280.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (35210.0 * y - 193655.0) + 429975.0) - 482475.0) + 274015.0) - 63090.0) + 280.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_5(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-35210.0 * y + 193655.0) - 429975.0) + 482475.0) - 274015.0) + 63085.0) + 15.0) - 195.0) - 195.0) + 540.0) + 
540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-35210.0 * y + 193655.0) - 429975.0) + 482475.0) - 274015.0) + 63085.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_5(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (21126.0 * y - 116193.0) + 257985.0) - 289485.0) + 164409.0) - 37848.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (21126.0 * y - 116193.0) + 257985.0) - 289485.0) + 164409.0) - 37848.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_5(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-7042.0 * y + 38731.0) - 85995.0) + 96495.0) - 54803.0) + 12615.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-7042.0 * y + 38731.0) - 85995.0) + 96495.0) - 54803.0) + 12615.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_5(float__N__ y){ - return ((y * y * y * y * y * y * (y * (y * (y * (y * (1006.0 * y - 5533.0) + 12285.0) - 13785.0) + 7829.0) - 1802.0)) / 720.0);} + return ((y * y * y * y * y * y * (y * (y * (y * (y * (1006.0 * y - 5533.0) + 12285.0) - 13785.0) + 7829.0) - 1802.0)) * 0.001388888888888889);} inline float__N__ alpha_l6_6(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (3604.0 * y - 23426.0) + 63866.0) - 93577.0) + 77815.0) - 34869.0) + 6587.0) + 1.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (3604.0 * y - 23426.0) + 63866.0) - 93577.0) + 77815.0) - 34869.0) + 6587.0) + 1.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_6(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-25228.0 * y + 163982.0) - 
447062.0) + 655039.0) - 544705.0) + 244083.0) - 46109.0) - 6.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-25228.0 * y + 163982.0) - 447062.0) + 655039.0) - 544705.0) + 244083.0) - 46109.0) - 6.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_6(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (75684.0 * y - 491946.0) + 1341186.0) - 1965117.0) + 1634115.0) - 732249.0) + 138327.0) + 15.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (75684.0 * y - 491946.0) + 1341186.0) - 1965117.0) + 1634115.0) - 732249.0) + 138327.0) + 15.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_6(float__N__ y){ - return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (y * (y * (-126140.0 * y + 819910.0) - 2235310.0) + 3275195.0) - 2723525.0) + 1220415.0) - 230545.0) - 20.0) + 280.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (y * (y * (-126140.0 * y + 819910.0) - 2235310.0) + 3275195.0) - 2723525.0) + 1220415.0) - 230545.0) - 20.0) + 280.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_6(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (126140.0 * y - 819910.0) + 2235310.0) - 3275195.0) + 2723525.0) - 1220415.0) + 230545.0) + 15.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (126140.0 * y - 819910.0) + 2235310.0) - 3275195.0) + 2723525.0) - 1220415.0) + 230545.0) + 15.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_6(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-75684.0 * y + 491946.0) - 1341186.0) + 1965117.0) - 
1634115.0) + 732249.0) - 138327.0) - 6.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-75684.0 * y + 491946.0) - 1341186.0) + 1965117.0) - 1634115.0) + 732249.0) - 138327.0) - 6.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_6(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (25228.0 * y - 163982.0) + 447062.0) - 655039.0) + 544705.0) - 244083.0) + 46109.0) + 1.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (25228.0 * y - 163982.0) + 447062.0) - 655039.0) + 544705.0) - 244083.0) + 46109.0) + 1.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_6(float__N__ y){ - return ((y * y * y * y * y * y * y * (y * (y * (y * (y * (y * (-3604.0 * y + 23426.0) - 63866.0) + 93577.0) - 77815.0) + 34869.0) - 6587.0)) / 720.0);} + return ((y * y * y * y * y * y * y * (y * (y * (y * (y * (y * (-3604.0 * y + 23426.0) - 63866.0) + 93577.0) - 77815.0) + 34869.0) - 6587.0)) * 0.001388888888888889);} inline float__N__ alpha_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-3569.0 * y + 16061.0) - 27454.0) + 21126.0) - 6125.0) + 49.0) - 196.0) - 36.0) + 144.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-3569.0 * y + 16061.0) - 27454.0) + 21126.0) - 6125.0) + 49.0) - 196.0) - 36.0) + 144.0)) * 2.48015873015873e-05);} inline float__N__ beta_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (32121.0 * y - 144548.0) + 247074.0) - 190092.0) + 55125.0) - 672.0) + 2016.0) + 512.0) - 1536.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (32121.0 * y - 144548.0) + 247074.0) - 190092.0) + 55125.0) - 672.0) + 2016.0) + 512.0) - 1536.0)) * 2.48015873015873e-05);} inline float__N__ gamma_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * 
(y * (y * (y * (-128484.0 * y + 578188.0) - 988256.0) + 760312.0) - 221060.0) + 4732.0) - 9464.0) - 4032.0) + 8064.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-128484.0 * y + 578188.0) - 988256.0) + 760312.0) - 221060.0) + 4732.0) - 9464.0) - 4032.0) + 8064.0)) * 2.48015873015873e-05);} inline float__N__ delta_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (299796.0 * y - 1349096.0) + 2305856.0) - 1774136.0) + 517580.0) - 13664.0) + 13664.0) + 32256.0) - 32256.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (299796.0 * y - 1349096.0) + 2305856.0) - 1774136.0) + 517580.0) - 13664.0) + 13664.0) + 32256.0) - 32256.0)) * 2.48015873015873e-05);} inline float__N__ eta_l8_4(float__N__ y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-449694.0 * y + 2023630.0) - 3458700.0) + 2661540.0) - 778806.0) + 19110.0) - 57400.0) + 40320.0) / 40320.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-449694.0 * y + 2023630.0) - 3458700.0) + 2661540.0) - 778806.0) + 19110.0) - 57400.0) + 40320.0) * 2.48015873015873e-05);} inline float__N__ zeta_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (449694.0 * y - 2023616.0) + 3458644.0) - 2662016.0) + 780430.0) - 13664.0) - 13664.0) + 32256.0) + 32256.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (449694.0 * y - 2023616.0) + 3458644.0) - 2662016.0) + 780430.0) - 13664.0) - 13664.0) + 32256.0) + 32256.0)) * 2.48015873015873e-05);} inline float__N__ theta_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-299796.0 * y + 1349068.0) - 2305744.0) + 1775032.0) - 520660.0) + 4732.0) + 9464.0) - 4032.0) - 8064.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-299796.0 * y + 1349068.0) - 2305744.0) + 1775032.0) - 520660.0) + 4732.0) + 9464.0) - 4032.0) - 8064.0)) * 2.48015873015873e-05);} inline float__N__ iota_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (128484.0 * y - 
578168.0) + 988176.0) - 760872.0) + 223020.0) - 672.0) - 2016.0) + 512.0) + 1536.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (128484.0 * y - 578168.0) + 988176.0) - 760872.0) + 223020.0) - 672.0) - 2016.0) + 512.0) + 1536.0)) * 2.48015873015873e-05);} inline float__N__ kappa_l8_4(float__N__ y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-32121.0 * y + 144541.0) - 247046.0) + 190246.0) - 55685.0) + 49.0) + 196.0) - 36.0) - 144.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-32121.0 * y + 144541.0) - 247046.0) + 190246.0) - 55685.0) + 49.0) + 196.0) - 36.0) - 144.0)) * 2.48015873015873e-05);} inline float__N__ mu_l8_4(float__N__ y){ - return ((y * y * y * y * y * (y * (y * (y * (3569.0 * y - 16060.0) + 27450.0) - 21140.0) + 6181.0)) / 40320.0);} + return ((y * y * y * y * y * (y * (y * (y * (3569.0 * y - 16060.0) + 27450.0) - 21140.0) + 6181.0)) * 2.48015873015873e-05);} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl index 35f45164f..cd1827937 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl @@ -5,198 +5,195 @@ */ inline float__N__ alpha_l2_1(float__N__ y){ - return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);} + return (y*fma(y,fma(y,-1.0, 2.0), - 1.0) * 0.5);} inline float__N__ beta_l2_1(float__N__ y){ - return (fma(y*y, fma(y, 3.0, -5.0), 2.0) / 2.0);} + return (fma(y*y, fma(y, 3.0, -5.0), 2.0) * 0.5);} inline float__N__ gamma_l2_1(float__N__ y){ - return ((y * fma(y , fma(-3.0, y, 4.0), 1.0)) / 2.0);} + return ((y * fma(y , fma(-3.0, y, 4.0), 1.0)) * 0.5);} inline float__N__ delta_l2_1(float__N__ y){ - return ((y * y * fma(1.0, y, - 1.0)) / 2.0);} + return ((y * y * fma(1.0, y, - 1.0)) * 0.5);} inline float__N__ alpha_l2_2(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 2.0, -5.0), 3.0), 1.0), -1.0)) / 2.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 2.0, -5.0), 
3.0), 1.0), -1.0)) * 0.5);} inline float__N__ beta_l2_2(float__N__ y){ - return (fma(y * y, fma(y, fma(y, fma(y, -6.0, 15.0), -9.0), -2.0), 2.0) / 2.0);} + return (fma(y * y, fma(y, fma(y, fma(y, -6.0, 15.0), -9.0), -2.0), 2.0) * 0.5);} inline float__N__ gamma_l2_2(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 6.0, -15.0), 9.0), 1.0), 1.0)) / 2.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 6.0, -15.0), 9.0), 1.0), 1.0)) * 0.5);} inline float__N__ delta_l2_2(float__N__ y){ - return ((y * y * y * fma(y, fma(y, -2.0, 5.0), -3.0)) / 2.0);} + return ((y * y * y * fma(y, fma(y, -2.0, 5.0), -3.0)) * 0.5);} inline float__N__ alpha_l2_3(float__N__ y){ - return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -6.0, 21.0), -25.0), 10.0), 1.0), -1.0)) / 2.0);} + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -6.0, 21.0), -25.0), 10.0), 1.0), -1.0)) * 0.5);} inline float__N__ beta_l2_3(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 18.0, -63.0), 75.0), -30.0), -2.0), 2.0) / 2.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 18.0, -63.0), 75.0), -30.0), -2.0), 2.0) * 0.5);} inline float__N__ gamma_l2_3(float__N__ y){ - return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -18.0, 63.0), -75.0), 30.0), 1.0), 1.0)) / 2.0);} + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -18.0, 63.0), -75.0), 30.0), 1.0), 1.0)) * 0.5);} inline float__N__ delta_l2_3(float__N__ y){ - return ((y * y * y * y * fma(y, fma(y, fma(y, 6.0, -21.0), 25.0), -10.0)) / 2.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, 6.0, -21.0), 25.0), -10.0)) * 0.5);} inline float__N__ alpha_l2_4(float__N__ y){ - return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 20.0, -90.0), 154.0), -119.0), 35.0), 1.0), -1.0)) / 2.0);} + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 20.0, -90.0), 154.0), -119.0), 35.0), 1.0), -1.0)) * 0.5);} inline float__N__ beta_l2_4(float__N__ y){ - return (fma(y * y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, -60.0, 
270.0), -462.0), 357.0), -105.0), -2.0), 2.0) / 2.0);} + return (fma(y * y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, -60.0, 270.0), -462.0), 357.0), -105.0), -2.0), 2.0) * 0.5);} inline float__N__ gamma_l2_4(float__N__ y){ - return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 60.0, -270.0), 462.0), -357.0), 105.0), 1.0), 1.0)) / 2.0);} + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 60.0, -270.0), 462.0), -357.0), 105.0), 1.0), 1.0)) * 0.5);} inline float__N__ delta_l2_4(float__N__ y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -20.0, 90.0), -154.0), 119.0), -35.0)) / 2.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -20.0, 90.0), -154.0), 119.0), -35.0)) * 0.5);} inline float__N__ alpha_l4_2(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, -5.0, 13.0), -9.0), -1.0), 2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, -5.0, 13.0), -9.0), -1.0), 2.0)) * 0.041666666666666664);} inline float__N__ beta_l4_2(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 25.0, -64.0), 39.0), 16.0), -16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 25.0, -64.0), 39.0), 16.0), -16.0)) * 0.041666666666666664);} inline float__N__ gamma_l4_2(float__N__ y){ - return (fma(y * y, fma(y, fma(y, fma(y, -50.0, 126.0), -70.0), -30.0), 24.0) / 24.0);} + return (fma(y * y, fma(y, fma(y, fma(y, -50.0, 126.0), -70.0), -30.0), 24.0) * 0.041666666666666664);} inline float__N__ delta_l4_2(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 50.0, -124.0), 66.0), 16.0), 16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 50.0, -124.0), 66.0), 16.0), 16.0)) * 0.041666666666666664);} inline float__N__ eta_l4_2(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, -25.0, 61.0), -33.0), -1.0), -2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, -25.0, 61.0), -33.0), -1.0), -2.0)) * 0.041666666666666664);} inline float__N__ zeta_l4_2(float__N__ y){ - return ((y * y * y * fma(y, fma(y, 
5.0, -12.0), 7.0)) / 24.0);} + return ((y * y * y * fma(y, fma(y, 5.0, -12.0), 7.0)) * 0.041666666666666664);} inline float__N__ alpha_l4_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 14.0, -49.0), 58.0), -22.0), -2.0), -1.0), 2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 14.0, -49.0), 58.0), -22.0), -2.0), -1.0), 2.0)) * 0.041666666666666664);} inline float__N__ beta_l4_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -70.0, 245.0), -290.0), 111.0), 4.0), 16.0), -16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -70.0, 245.0), -290.0), 111.0), 4.0), 16.0), -16.0)) * 0.041666666666666664);} inline float__N__ gamma_l4_3(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 140.0, -490.0), 580.0), -224.0), -30.0), 24.0) / 24.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 140.0, -490.0), 580.0), -224.0), -30.0), 24.0) * 0.041666666666666664);} inline float__N__ delta_l4_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -140.0, 490.0), -580.0), 226.0), -4.0), 16.0), 16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -140.0, 490.0), -580.0), 226.0), -4.0), 16.0), 16.0)) * 0.041666666666666664);} inline float__N__ eta_l4_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 70.0, -245.0), 290.0), -114.0), 2.0), -1.0), -2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 70.0, -245.0), 290.0), -114.0), 2.0), -1.0), -2.0)) * 0.041666666666666664);} inline float__N__ zeta_l4_3(float__N__ y){ - return ((y * y * y * y * fma(y, fma(y, fma(y, -14.0, 49.0), -58.0), 23.0)) / 24.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, -14.0, 49.0), -58.0), 23.0)) * 0.041666666666666664);} inline float__N__ alpha_l4_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -46.0, 207.0), -354.0), 273.0), -80.0), 1.0), -2.0), -1.0), 2.0)) / 
24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -46.0, 207.0), -354.0), 273.0), -80.0), 1.0), -2.0), -1.0), 2.0)) * 0.041666666666666664);} inline float__N__ beta_l4_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 230.0, -1035.0), 1770.0), -1365.0), 400.0), -4.0), 4.0), 16.0), -16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 230.0, -1035.0), 1770.0), -1365.0), 400.0), -4.0), 4.0), 16.0), -16.0)) * 0.041666666666666664);} inline float__N__ gamma_l4_4(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -460.0, 2070.0), -3540.0), 2730.0), -800.0), 6.0), -30.0), 24.0) / 24.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -460.0, 2070.0), -3540.0), 2730.0), -800.0), 6.0), -30.0), 24.0) * 0.041666666666666664);} inline float__N__ delta_l4_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 460.0, -2070.0), 3540.0), -2730.0), 800.0), -4.0), -4.0), 16.0), 16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 460.0, -2070.0), 3540.0), -2730.0), 800.0), -4.0), -4.0), 16.0), 16.0)) * 0.041666666666666664);} inline float__N__ eta_l4_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -230.0, 1035.0), -1770.0), 1365.0), -400.0), 1.0), 2.0), -1.0), -2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -230.0, 1035.0), -1770.0), 1365.0), -400.0), 1.0), 2.0), -1.0), -2.0)) * 0.041666666666666664);} inline float__N__ zeta_l4_4(float__N__ y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 46.0, -207.0), 354.0), -273.0), 80.0)) / 24.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 46.0, -207.0), 354.0), -273.0), 80.0)) * 0.041666666666666664);} inline float__N__ alpha_M8p(float__N__ y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 
28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) * 0.00029761904761904765);} inline float__N__ beta_M8p(float__N__ y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) * 0.00029761904761904765);} inline float__N__ gamma_M8p(float__N__ y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) * 0.00029761904761904765);} inline float__N__ delta_M8p(float__N__ y){ - return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0);} + return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) * 0.001488095238095238);} inline float__N__ eta_M8p(float__N__ y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) * 0.001488095238095238);} inline float__N__ zeta_M8p(float__N__ y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) * 0.00029761904761904765);} inline float__N__ theta_M8p(float__N__ y){ - return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0);} + return 
(fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) * 0.00029761904761904765);} inline float__N__ iota_M8p(float__N__ y){ - return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0);} + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) * 0.00029761904761904765);} inline float__N__ alpha_l6_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_3(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 
fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_3(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_3(float__N__ y){ - return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) / 720.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) * 0.001388888888888889);} inline float__N__ alpha_l6_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 290.0, -1305.0), 2231.0), -1718.0), 500.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 290.0, -1305.0), 2231.0), -1718.0), 500.0), -5.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -2030.0, 9135.0), -15617.0), 12027.0), -3509.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -2030.0, 9135.0), -15617.0), 12027.0), -3509.0), 60.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 6090.0, -27405.0), 46851.0), -36084.0), 10548.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 6090.0, -27405.0), 46851.0), -36084.0), 10548.0), -195.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_4(float__N__ y){ - return (fma(y * y, fma(y * y, 
fma(y, fma(y, fma(y, fma(y, fma(y, -10150.0, 45675.0), -78085.0), 60145.0), -17605.0), 280.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -10150.0, 45675.0), -78085.0), 60145.0), -17605.0), 280.0), -980.0), 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 10150.0, -45675.0), 78085.0), -60150.0), 17620.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 10150.0, -45675.0), 78085.0), -60150.0), 17620.0), -195.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -6090.0, 27405.0), -46851.0), 36093.0), -10575.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -6090.0, 27405.0), -46851.0), 36093.0), -10575.0), 60.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 2030.0, -9135.0), 15617.0), -12032.0), 3524.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 2030.0, -9135.0), 15617.0), -12032.0), 3524.0), -5.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_4(float__N__ y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -290.0, 1305.0), -2231.0), 1719.0), -503.0)) / 720.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -290.0, 1305.0), -2231.0), 1719.0), -503.0)) * 0.001388888888888889);} inline float__N__ alpha_l6_5(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1006.0, 5533.0), -12285.0), 13785.0), -7829.0), 1803.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, 
fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1006.0, 5533.0), -12285.0), 13785.0), -7829.0), 1803.0), -3.0), -5.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_5(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 7042.0, -38731.0), 85995.0), -96495.0), 54803.0), -12620.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 7042.0, -38731.0), 85995.0), -96495.0), 54803.0), -12620.0), 12.0), 60.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_5(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -21126.0, 116193.0), -257985.0), 289485.0), -164409.0), 37857.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -21126.0, 116193.0), -257985.0), 289485.0), -164409.0), 37857.0), -15.0), -195.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_5(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, 35210.0, -193655.0), 429975.0), -482475.0), 274015.0), -63090.0), 280.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, 35210.0, -193655.0), 429975.0), -482475.0), 274015.0), -63090.0), 280.0), -980.0), 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_5(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -35210.0, 193655.0), -429975.0), 482475.0), -274015.0), 63085.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -35210.0, 193655.0), -429975.0), 482475.0), -274015.0), 63085.0), 15.0), -195.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float__N__ 
zeta_l6_5(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 21126.0, -116193.0), 257985.0), -289485.0), 164409.0), -37848.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 21126.0, -116193.0), 257985.0), -289485.0), 164409.0), -37848.0), -12.0), 60.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_5(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -7042.0, 38731.0), -85995.0), 96495.0), -54803.0), 12615.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -7042.0, 38731.0), -85995.0), 96495.0), -54803.0), 12615.0), 3.0), -5.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_5(float__N__ y){ - return ((y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, 1006.0, -5533.0), 12285.0), -13785.0), 7829.0), -1802.0)) / 720.0);} + return ((y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, 1006.0, -5533.0), 12285.0), -13785.0), 7829.0), -1802.0)) * 0.001388888888888889);} inline float__N__ alpha_l6_6(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 3604.0, -23426.0), 63866.0), -93577.0), 77815.0), -34869.0), 6587.0), 1.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 3604.0, -23426.0), 63866.0), -93577.0), 77815.0), -34869.0), 6587.0), 1.0), -3.0), -5.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float__N__ beta_l6_6(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -25228.0, 163982.0), -447062.0), 655039.0), -544705.0), 244083.0), -46109.0), -6.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} 
+ return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -25228.0, 163982.0), -447062.0), 655039.0), -544705.0), 244083.0), -46109.0), -6.0), 12.0), 60.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float__N__ gamma_l6_6(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 75684.0, -491946.0), 1341186.0), -1965117.0), 1634115.0), -732249.0), 138327.0), 15.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 75684.0, -491946.0), 1341186.0), -1965117.0), 1634115.0), -732249.0), 138327.0), 15.0), -15.0), -195.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float__N__ delta_l6_6(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -126140.0, 819910.0), -2235310.0), 3275195.0), -2723525.0), 1220415.0), -230545.0), -20.0), 280.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -126140.0, 819910.0), -2235310.0), 3275195.0), -2723525.0), 1220415.0), -230545.0), -20.0), 280.0), -980.0), 720.0) * 0.001388888888888889);} inline float__N__ eta_l6_6(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 126140.0, -819910.0), 2235310.0), -3275195.0), 2723525.0), -1220415.0), 230545.0), 15.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 126140.0, -819910.0), 2235310.0), -3275195.0), 2723525.0), -1220415.0), 230545.0), 15.0), 15.0), -195.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float__N__ zeta_l6_6(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -75684.0, 491946.0), -1341186.0), 
1965117.0), -1634115.0), 732249.0), -138327.0), -6.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -75684.0, 491946.0), -1341186.0), 1965117.0), -1634115.0), 732249.0), -138327.0), -6.0), -12.0), 60.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float__N__ theta_l6_6(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 25228.0, -163982.0), 447062.0), -655039.0), 544705.0), -244083.0), 46109.0), 1.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 25228.0, -163982.0), 447062.0), -655039.0), 544705.0), -244083.0), 46109.0), 1.0), 3.0), -5.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float__N__ iota_l6_6(float__N__ y){ - return ((y * y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3604.0, 23426.0), -63866.0), 93577.0), -77815.0), 34869.0), -6587.0)) / 720.0);} + return ((y * y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3604.0, 23426.0), -63866.0), 93577.0), -77815.0), 34869.0), -6587.0)) * 0.001388888888888889);} inline float__N__ alpha_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3569.0, 16061.0), -27454.0), 21126.0), -6125.0), 49.0), -196.0), -36.0), 144.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3569.0, 16061.0), -27454.0), 21126.0), -6125.0), 49.0), -196.0), -36.0), 144.0)) * 2.48015873015873e-05);} inline float__N__ beta_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 32121.0, -144548.0), 247074.0), -190092.0), 55125.0), -672.0), 2016.0), 512.0), -1536.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 32121.0, -144548.0), 247074.0), -190092.0), 55125.0), -672.0), 
2016.0), 512.0), -1536.0)) * 2.48015873015873e-05);} inline float__N__ gamma_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -128484.0, 578188.0), -988256.0), 760312.0), -221060.0), 4732.0), -9464.0), -4032.0), 8064.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -128484.0, 578188.0), -988256.0), 760312.0), -221060.0), 4732.0), -9464.0), -4032.0), 8064.0)) * 2.48015873015873e-05);} inline float__N__ delta_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 299796.0, -1349096.0), 2305856.0), -1774136.0), 517580.0), -13664.0), 13664.0), 32256.0), -32256.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 299796.0, -1349096.0), 2305856.0), -1774136.0), 517580.0), -13664.0), 13664.0), 32256.0), -32256.0)) * 2.48015873015873e-05);} inline float__N__ eta_l8_4(float__N__ y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -449694.0, 2023630.0), -3458700.0), 2661540.0), -778806.0), 19110.0), -57400.0), 40320.0) / 40320.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -449694.0, 2023630.0), -3458700.0), 2661540.0), -778806.0), 19110.0), -57400.0), 40320.0) * 2.48015873015873e-05);} inline float__N__ zeta_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 449694.0, -2023616.0), 3458644.0), -2662016.0), 780430.0), -13664.0), -13664.0), 32256.0), 32256.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 449694.0, -2023616.0), 3458644.0), -2662016.0), 780430.0), -13664.0), -13664.0), 32256.0), 32256.0)) * 2.48015873015873e-05);} inline float__N__ theta_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -299796.0, 1349068.0), -2305744.0), 1775032.0), -520660.0), 4732.0), 9464.0), -4032.0), -8064.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 
fma(y, fma(y, -299796.0, 1349068.0), -2305744.0), 1775032.0), -520660.0), 4732.0), 9464.0), -4032.0), -8064.0)) * 2.48015873015873e-05);} inline float__N__ iota_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 128484.0, -578168.0), 988176.0), -760872.0), 223020.0), -672.0), -2016.0), 512.0), 1536.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 128484.0, -578168.0), 988176.0), -760872.0), 223020.0), -672.0), -2016.0), 512.0), 1536.0)) * 2.48015873015873e-05);} inline float__N__ kappa_l8_4(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -32121.0, 144541.0), -247046.0), 190246.0), -55685.0), 49.0), 196.0), -36.0), -144.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -32121.0, 144541.0), -247046.0), 190246.0), -55685.0), 49.0), 196.0), -36.0), -144.0)) * 2.48015873015873e-05);} inline float__N__ mu_l8_4(float__N__ y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 3569.0, -16060.0), 27450.0), -21140.0), 6181.0)) / 40320.0);} - - -#endif + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 3569.0, -16060.0), 27450.0), -21140.0), 6181.0)) * 2.48015873015873e-05);} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl index e023dc5e8..a46f89e31 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl @@ -5,197 +5,194 @@ */ inline float alpha_l2_1(float y){ - return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} + return ((y * (y * (-y + 2.0) - 1.0)) * 0.5);} inline float beta_l2_1(float y){ - return ((y * y * (3.0 * y - 5.0) + 2.0) / 2.0);} + return ((y * y * (3.0 * y - 5.0) + 2.0) * 0.5);} inline float gamma_l2_1(float y){ - return ((y * (y * (-3.0 * y + 4.0) + 1.0)) / 2.0);} + return ((y * (y * (-3.0 * y + 4.0) + 1.0)) * 0.5);} inline float delta_l2_1(float y){ - return ((y * y * (y 
- 1.0)) / 2.0);} + return ((y * y * (y - 1.0)) * 0.5);} inline float alpha_l2_2(float y){ - return ((y * (y * (y * (y * (2.0 * y - 5.0) + 3.0) + 1.0) - 1.0)) / 2.0);} + return ((y * (y * (y * (y * (2.0 * y - 5.0) + 3.0) + 1.0) - 1.0)) * 0.5);} inline float beta_l2_2(float y){ - return ((y * y * (y * (y * (-6.0 * y + 15.0) - 9.0) - 2.0) + 2.0) / 2.0);} + return ((y * y * (y * (y * (-6.0 * y + 15.0) - 9.0) - 2.0) + 2.0) * 0.5);} inline float gamma_l2_2(float y){ - return ((y * (y * (y * (y * (6.0 * y - 15.0) + 9.0) + 1.0) + 1.0)) / 2.0);} + return ((y * (y * (y * (y * (6.0 * y - 15.0) + 9.0) + 1.0) + 1.0)) * 0.5);} inline float delta_l2_2(float y){ - return ((y * y * y * (y * (-2.0 * y + 5.0) - 3.0)) / 2.0);} + return ((y * y * y * (y * (-2.0 * y + 5.0) - 3.0)) * 0.5);} inline float alpha_l2_3(float y){ - return ((y * (y * (y * y * (y * (y * (-6.0 * y + 21.0) - 25.0) + 10.0) + 1.0) - 1.0)) / 2.0);} + return ((y * (y * (y * y * (y * (y * (-6.0 * y + 21.0) - 25.0) + 10.0) + 1.0) - 1.0)) * 0.5);} inline float beta_l2_3(float y){ - return ((y * y * (y * y * (y * (y * (18.0 * y - 63.0) + 75.0) - 30.0) - 2.0) + 2.0) / 2.0);} + return ((y * y * (y * y * (y * (y * (18.0 * y - 63.0) + 75.0) - 30.0) - 2.0) + 2.0) * 0.5);} inline float gamma_l2_3(float y){ - return ((y * (y * (y * y * (y * (y * (-18.0 * y + 63.0) - 75.0) + 30.0) + 1.0) + 1.0)) / 2.0);} + return ((y * (y * (y * y * (y * (y * (-18.0 * y + 63.0) - 75.0) + 30.0) + 1.0) + 1.0)) * 0.5);} inline float delta_l2_3(float y){ - return ((y * y * y * y * (y * (y * (6.0 * y - 21.0) + 25.0) - 10.0)) / 2.0);} + return ((y * y * y * y * (y * (y * (6.0 * y - 21.0) + 25.0) - 10.0)) * 0.5);} inline float alpha_l2_4(float y){ - return ((y * (y * (y * y * y * (y * (y * (y * (20.0 * y - 90.0) + 154.0) - 119.0) + 35.0) + 1.0) - 1.0)) / 2.0);} + return ((y * (y * (y * y * y * (y * (y * (y * (20.0 * y - 90.0) + 154.0) - 119.0) + 35.0) + 1.0) - 1.0)) * 0.5);} inline float beta_l2_4(float y){ - return ((y * y * (y * y * y * (y * (y * (y * 
(-60.0 * y + 270.0) - 462.0) + 357.0) - 105.0) - 2.0) + 2.0) / 2.0);} + return ((y * y * (y * y * y * (y * (y * (y * (-60.0 * y + 270.0) - 462.0) + 357.0) - 105.0) - 2.0) + 2.0) * 0.5);} inline float gamma_l2_4(float y){ - return ((y * (y * (y * y * y * (y * (y * (y * (60.0 * y - 270.0) + 462.0) - 357.0) + 105.0) + 1.0) + 1.0)) / 2.0);} + return ((y * (y * (y * y * y * (y * (y * (y * (60.0 * y - 270.0) + 462.0) - 357.0) + 105.0) + 1.0) + 1.0)) * 0.5);} inline float delta_l2_4(float y){ - return ((y * y * y * y * y * (y * (y * (y * (-20.0 * y + 90.0) - 154.0) + 119.0) - 35.0)) / 2.0);} + return ((y * y * y * y * y * (y * (y * (y * (-20.0 * y + 90.0) - 154.0) + 119.0) - 35.0)) * 0.5);} inline float alpha_l4_2(float y){ - return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) / 24.0);} + return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) * 0.041666666666666664);} inline float beta_l4_2(float y){ - return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) / 24.0);} + return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) * 0.041666666666666664);} inline float gamma_l4_2(float y){ - return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) / 24.0);} + return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) * 0.041666666666666664);} inline float delta_l4_2(float y){ - return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) / 24.0);} + return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) * 0.041666666666666664);} inline float eta_l4_2(float y){ - return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) / 24.0);} + return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) * 0.041666666666666664);} inline float zeta_l4_2(float y){ - return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) / 24.0);} + return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) * 0.041666666666666664);} inline float alpha_l4_3(float y){ - return ((y * (y * (y * 
(y * (y * (y * (14.0 * y - 49.0) + 58.0) - 22.0) - 2.0) - 1.0) + 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (14.0 * y - 49.0) + 58.0) - 22.0) - 2.0) - 1.0) + 2.0)) * 0.041666666666666664);} inline float beta_l4_3(float y){ - return ((y * (y * (y * (y * (y * (y * (-70.0 * y + 245.0) - 290.0) + 111.0) + 4.0) + 16.0) - 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (-70.0 * y + 245.0) - 290.0) + 111.0) + 4.0) + 16.0) - 16.0)) * 0.041666666666666664);} inline float gamma_l4_3(float y){ - return ((y * y * (y * y * (y * (y * (140.0 * y - 490.0) + 580.0) - 224.0) - 30.0) + 24.0) / 24.0);} + return ((y * y * (y * y * (y * (y * (140.0 * y - 490.0) + 580.0) - 224.0) - 30.0) + 24.0) * 0.041666666666666664);} inline float delta_l4_3(float y){ - return ((y * (y * (y * (y * (y * (y * (-140.0 * y + 490.0) - 580.0) + 226.0) - 4.0) + 16.0) + 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (-140.0 * y + 490.0) - 580.0) + 226.0) - 4.0) + 16.0) + 16.0)) * 0.041666666666666664);} inline float eta_l4_3(float y){ - return ((y * (y * (y * (y * (y * (y * (70.0 * y - 245.0) + 290.0) - 114.0) + 2.0) - 1.0) - 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (70.0 * y - 245.0) + 290.0) - 114.0) + 2.0) - 1.0) - 2.0)) * 0.041666666666666664);} inline float zeta_l4_3(float y){ - return ((y * y * y * y * (y * (y * (-14.0 * y + 49.0) - 58.0) + 23.0)) / 24.0);} + return ((y * y * y * y * (y * (y * (-14.0 * y + 49.0) - 58.0) + 23.0)) * 0.041666666666666664);} inline float alpha_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) * 0.041666666666666664);} inline float beta_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) / 24.0);} + return ((y * (y 
* (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) * 0.041666666666666664);} inline float gamma_l4_4(float y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) / 24.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) * 0.041666666666666664);} inline float delta_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) * 0.041666666666666664);} inline float eta_l4_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) * 0.041666666666666664);} inline float zeta_l4_4(float y){ - return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) / 24.0);} + return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) * 0.041666666666666664);} inline float alpha_M8p(float y){ - return ((y*(y*(y*(y*(y*(y*(-10.0*y + 21.0) + 28.0) - 105.0) + 70.0) + 35.0) - 56.0) + 17.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(-10.0*y + 21.0) + 28.0) - 105.0) + 70.0) + 35.0) - 56.0) + 17.0) * 0.00029761904761904765);} inline float beta_M8p(float y){ - return ((y*(y*(y*(y*(y*(y*(70.0*y - 175.0) - 140.0) + 770.0) - 560.0) - 350.0) + 504.0) - 102.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(70.0*y - 175.0) - 140.0) + 770.0) - 560.0) - 350.0) + 504.0) - 102.0) * 0.00029761904761904765);} inline float gamma_M8p(float y){ - return ((y*(y*(y*(y*(y*(y*(-210.0*y + 609.0) + 224.0) - 2135.0) + 
910.0) + 2765.0) - 2520.0) + 255.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(-210.0*y + 609.0) + 224.0) - 2135.0) + 910.0) + 2765.0) - 2520.0) + 255.0) * 0.00029761904761904765);} inline float delta_M8p(float y){ - return ((y*y* (y*y* (y*y* (70.0*y - 231.0) + 588.0) - 980.0) + 604.0) / 672.0);} + return ((y*y* (y*y* (y*y* (70.0*y - 231.0) + 588.0) - 980.0) + 604.0) * 0.001488095238095238);} inline float eta_M8p(float y){ - return ((y*(y*(y*(y*(y*(y*(-70.0*y+ 259.0) - 84.0) - 427.0) - 182.0)+ 553.0) + 504.0)+ 51.0) / 672.0);} + return ((y*(y*(y*(y*(y*(y*(-70.0*y+ 259.0) - 84.0) - 427.0) - 182.0)+ 553.0) + 504.0)+ 51.0) * 0.001488095238095238);} inline float zeta_M8p(float y){ - return ((y*(y*(y*(y*(y*(y*(210.0*y- 861.0) + 532.0) + 770.0) + 560.0) - 350.0) - 504.0) - 102.0) / 3360.0);} + return ((y*(y*(y*(y*(y*(y*(210.0*y- 861.0) + 532.0) + 770.0) + 560.0) - 350.0) - 504.0) - 102.0) * 0.00029761904761904765);} inline float theta_M8p(float y){ - return ((y* (y* (y* (y* (y* (y* (-70.0* y+ 315.0) -280.0) -105.0) -70.0) +35.0)+ 56.0) +17.0) / 3360.0);} + return ((y* (y* (y* (y* (y* (y* (-70.0* y+ 315.0) -280.0) -105.0) -70.0) +35.0)+ 56.0) +17.0) * 0.00029761904761904765);} inline float iota_M8p(float y){ - return ((y * y * y * y * y * (y * (10.0 * y - 49.0) + 56.0)) / 3360.0);} + return ((y * y * y * y * y * (y * (10.0 * y - 49.0) + 56.0)) * 0.00029761904761904765);} inline float alpha_l6_3(float y){ - return ((y * (y * (y * (y * (y * (y * (-89.0 * y + 312.0) - 370.0) + 140.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-89.0 * y + 312.0) - 370.0) + 140.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float beta_l6_3(float y){ - return ((y * (y * (y * (y * (y * (y * (623.0 * y - 2183.0) + 2581.0) - 955.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (623.0 * y - 2183.0) + 2581.0) - 955.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float gamma_l6_3(float y){ - return ((y * (y * 
(y * (y * (y * (y * (-1869.0 * y + 6546.0) - 7722.0) + 2850.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-1869.0 * y + 6546.0) - 7722.0) + 2850.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float delta_l6_3(float y){ - return ((y * y * (y * y * (y * (y * (3115.0 * y - 10905.0) + 12845.0) - 4795.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * (y * (3115.0 * y - 10905.0) + 12845.0) - 4795.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float eta_l6_3(float y){ - return ((y * (y * (y * (y * (y * (y * (-3115.0 * y + 10900.0) - 12830.0) + 4880.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-3115.0 * y + 10900.0) - 12830.0) + 4880.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float zeta_l6_3(float y){ - return ((y * (y * (y * (y * (y * (y * (1869.0 * y - 6537.0) + 7695.0) - 2985.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (1869.0 * y - 6537.0) + 7695.0) - 2985.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float theta_l6_3(float y){ - return ((y * (y * (y * (y * (y * (y * (-623.0 * y + 2178.0) - 2566.0) + 1010.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-623.0 * y + 2178.0) - 2566.0) + 1010.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float iota_l6_3(float y){ - return ((y * y * y * y * (y * (y * (89.0 * y - 311.0) + 367.0) - 145.0)) / 720.0);} + return ((y * y * y * y * (y * (y * (89.0 * y - 311.0) + 367.0) - 145.0)) * 0.001388888888888889);} inline float alpha_l6_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (290.0 * y - 1305.0) + 2231.0) - 1718.0) + 500.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (290.0 * y - 1305.0) + 2231.0) - 1718.0) + 500.0) - 5.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float beta_l6_4(float y){ - return ((y * (y * (y * (y * (y * (y 
* (y * (y * (-2030.0 * y + 9135.0) - 15617.0) + 12027.0) - 3509.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-2030.0 * y + 9135.0) - 15617.0) + 12027.0) - 3509.0) + 60.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float gamma_l6_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (6090.0 * y - 27405.0) + 46851.0) - 36084.0) + 10548.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (6090.0 * y - 27405.0) + 46851.0) - 36084.0) + 10548.0) - 195.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float delta_l6_4(float y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-10150.0 * y + 45675.0) - 78085.0) + 60145.0) - 17605.0) + 280.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-10150.0 * y + 45675.0) - 78085.0) + 60145.0) - 17605.0) + 280.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float eta_l6_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (10150.0 * y - 45675.0) + 78085.0) - 60150.0) + 17620.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (10150.0 * y - 45675.0) + 78085.0) - 60150.0) + 17620.0) - 195.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float zeta_l6_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-6090.0 * y + 27405.0) - 46851.0) + 36093.0) - 10575.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-6090.0 * y + 27405.0) - 46851.0) + 36093.0) - 10575.0) + 60.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float theta_l6_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (2030.0 * y - 9135.0) + 15617.0) - 12032.0) + 3524.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (2030.0 * y - 9135.0) + 15617.0) - 12032.0) + 3524.0) - 5.0) - 15.0) + 4.0) + 12.0)) 
* 0.001388888888888889);} inline float iota_l6_4(float y){ - return ((y * y * y * y * y * (y * (y * (y * (-290.0 * y + 1305.0) - 2231.0) + 1719.0) - 503.0)) / 720.0);} + return ((y * y * y * y * y * (y * (y * (y * (-290.0 * y + 1305.0) - 2231.0) + 1719.0) - 503.0)) * 0.001388888888888889);} inline float alpha_l6_5(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-1006.0 * y + 5533.0) - 12285.0) + 13785.0) - 7829.0) + 1803.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-1006.0 * y + 5533.0) - 12285.0) + 13785.0) - 7829.0) + 1803.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float beta_l6_5(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (7042.0 * y - 38731.0) + 85995.0) - 96495.0) + 54803.0) - 12620.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (7042.0 * y - 38731.0) + 85995.0) - 96495.0) + 54803.0) - 12620.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float gamma_l6_5(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-21126.0 * y + 116193.0) - 257985.0) + 289485.0) - 164409.0) + 37857.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-21126.0 * y + 116193.0) - 257985.0) + 289485.0) - 164409.0) + 37857.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float delta_l6_5(float y){ - return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (35210.0 * y - 193655.0) + 429975.0) - 482475.0) + 274015.0) - 63090.0) + 280.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (35210.0 * y - 193655.0) + 429975.0) - 482475.0) + 274015.0) - 63090.0) + 280.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float eta_l6_5(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y 
* (-35210.0 * y + 193655.0) - 429975.0) + 482475.0) - 274015.0) + 63085.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-35210.0 * y + 193655.0) - 429975.0) + 482475.0) - 274015.0) + 63085.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float zeta_l6_5(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (21126.0 * y - 116193.0) + 257985.0) - 289485.0) + 164409.0) - 37848.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (21126.0 * y - 116193.0) + 257985.0) - 289485.0) + 164409.0) - 37848.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float theta_l6_5(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-7042.0 * y + 38731.0) - 85995.0) + 96495.0) - 54803.0) + 12615.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-7042.0 * y + 38731.0) - 85995.0) + 96495.0) - 54803.0) + 12615.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float iota_l6_5(float y){ - return ((y * y * y * y * y * y * (y * (y * (y * (y * (1006.0 * y - 5533.0) + 12285.0) - 13785.0) + 7829.0) - 1802.0)) / 720.0);} + return ((y * y * y * y * y * y * (y * (y * (y * (y * (1006.0 * y - 5533.0) + 12285.0) - 13785.0) + 7829.0) - 1802.0)) * 0.001388888888888889);} inline float alpha_l6_6(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (3604.0 * y - 23426.0) + 63866.0) - 93577.0) + 77815.0) - 34869.0) + 6587.0) + 1.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (3604.0 * y - 23426.0) + 63866.0) - 93577.0) + 77815.0) - 34869.0) + 6587.0) + 1.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) * 0.001388888888888889);} inline float beta_l6_6(float y){ - return ((y * (y * (y * (y * (y * (y * (y 
* (y * (y * (y * (y * (y * (-25228.0 * y + 163982.0) - 447062.0) + 655039.0) - 544705.0) + 244083.0) - 46109.0) - 6.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-25228.0 * y + 163982.0) - 447062.0) + 655039.0) - 544705.0) + 244083.0) - 46109.0) - 6.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) * 0.001388888888888889);} inline float gamma_l6_6(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (75684.0 * y - 491946.0) + 1341186.0) - 1965117.0) + 1634115.0) - 732249.0) + 138327.0) + 15.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (75684.0 * y - 491946.0) + 1341186.0) - 1965117.0) + 1634115.0) - 732249.0) + 138327.0) + 15.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) * 0.001388888888888889);} inline float delta_l6_6(float y){ - return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (y * (y * (-126140.0 * y + 819910.0) - 2235310.0) + 3275195.0) - 2723525.0) + 1220415.0) - 230545.0) - 20.0) + 280.0) - 980.0) + 720.0) / 720.0);} + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (y * (y * (-126140.0 * y + 819910.0) - 2235310.0) + 3275195.0) - 2723525.0) + 1220415.0) - 230545.0) - 20.0) + 280.0) - 980.0) + 720.0) * 0.001388888888888889);} inline float eta_l6_6(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (126140.0 * y - 819910.0) + 2235310.0) - 3275195.0) + 2723525.0) - 1220415.0) + 230545.0) + 15.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (126140.0 * y - 819910.0) + 2235310.0) - 3275195.0) + 2723525.0) - 1220415.0) + 230545.0) + 15.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) * 0.001388888888888889);} inline float zeta_l6_6(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-75684.0 * y + 491946.0) - 1341186.0) + 
1965117.0) - 1634115.0) + 732249.0) - 138327.0) - 6.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-75684.0 * y + 491946.0) - 1341186.0) + 1965117.0) - 1634115.0) + 732249.0) - 138327.0) - 6.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) * 0.001388888888888889);} inline float theta_l6_6(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (25228.0 * y - 163982.0) + 447062.0) - 655039.0) + 544705.0) - 244083.0) + 46109.0) + 1.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (25228.0 * y - 163982.0) + 447062.0) - 655039.0) + 544705.0) - 244083.0) + 46109.0) + 1.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) * 0.001388888888888889);} inline float iota_l6_6(float y){ - return ((y * y * y * y * y * y * y * (y * (y * (y * (y * (y * (-3604.0 * y + 23426.0) - 63866.0) + 93577.0) - 77815.0) + 34869.0) - 6587.0)) / 720.0);} + return ((y * y * y * y * y * y * y * (y * (y * (y * (y * (y * (-3604.0 * y + 23426.0) - 63866.0) + 93577.0) - 77815.0) + 34869.0) - 6587.0)) * 0.001388888888888889);} inline float alpha_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-3569.0 * y + 16061.0) - 27454.0) + 21126.0) - 6125.0) + 49.0) - 196.0) - 36.0) + 144.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-3569.0 * y + 16061.0) - 27454.0) + 21126.0) - 6125.0) + 49.0) - 196.0) - 36.0) + 144.0)) * 2.48015873015873e-05);} inline float beta_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (32121.0 * y - 144548.0) + 247074.0) - 190092.0) + 55125.0) - 672.0) + 2016.0) + 512.0) - 1536.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (32121.0 * y - 144548.0) + 247074.0) - 190092.0) + 55125.0) - 672.0) + 2016.0) + 512.0) - 1536.0)) * 2.48015873015873e-05);} inline float gamma_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-128484.0 * y + 
578188.0) - 988256.0) + 760312.0) - 221060.0) + 4732.0) - 9464.0) - 4032.0) + 8064.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-128484.0 * y + 578188.0) - 988256.0) + 760312.0) - 221060.0) + 4732.0) - 9464.0) - 4032.0) + 8064.0)) * 2.48015873015873e-05);} inline float delta_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (299796.0 * y - 1349096.0) + 2305856.0) - 1774136.0) + 517580.0) - 13664.0) + 13664.0) + 32256.0) - 32256.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (299796.0 * y - 1349096.0) + 2305856.0) - 1774136.0) + 517580.0) - 13664.0) + 13664.0) + 32256.0) - 32256.0)) * 2.48015873015873e-05);} inline float eta_l8_4(float y){ - return ((y * y * (y * y * (y * (y * (y * (y * (-449694.0 * y + 2023630.0) - 3458700.0) + 2661540.0) - 778806.0) + 19110.0) - 57400.0) + 40320.0) / 40320.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-449694.0 * y + 2023630.0) - 3458700.0) + 2661540.0) - 778806.0) + 19110.0) - 57400.0) + 40320.0) * 2.48015873015873e-05);} inline float zeta_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (449694.0 * y - 2023616.0) + 3458644.0) - 2662016.0) + 780430.0) - 13664.0) - 13664.0) + 32256.0) + 32256.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (449694.0 * y - 2023616.0) + 3458644.0) - 2662016.0) + 780430.0) - 13664.0) - 13664.0) + 32256.0) + 32256.0)) * 2.48015873015873e-05);} inline float theta_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-299796.0 * y + 1349068.0) - 2305744.0) + 1775032.0) - 520660.0) + 4732.0) + 9464.0) - 4032.0) - 8064.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-299796.0 * y + 1349068.0) - 2305744.0) + 1775032.0) - 520660.0) + 4732.0) + 9464.0) - 4032.0) - 8064.0)) * 2.48015873015873e-05);} inline float iota_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (128484.0 * y - 578168.0) + 988176.0) - 760872.0) + 223020.0) - 672.0) - 2016.0) + 512.0) + 
1536.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (128484.0 * y - 578168.0) + 988176.0) - 760872.0) + 223020.0) - 672.0) - 2016.0) + 512.0) + 1536.0)) * 2.48015873015873e-05);} inline float kappa_l8_4(float y){ - return ((y * (y * (y * (y * (y * (y * (y * (y * (-32121.0 * y + 144541.0) - 247046.0) + 190246.0) - 55685.0) + 49.0) + 196.0) - 36.0) - 144.0)) / 40320.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-32121.0 * y + 144541.0) - 247046.0) + 190246.0) - 55685.0) + 49.0) + 196.0) - 36.0) - 144.0)) * 2.48015873015873e-05);} inline float mu_l8_4(float y){ - return ((y * y * y * y * y * (y * (y * (y * (3569.0 * y - 16060.0) + 27450.0) - 21140.0) + 6181.0)) / 40320.0);} - - -#endif + return ((y * y * y * y * y * (y * (y * (y * (3569.0 * y - 16060.0) + 27450.0) - 21140.0) + 6181.0)) * 2.48015873015873e-05);} diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl index e1dc7d29e..4c0e12480 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl @@ -5,195 +5,194 @@ */ inline float alpha_l2_1(float y){ - return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);} + return (y*fma(y,fma(y,-1.0, 2.0), - 1.0) * 0.5);} inline float beta_l2_1(float y){ - return (fma(y*y, fma(y, 3.0, -5.0), 2.0) / 2.0);} + return (fma(y*y, fma(y, 3.0, -5.0), 2.0) * 0.5);} inline float gamma_l2_1(float y){ - return ((y * fma(y , fma(-3.0, y, 4.0), 1.0)) / 2.0);} + return ((y * fma(y , fma(-3.0, y, 4.0), 1.0)) * 0.5);} inline float delta_l2_1(float y){ - return ((y * y * fma(1.0, y, - 1.0)) / 2.0);} + return ((y * y * fma(1.0, y, - 1.0)) * 0.5);} inline float alpha_l2_2(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 2.0, -5.0), 3.0), 1.0), -1.0)) / 2.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 2.0, -5.0), 3.0), 1.0), -1.0)) * 0.5);} inline float beta_l2_2(float y){ - return (fma(y * y, fma(y, fma(y, fma(y, -6.0, 
15.0), -9.0), -2.0), 2.0) / 2.0);} + return (fma(y * y, fma(y, fma(y, fma(y, -6.0, 15.0), -9.0), -2.0), 2.0) * 0.5);} inline float gamma_l2_2(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 6.0, -15.0), 9.0), 1.0), 1.0)) / 2.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 6.0, -15.0), 9.0), 1.0), 1.0)) * 0.5);} inline float delta_l2_2(float y){ - return ((y * y * y * fma(y, fma(y, -2.0, 5.0), -3.0)) / 2.0);} + return ((y * y * y * fma(y, fma(y, -2.0, 5.0), -3.0)) * 0.5);} inline float alpha_l2_3(float y){ - return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -6.0, 21.0), -25.0), 10.0), 1.0), -1.0)) / 2.0);} + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -6.0, 21.0), -25.0), 10.0), 1.0), -1.0)) * 0.5);} inline float beta_l2_3(float y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 18.0, -63.0), 75.0), -30.0), -2.0), 2.0) / 2.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 18.0, -63.0), 75.0), -30.0), -2.0), 2.0) * 0.5);} inline float gamma_l2_3(float y){ - return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -18.0, 63.0), -75.0), 30.0), 1.0), 1.0)) / 2.0);} + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -18.0, 63.0), -75.0), 30.0), 1.0), 1.0)) * 0.5);} inline float delta_l2_3(float y){ - return ((y * y * y * y * fma(y, fma(y, fma(y, 6.0, -21.0), 25.0), -10.0)) / 2.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, 6.0, -21.0), 25.0), -10.0)) * 0.5);} inline float alpha_l2_4(float y){ - return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 20.0, -90.0), 154.0), -119.0), 35.0), 1.0), -1.0)) / 2.0);} + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 20.0, -90.0), 154.0), -119.0), 35.0), 1.0), -1.0)) * 0.5);} inline float beta_l2_4(float y){ - return (fma(y * y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, -60.0, 270.0), -462.0), 357.0), -105.0), -2.0), 2.0) / 2.0);} + return (fma(y * y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, -60.0, 270.0), -462.0), 357.0), -105.0), -2.0), 2.0) * 0.5);} inline float 
gamma_l2_4(float y){ - return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 60.0, -270.0), 462.0), -357.0), 105.0), 1.0), 1.0)) / 2.0);} + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 60.0, -270.0), 462.0), -357.0), 105.0), 1.0), 1.0)) * 0.5);} inline float delta_l2_4(float y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -20.0, 90.0), -154.0), 119.0), -35.0)) / 2.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -20.0, 90.0), -154.0), 119.0), -35.0)) * 0.5);} inline float alpha_l4_2(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, -5.0, 13.0), -9.0), -1.0), 2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, -5.0, 13.0), -9.0), -1.0), 2.0)) * 0.041666666666666664);} inline float beta_l4_2(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 25.0, -64.0), 39.0), 16.0), -16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 25.0, -64.0), 39.0), 16.0), -16.0)) * 0.041666666666666664);} inline float gamma_l4_2(float y){ - return (fma(y * y, fma(y, fma(y, fma(y, -50.0, 126.0), -70.0), -30.0), 24.0) / 24.0);} + return (fma(y * y, fma(y, fma(y, fma(y, -50.0, 126.0), -70.0), -30.0), 24.0) * 0.041666666666666664);} inline float delta_l4_2(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, 50.0, -124.0), 66.0), 16.0), 16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 50.0, -124.0), 66.0), 16.0), 16.0)) * 0.041666666666666664);} inline float eta_l4_2(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, -25.0, 61.0), -33.0), -1.0), -2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, -25.0, 61.0), -33.0), -1.0), -2.0)) * 0.041666666666666664);} inline float zeta_l4_2(float y){ - return ((y * y * y * fma(y, fma(y, 5.0, -12.0), 7.0)) / 24.0);} + return ((y * y * y * fma(y, fma(y, 5.0, -12.0), 7.0)) * 0.041666666666666664);} inline float alpha_l4_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 14.0, -49.0), 58.0), -22.0), -2.0), -1.0), 2.0)) / 24.0);} + return ((y * 
fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 14.0, -49.0), 58.0), -22.0), -2.0), -1.0), 2.0)) * 0.041666666666666664);} inline float beta_l4_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -70.0, 245.0), -290.0), 111.0), 4.0), 16.0), -16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -70.0, 245.0), -290.0), 111.0), 4.0), 16.0), -16.0)) * 0.041666666666666664);} inline float gamma_l4_3(float y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 140.0, -490.0), 580.0), -224.0), -30.0), 24.0) / 24.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 140.0, -490.0), 580.0), -224.0), -30.0), 24.0) * 0.041666666666666664);} inline float delta_l4_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -140.0, 490.0), -580.0), 226.0), -4.0), 16.0), 16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -140.0, 490.0), -580.0), 226.0), -4.0), 16.0), 16.0)) * 0.041666666666666664);} inline float eta_l4_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 70.0, -245.0), 290.0), -114.0), 2.0), -1.0), -2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 70.0, -245.0), 290.0), -114.0), 2.0), -1.0), -2.0)) * 0.041666666666666664);} inline float zeta_l4_3(float y){ - return ((y * y * y * y * fma(y, fma(y, fma(y, -14.0, 49.0), -58.0), 23.0)) / 24.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, -14.0, 49.0), -58.0), 23.0)) * 0.041666666666666664);} inline float alpha_l4_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -46.0, 207.0), -354.0), 273.0), -80.0), 1.0), -2.0), -1.0), 2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -46.0, 207.0), -354.0), 273.0), -80.0), 1.0), -2.0), -1.0), 2.0)) * 0.041666666666666664);} inline float beta_l4_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 230.0, -1035.0), 1770.0), -1365.0), 400.0), -4.0), 4.0), 16.0), -16.0)) / 
24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 230.0, -1035.0), 1770.0), -1365.0), 400.0), -4.0), 4.0), 16.0), -16.0)) * 0.041666666666666664);} inline float gamma_l4_4(float y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -460.0, 2070.0), -3540.0), 2730.0), -800.0), 6.0), -30.0), 24.0) / 24.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -460.0, 2070.0), -3540.0), 2730.0), -800.0), 6.0), -30.0), 24.0) * 0.041666666666666664);} inline float delta_l4_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 460.0, -2070.0), 3540.0), -2730.0), 800.0), -4.0), -4.0), 16.0), 16.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 460.0, -2070.0), 3540.0), -2730.0), 800.0), -4.0), -4.0), 16.0), 16.0)) * 0.041666666666666664);} inline float eta_l4_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -230.0, 1035.0), -1770.0), 1365.0), -400.0), 1.0), 2.0), -1.0), -2.0)) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -230.0, 1035.0), -1770.0), 1365.0), -400.0), 1.0), 2.0), -1.0), -2.0)) * 0.041666666666666664);} inline float zeta_l4_4(float y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 46.0, -207.0), 354.0), -273.0), 80.0)) / 24.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 46.0, -207.0), 354.0), -273.0), 80.0)) * 0.041666666666666664);} inline float alpha_M8p(float y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) * 0.00029761904761904765);} inline float beta_M8p(float y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) / 3360.0);} + return 
(fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(70.0,y, - 175.0), - 140.0), + 770.0), - 560.0), - 350.0), + 504.0), - 102.0) * 0.00029761904761904765);} inline float gamma_M8p(float y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-210.0,y, + 609.0), + 224.0), - 2135.0), + 910.0), + 2765.0), - 2520.0), + 255.0) * 0.00029761904761904765);} inline float delta_M8p(float y){ - return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) / 672.0);} + return (fma(y*y, fma(y*y, fma(y*y, fma(70.0,y, - 231.0), + 588.0), - 980.0), + 604.0) * 0.001488095238095238);} inline float eta_M8p(float y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) / 672.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-70.0,y, 259.0), - 84.0), - 427.0), - 182.0), + 553.0), + 504.0), + 51.0) * 0.001488095238095238);} inline float zeta_M8p(float y){ - return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) / 3360.0);} + return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(210.0,y,- 861.0), + 532.0), + 770.0), + 560.0), - 350.0), - 504.0), - 102.0) * 0.00029761904761904765);} inline float theta_M8p(float y){ - return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0);} + return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) * 0.00029761904761904765);} inline float iota_M8p(float y){ - return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0);} + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) * 0.00029761904761904765);} inline float alpha_l6_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 
312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float beta_l6_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float gamma_l6_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float delta_l6_3(float y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) * 0.001388888888888889);} inline float eta_l6_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float zeta_l6_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float theta_l6_3(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) * 
0.001388888888888889);} inline float iota_l6_3(float y){ - return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) / 720.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) * 0.001388888888888889);} inline float alpha_l6_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 290.0, -1305.0), 2231.0), -1718.0), 500.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 290.0, -1305.0), 2231.0), -1718.0), 500.0), -5.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float beta_l6_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -2030.0, 9135.0), -15617.0), 12027.0), -3509.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -2030.0, 9135.0), -15617.0), 12027.0), -3509.0), 60.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float gamma_l6_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 6090.0, -27405.0), 46851.0), -36084.0), 10548.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 6090.0, -27405.0), 46851.0), -36084.0), 10548.0), -195.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float delta_l6_4(float y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -10150.0, 45675.0), -78085.0), 60145.0), -17605.0), 280.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -10150.0, 45675.0), -78085.0), 60145.0), -17605.0), 280.0), -980.0), 720.0) * 0.001388888888888889);} inline float eta_l6_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 10150.0, -45675.0), 78085.0), -60150.0), 17620.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 
10150.0, -45675.0), 78085.0), -60150.0), 17620.0), -195.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float zeta_l6_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -6090.0, 27405.0), -46851.0), 36093.0), -10575.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -6090.0, 27405.0), -46851.0), 36093.0), -10575.0), 60.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float theta_l6_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 2030.0, -9135.0), 15617.0), -12032.0), 3524.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 2030.0, -9135.0), 15617.0), -12032.0), 3524.0), -5.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float iota_l6_4(float y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -290.0, 1305.0), -2231.0), 1719.0), -503.0)) / 720.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -290.0, 1305.0), -2231.0), 1719.0), -503.0)) * 0.001388888888888889);} inline float alpha_l6_5(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1006.0, 5533.0), -12285.0), 13785.0), -7829.0), 1803.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1006.0, 5533.0), -12285.0), 13785.0), -7829.0), 1803.0), -3.0), -5.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float beta_l6_5(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 7042.0, -38731.0), 85995.0), -96495.0), 54803.0), -12620.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 7042.0, -38731.0), 85995.0), -96495.0), 54803.0), -12620.0), 12.0), 60.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} 
inline float gamma_l6_5(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -21126.0, 116193.0), -257985.0), 289485.0), -164409.0), 37857.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -21126.0, 116193.0), -257985.0), 289485.0), -164409.0), 37857.0), -15.0), -195.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float delta_l6_5(float y){ - return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, 35210.0, -193655.0), 429975.0), -482475.0), 274015.0), -63090.0), 280.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, 35210.0, -193655.0), 429975.0), -482475.0), 274015.0), -63090.0), 280.0), -980.0), 720.0) * 0.001388888888888889);} inline float eta_l6_5(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -35210.0, 193655.0), -429975.0), 482475.0), -274015.0), 63085.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -35210.0, 193655.0), -429975.0), 482475.0), -274015.0), 63085.0), 15.0), -195.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float zeta_l6_5(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 21126.0, -116193.0), 257985.0), -289485.0), 164409.0), -37848.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 21126.0, -116193.0), 257985.0), -289485.0), 164409.0), -37848.0), -12.0), 60.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float theta_l6_5(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -7042.0, 38731.0), -85995.0), 96495.0), -54803.0), 12615.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} + 
return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -7042.0, 38731.0), -85995.0), 96495.0), -54803.0), 12615.0), 3.0), -5.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float iota_l6_5(float y){ - return ((y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, 1006.0, -5533.0), 12285.0), -13785.0), 7829.0), -1802.0)) / 720.0);} + return ((y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, 1006.0, -5533.0), 12285.0), -13785.0), 7829.0), -1802.0)) * 0.001388888888888889);} inline float alpha_l6_6(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 3604.0, -23426.0), 63866.0), -93577.0), 77815.0), -34869.0), 6587.0), 1.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 3604.0, -23426.0), 63866.0), -93577.0), 77815.0), -34869.0), 6587.0), 1.0), -3.0), -5.0), 15.0), 4.0), -12.0)) * 0.001388888888888889);} inline float beta_l6_6(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -25228.0, 163982.0), -447062.0), 655039.0), -544705.0), 244083.0), -46109.0), -6.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -25228.0, 163982.0), -447062.0), 655039.0), -544705.0), 244083.0), -46109.0), -6.0), 12.0), 60.0), -120.0), -54.0), 108.0)) * 0.001388888888888889);} inline float gamma_l6_6(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 75684.0, -491946.0), 1341186.0), -1965117.0), 1634115.0), -732249.0), 138327.0), 15.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 75684.0, -491946.0), 1341186.0), -1965117.0), 1634115.0), -732249.0), 138327.0), 15.0), 
-15.0), -195.0), 195.0), 540.0), -540.0)) * 0.001388888888888889);} inline float delta_l6_6(float y){ - return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -126140.0, 819910.0), -2235310.0), 3275195.0), -2723525.0), 1220415.0), -230545.0), -20.0), 280.0), -980.0), 720.0) / 720.0);} + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -126140.0, 819910.0), -2235310.0), 3275195.0), -2723525.0), 1220415.0), -230545.0), -20.0), 280.0), -980.0), 720.0) * 0.001388888888888889);} inline float eta_l6_6(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 126140.0, -819910.0), 2235310.0), -3275195.0), 2723525.0), -1220415.0), 230545.0), 15.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 126140.0, -819910.0), 2235310.0), -3275195.0), 2723525.0), -1220415.0), 230545.0), 15.0), 15.0), -195.0), -195.0), 540.0), 540.0)) * 0.001388888888888889);} inline float zeta_l6_6(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -75684.0, 491946.0), -1341186.0), 1965117.0), -1634115.0), 732249.0), -138327.0), -6.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -75684.0, 491946.0), -1341186.0), 1965117.0), -1634115.0), 732249.0), -138327.0), -6.0), -12.0), 60.0), 120.0), -54.0), -108.0)) * 0.001388888888888889);} inline float theta_l6_6(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 25228.0, -163982.0), 447062.0), -655039.0), 544705.0), -244083.0), 46109.0), 1.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 25228.0, -163982.0), 
447062.0), -655039.0), 544705.0), -244083.0), 46109.0), 1.0), 3.0), -5.0), -15.0), 4.0), 12.0)) * 0.001388888888888889);} inline float iota_l6_6(float y){ - return ((y * y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3604.0, 23426.0), -63866.0), 93577.0), -77815.0), 34869.0), -6587.0)) / 720.0);} + return ((y * y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3604.0, 23426.0), -63866.0), 93577.0), -77815.0), 34869.0), -6587.0)) * 0.001388888888888889);} inline float alpha_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3569.0, 16061.0), -27454.0), 21126.0), -6125.0), 49.0), -196.0), -36.0), 144.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3569.0, 16061.0), -27454.0), 21126.0), -6125.0), 49.0), -196.0), -36.0), 144.0)) * 2.48015873015873e-05);} inline float beta_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 32121.0, -144548.0), 247074.0), -190092.0), 55125.0), -672.0), 2016.0), 512.0), -1536.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 32121.0, -144548.0), 247074.0), -190092.0), 55125.0), -672.0), 2016.0), 512.0), -1536.0)) * 2.48015873015873e-05);} inline float gamma_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -128484.0, 578188.0), -988256.0), 760312.0), -221060.0), 4732.0), -9464.0), -4032.0), 8064.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -128484.0, 578188.0), -988256.0), 760312.0), -221060.0), 4732.0), -9464.0), -4032.0), 8064.0)) * 2.48015873015873e-05);} inline float delta_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 299796.0, -1349096.0), 2305856.0), -1774136.0), 517580.0), -13664.0), 13664.0), 32256.0), -32256.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 299796.0, -1349096.0), 2305856.0), 
-1774136.0), 517580.0), -13664.0), 13664.0), 32256.0), -32256.0)) * 2.48015873015873e-05);} inline float eta_l8_4(float y){ - return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -449694.0, 2023630.0), -3458700.0), 2661540.0), -778806.0), 19110.0), -57400.0), 40320.0) / 40320.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -449694.0, 2023630.0), -3458700.0), 2661540.0), -778806.0), 19110.0), -57400.0), 40320.0) * 2.48015873015873e-05);} inline float zeta_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 449694.0, -2023616.0), 3458644.0), -2662016.0), 780430.0), -13664.0), -13664.0), 32256.0), 32256.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 449694.0, -2023616.0), 3458644.0), -2662016.0), 780430.0), -13664.0), -13664.0), 32256.0), 32256.0)) * 2.48015873015873e-05);} inline float theta_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -299796.0, 1349068.0), -2305744.0), 1775032.0), -520660.0), 4732.0), 9464.0), -4032.0), -8064.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -299796.0, 1349068.0), -2305744.0), 1775032.0), -520660.0), 4732.0), 9464.0), -4032.0), -8064.0)) * 2.48015873015873e-05);} inline float iota_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 128484.0, -578168.0), 988176.0), -760872.0), 223020.0), -672.0), -2016.0), 512.0), 1536.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 128484.0, -578168.0), 988176.0), -760872.0), 223020.0), -672.0), -2016.0), 512.0), 1536.0)) * 2.48015873015873e-05);} inline float kappa_l8_4(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -32121.0, 144541.0), -247046.0), 190246.0), -55685.0), 49.0), 196.0), -36.0), -144.0)) / 40320.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -32121.0, 144541.0), 
-247046.0), 190246.0), -55685.0), 49.0), 196.0), -36.0), -144.0)) * 2.48015873015873e-05);} inline float mu_l8_4(float y){ - return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 3569.0, -16060.0), 27450.0), -21140.0), 6181.0)) / 40320.0);} - + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 3569.0, -16060.0), 27450.0), -21140.0), 6181.0)) * 2.48015873015873e-05);} diff --git a/HySoP/hysop/gpu/config_default.py b/HySoP/hysop/gpu/config_default.py index 3a96866ed..8bad707b9 100644 --- a/HySoP/hysop/gpu/config_default.py +++ b/HySoP/hysop/gpu/config_default.py @@ -172,3 +172,8 @@ kernels_config[2][DOUBLE_GPU]['advec_and_remesh'] = \ "kernels/advection_and_remeshing_noVec.cl"], False, 1, advection_and_remeshing_index_space) +kernels_config[3][FLOAT_GPU]['diffusion'] = \ + (["common.cl", "kernels/diffusion.cl"], + 16, 1, 1, + lambda size, nb_part, tile: ((size[0], size[1] / nb_part), + (tile, tile / nb_part))) diff --git a/HySoP/hysop/gpu/config_k20m.py b/HySoP/hysop/gpu/config_k20m.py index 102633579..e80548af6 100644 --- a/HySoP/hysop/gpu/config_k20m.py +++ b/HySoP/hysop/gpu/config_k20m.py @@ -180,3 +180,8 @@ kernels_config[2][DOUBLE_GPU]['advec_and_remesh'] = \ "kernels/advection_and_remeshing_noVec.cl"], False, 1, advection_and_remeshing_index_space) +kernels_config[3][DOUBLE_GPU]['diffusion'] = \ + (["common.cl", "kernels/diffusion.cl"], + 16, 4, 1, + lambda size, nb_part, tile: ((size[0], size[1] / nb_part), + (tile, tile / nb_part))) diff --git a/HySoP/hysop/gpu/gpu_operator.py b/HySoP/hysop/gpu/gpu_operator.py index ddae1f375..cf6e1387f 100644 --- a/HySoP/hysop/gpu/gpu_operator.py +++ b/HySoP/hysop/gpu/gpu_operator.py @@ -51,7 +51,7 @@ class GPUOperator(object): if self.dim == 3 and self.direction == 1: self._reorderVect = lambda v: (v[1], v[0], v[2]) if self.dim == 3 and self.direction == 2: - if self._main_size == 1 and self.method[Splitting].find('o2') >= 0: + if self._comm_size == 1 and self.method[Splitting].find('o2') >= 0: self._reorderVect = 
lambda v: (v[2], v[0], v[1]) else: self._reorderVect = lambda v: (v[2], v[1], v[0]) diff --git a/HySoP/hysop/gpu/tools.py b/HySoP/hysop/gpu/tools.py index 98a369834..acee2503b 100644 --- a/HySoP/hysop/gpu/tools.py +++ b/HySoP/hysop/gpu/tools.py @@ -5,9 +5,10 @@ Tools for gpu management. """ from parmepy import __VERBOSE__, __DEFAULT_PLATFORM_ID__, __DEFAULT_DEVICE_ID__ from parmepy.constants import np, PARMES_REAL, ORDER -from parmepy.gpu import cl, clTools, GPU_SRC, CL_PROFILE, clArray +from parmepy.gpu import cl, clTools, GPU_SRC, CL_PROFILE import parmepy.tools.numpywrappers as npw import re +import mpi4py.MPI as mpi FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64 @@ -43,13 +44,22 @@ class OpenCLEnvironment(object): self.ctx = self._get_context(self.device, gl_sharing) ## OpenCL queue self.queue = self._get_queue(self.ctx) + ## MPI sub-communicator for all processes attached to the same device if comm is None: from parmepy.mpi.main_var import main_comm else: main_comm = comm - self.gpu_comm = main_comm.Split(color=device_id, - key=main_comm.Get_rank()) + # Splitting the mpi communicator by the device id is not enough: + # the id of the first gpu of each node is 0 + # We build color from the processor name and the id + import hashlib + # The md5 sum of the proc name is tuncated to obtain an integer + # for fortran (32bit) + hash_name = hashlib.md5(mpi.Get_processor_name()).hexdigest()[-7:] + self.gpu_comm = main_comm.Split( + color=int(hash_name, 16) + device_id, + key=main_comm.Get_rank()) ## Memory Pool allocator (immediate allocator) self.memPool = clTools.MemoryPool( @@ -61,7 +71,7 @@ class OpenCLEnvironment(object): elif self.precision is DOUBLE_GPU: self.prec_size = 8 self.macros = {} - self.default_build_opts = "-Werror" + self.default_build_opts = "-Werror" + self._get_precision_opts() ## Kernels configuration dictionary if self.device.name == "Cayman": @@ -114,7 +124,7 @@ class OpenCLEnvironment(object): print ("Warning, GPU precision is overrided from",) 
print (self.precision, 'to', precision) self.precision = precision - self.default_build_opts = self._get_precision_opts() + self.default_build_opts = "-Werror" + self._get_precision_opts() def _get_platform(self, platform_id): """ @@ -360,9 +370,12 @@ class OpenCLEnvironment(object): for k in self.macros: gpu_src = gpu_src.replace(k, str(self.macros[k])) if self.precision is FLOAT_GPU: - float_replace = re.compile(r'(?P<float>\d\.\d+)') + # Rexexp to add 'f' suffix to float constants + # Match 1.2, 1.234, 1.2e3, 1.2E-05 + float_replace = re.compile(r'(?P<float>\d\.\d+((e|E)-?\d+)?)') prg = cl.Program( - self.ctx, float_replace.sub(r'\g<float>f', gpu_src)) + self.ctx, + float_replace.sub(r'\g<float>f', gpu_src)) else: prg = cl.Program(self.ctx, gpu_src.replace('float', 'double')) # OpenCL program @@ -486,47 +499,64 @@ class OpenCLEnvironment(object): return src def global_allocation(self, array): - buff = clArray.empty(self.queue, array.shape, self.precision, - order=ORDER, allocator=self.memPool) - self.available_mem -= array.nbytes - return buff - - def LocalMemAllocator(self, sizes_list): + # buff = clArray.empty(self.queue, array.shape, self.precision, + # order=ORDER, allocator=self.memPool) + # self.available_mem -= array.nbytes + # return buff + clBuff = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, + size=int(self.prec_size * np.prod(array.shape))) + # Touch the buffer on device to performs the allocation + # Transfers a single element in device (the precision no matters here) + e = np.zeros((1,), dtype=np.float64) + cl.enqueue_copy(self.queue, clBuff, e, + buffer_origin=(0,0,0), host_origin=(0,0,0), + region=(e.nbytes,)).wait() + self.available_mem -= clBuff.size + return clBuff + + def LocalMemAllocator(self, sizes_list, type_list=None): """ Allocates spaces in device local memory. @param sizes_list : list of sizes. 
+ @param type_list : list of corresponding types It returns a list of buffers of given size (one per size specified in in the list) and the size of new buffers. @remark : Buffers are stored and could be reused. @remark : it assumes that all returned buffers are different """ new_alloc = 0 + if type_list is None: + type_list = [PARMES_REAL] * len(sizes_list) buff_list = [] # Returned list - for size in sizes_list: + keys_list = [] + for s, t in zip(sizes_list, type_list): + keys_list.append(int(t(0).nbytes * s)) + + for size, key, t in zip(sizes_list, keys_list, type_list): buff = None try: # List of existing buffers not already in the list - avail_buff = [b for b in self._locMem_Buffers[size] + avail_buff = [b for b in self._locMem_Buffers[key] if b not in buff_list] if len(avail_buff) > 0: # adding the first buffer buff = avail_buff[0] else: # Allocate a new buffer - buff = cl.LocalMemory(int(self.prec_size * size)) + buff = cl.LocalMemory(int(t(0).nbytes * size)) new_alloc += buff.size - self._locMem_Buffers[size].append(buff) + self._locMem_Buffers[key].append(buff) except KeyError: # Allocate a fist buffer of given size - buff = cl.LocalMemory(int(self.prec_size * size)) + buff = cl.LocalMemory(int(t(0).nbytes * size)) new_alloc += buff.size - self._locMem_Buffers[size] = [buff] + self._locMem_Buffers[key] = [buff] buff_list.append(buff) return buff_list, new_alloc -def get_opengl_shared_environment(platform_id=__DEFAULT_PLATFORM_ID__, - device_id=__DEFAULT_DEVICE_ID__, +def get_opengl_shared_environment(platform_id=None, + device_id=None, device_type=None, precision=PARMES_REAL, comm=None): """ @@ -540,6 +570,10 @@ def get_opengl_shared_environment(platform_id=__DEFAULT_PLATFORM_ID__, The context is obtained with gl-shared properties depending on the OS. 
""" + if platform_id is None: + platform_id = __DEFAULT_PLATFORM_ID__ + if device_id is None: + device_id = __DEFAULT_DEVICE_ID__ global __cl_env if __cl_env is None: __cl_env = OpenCLEnvironment(platform_id, device_id, device_type, @@ -551,7 +585,7 @@ def get_opengl_shared_environment(platform_id=__DEFAULT_PLATFORM_ID__, def get_opencl_environment(platform_id=None, - device_id=__DEFAULT_DEVICE_ID__, + device_id=None, device_type=None, precision=PARMES_REAL, comm=None): """ diff --git a/HySoP/hysop/operator/monitors/printer.py b/HySoP/hysop/operator/monitors/printer.py index 4d773c732..8f0110ac9 100644 --- a/HySoP/hysop/operator/monitors/printer.py +++ b/HySoP/hysop/operator/monitors/printer.py @@ -110,7 +110,7 @@ class Printer(Monitoring): self.subset.globalResolution(self._topology) else: self.globalResolution = \ - list(self._topology.mesh.resolution - 1) + list(self._topology.mesh.resolution) self._slices = self._topology.mesh.iCompute self.globalResolution.reverse() -- GitLab