diff --git a/HySoP/CMake/ParmesTests.cmake b/HySoP/CMake/ParmesTests.cmake index 3326a2ee8b0b801c1461dceb6d644d8747cd25be..af9db544aa8b46d34e73cc8df1e8aa607f3df669 100644 --- a/HySoP/CMake/ParmesTests.cmake +++ b/HySoP/CMake/ParmesTests.cmake @@ -27,6 +27,21 @@ set(ENV{PYTHONPATH} ${testDir}) display(testDir) +## Copy the OpenCL sources files to build dir (only python files are copied by setup.py) +set(clfiles) +file(GLOB clfilestmp RELATIVE ${CMAKE_SOURCE_DIR} parmepy/gpu/cl_src/*.cl) +set(clfiles ${clfiles} ${clfilestmp}) +file(GLOB clfilestmp RELATIVE ${CMAKE_SOURCE_DIR} parmepy/gpu/cl_src/kernels/*.cl) +set(clfiles ${clfiles} ${clfilestmp}) +file(GLOB clfilestmp RELATIVE ${CMAKE_SOURCE_DIR} parmepy/gpu/cl_src/advection/*.cl) +set(clfiles ${clfiles} ${clfilestmp}) +file(GLOB clfilestmp RELATIVE ${CMAKE_SOURCE_DIR} parmepy/gpu/cl_src/remeshing/*.cl) +set(clfiles ${clfiles} ${clfilestmp}) +foreach(_F ${clfiles}) + configure_file(${_F} ${testDir}/${_F} COPYONLY) +endforeach() + + ## Build a list of test_*.py files for each directory of parmepy/${py_src_dirs} set(py_test_files) foreach(testdir ${py_src_dirs}) @@ -73,7 +88,7 @@ endforeach() foreach(testfile ${py_doctest_files}) get_filename_component(testName ${testfile} NAME_WE) message(STATUS "Add test from doc doctest_${testName} ...") - add_test("doctest_${testName}" py.test -v ${testfile}) + add_test("doctest_${testName}" py.test -v --doctest-modules ${testfile}) endforeach() message(STATUS "===") diff --git a/HySoP/hysop/domain/box.py b/HySoP/hysop/domain/box.py index 92fcd9755f8932452ea5160423506e9dad4303c4..668353f92f7534d00a628afe376c4981ef8d3d88 100644 --- a/HySoP/hysop/domain/box.py +++ b/HySoP/hysop/domain/box.py @@ -34,6 +34,7 @@ class Box(Domain): >>> b = pp.Box() >>> (b.max == np.asarray([1.0, 1.0, 1.0])).all() True + \endcode """ if not (dimension == len(length) and dimension == len(origin)): diff --git a/HySoP/hysop/domain/tests/test_box.py b/HySoP/hysop/domain/tests/test_box.py index d7d034ff14be6e4e33111b4ff6efbe8ce9a4901a..c8136d762e1d61d360ca3d3b7ab94f687e3d2870 100644 --- a/HySoP/hysop/domain/tests/test_box.py +++ b/HySoP/hysop/domain/tests/test_box.py @@ -4,7 +4,6 @@ Testing parmepy.domain.box.Box from parmepy.constants import np, PERIODIC from parmepy.domain.box import Box - def test_create_box1(): """Test default parameters""" dom = Box() diff --git a/HySoP/hysop/fields/continuous.py b/HySoP/hysop/fields/continuous.py index f7a7c35f7a10b71924bff712dd3cacfd8ce52107..1888638bda9b64ce92fc448487e2e59ed6ecf8d4 100644 --- a/HySoP/hysop/fields/continuous.py +++ b/HySoP/hysop/fields/continuous.py @@ -84,7 +84,7 @@ class Field(object): if topo in self.discreteFields.keys(): return self.discreteFields[topo] else: - nameD = self.name + str(topo.getId()) + nameD = self.name + '_' + str(topo.getId()) self.discreteFields[topo] = DiscreteField(topo, isVector=self.isVector, name=nameD) diff --git a/HySoP/hysop/gpu/QtRendering.py b/HySoP/hysop/gpu/QtRendering.py index 7f0dd4a4fc295050fb8ee91c545e545934fe868d..f13a87abf7756a4661d989f4226f65b7ec615093 100644 --- a/HySoP/hysop/gpu/QtRendering.py +++ b/HySoP/hysop/gpu/QtRendering.py @@ -14,6 +14,7 @@ from parmepy.gpu import PARMES_REAL_GPU, cl from parmepy.gpu.gpu_discrete import GPUDiscreteField from parmepy.gpu.gpu_kernel import KernelLauncher from parmepy.tools.timers import timed_function +from parmepy.mpi import MPI class QtOpenGLRendering(Monitoring): @@ -58,8 +59,10 @@ class QtOpenGLRendering(Monitoring): self.window = TestWindow() self.isGLRender = True self.input = [field] - self.component = component + self.component = component if field.isVector else 0 self.output = [] + self.ctime = 0. + self.mtime = 0. @debug def setUp(self): @@ -162,20 +165,17 @@ class QtOpenGLRendering(Monitoring): """ t = simulation.time dt = simulation.timeStep - print simulation + simulation.printState() # OpenCL update - if self.gpu_field.isVector: - self.numMethod(self.gpu_field.gpu_data[self.component], - self.color) - else: - self.numMethod(self.gpu_field.gpu_data[0], - self.color) + self.numMethod(self.gpu_field.gpu_data[self.component], + self.color) self.window.widget.updateGL() if simulation.currentIteration > 1: self.window.label.setText( self.labelText + "t={0:6.2f}, fps={1:6.2f}".format( t + dt, - 1. / (self.timer.f_timers.values()[0].times[-1]))) + 1. / (self.timer.f_timers.values()[0].t - self.ctime))) + self.ctime = self.timer.f_timers.values()[0].t @debug def finalize(self): diff --git a/HySoP/hysop/gpu/cl_src/common.cl b/HySoP/hysop/gpu/cl_src/common.cl index 449793f2a6a63d9523ca9c09dad63ebc444c4b29..ba88801f061d3aac992eae542ae94abb5859af73 100644 --- a/HySoP/hysop/gpu/cl_src/common.cl +++ b/HySoP/hysop/gpu/cl_src/common.cl @@ -32,13 +32,65 @@ inline uint noBC_id(int id){ /** * Constants for remeshing formulas: - * - M4PRIME - * - M6PRIME - * - M8PRIME - * - L6STAR + * - M4PRIME 1 + * - L2STAR_C2 2 + * - L2STAR_C3 3 + * - L2STAR_C4 4 + * - L4STAR 5 + * - L4STAR_C3 6 + * - L4STAR_C4 7 + * - L6STAR 8 + * - L6STAR_C4 9 + * - L6STAR_C5 10 + * - L6STAR_C6 11 + * - M8PRIME 12 + * - L8STAR 13 */ #define M4PRIME 1 -#define M6PRIME 2 -#define M8PRIME 3 -#define L6STAR 4 +#define L2STAR_C2 2 +#define L2STAR_C3 3 +#define L2STAR_C4 4 #define L4STAR 5 +#define L4STAR_C3 6 +#define L4STAR_C4 7 +#define L6STAR 8 +#define L6STAR_C4 9 +#define L6STAR_C5 10 +#define L6STAR_C6 11 +#define M8PRIME 12 +#define L8STAR 13 + +/** + * Shift to left grid point + */ +#if FORMULA == M4PRIME +#define REMESH_SHIFT 1 +#elif FORMULA == L2STAR_C2 +#define REMESH_SHIFT 1 +#elif FORMULA == L2STAR_C3 +#define REMESH_SHIFT 1 +#elif FORMULA == L2STAR_C4 +#define REMESH_SHIFT 1 + +#elif FORMULA == L4STAR +#define REMESH_SHIFT 2 +#elif FORMULA == L4STAR_C3 +#define REMESH_SHIFT 2 +#elif FORMULA == L4STAR_C4 +#define REMESH_SHIFT 2 + +#elif FORMULA == M8PRIME +#define REMESH_SHIFT 3 +#elif FORMULA == L6STAR +#define REMESH_SHIFT 3 +#elif FORMULA == L6STAR_C4 +#define REMESH_SHIFT 3 +#elif FORMULA == L6STAR_C5 +#define REMESH_SHIFT 3 +#elif FORMULA == L6STAR_C6 +#define REMESH_SHIFT 3 + +#elif FORMULA == L8STAR +#define REMESH_SHIFT 4 +#endif + diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic.cl index 3106d65caf8216229b841f352f997b877fce31d1..73596e388c365dbfb3b9e7ea3b6dfb6bdc5169a2 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic.cl @@ -35,13 +35,7 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint__N__((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME || FORMULA == L4STAR - index = convert_uint__N__((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint__N__((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); w__NN__ = alpha(y.s__NN__); gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); @@ -62,7 +56,7 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == L4STAR || FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w__NN__ = eta(y.s__NN__); gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); @@ -72,8 +66,9 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f w__NN__ = zeta(y.s__NN__); gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w__NN__ = theta(y.s__NN__); gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); @@ -85,5 +80,17 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f barrier(CLK_LOCAL_MEM_FENCE); #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w__NN__ = kappa(y.s__NN__); + gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w__NN__ = mu(y.s__NN__); + gscal_loc[noBC_id(index.s__NN__)] += (w__NN__ * s.s__NN__); + barrier(CLK_LOCAL_MEM_FENCE); + #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl index cd8a16962436dca6d363b2f2085507c8a8b800dd..b43a4adb746a3a4d5cae7a013af9e326ec6db294 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec.cl @@ -34,13 +34,7 @@ void remesh(uint i, float dx, float invdx, float s, float p, __local float* gsca ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I); gscal_loc[noBC_id(index)] += (alpha(y) * s); barrier(CLK_LOCAL_MEM_FENCE); @@ -57,7 +51,7 @@ void remesh(uint i, float dx, float invdx, float s, float p, __local float* gsca gscal_loc[noBC_id(index)] += (delta(y) * s); barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; gscal_loc[noBC_id(index)] += (eta(y) * s); barrier(CLK_LOCAL_MEM_FENCE); @@ -65,8 +59,9 @@ void remesh(uint i, float dx, float invdx, float s, float p, __local float* gsca index = (index + 1) % NB_I; gscal_loc[noBC_id(index)] += (zeta(y) * s); barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; gscal_loc[noBC_id(index)] += (theta(y) * s); barrier(CLK_LOCAL_MEM_FENCE); @@ -74,7 +69,15 @@ void remesh(uint i, float dx, float invdx, float s, float p, __local float* gsca index = (index + 1) % NB_I; gscal_loc[noBC_id(index)] += (iota(y) * s); barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + gscal_loc[noBC_id(index)] += (kappa(y) * s); + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + gscal_loc[noBC_id(index)] += (mu(y) * s); + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl index bdeab9b7b00dea32f081f97e0109e90a298a1744..822a3d0499db57b13b1f5d49997d237e7245b869 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_2d.cl @@ -41,13 +41,7 @@ void remesh(uint i, float dx, float invdx, ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I); w = alpha(y); gvec_X_loc[noBC_id(index)] += (w * v_X); @@ -72,7 +66,7 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index)] += (w * v_Y); barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w = eta(y); gvec_X_loc[noBC_id(index)] += (w * v_X); @@ -84,8 +78,9 @@ void remesh(uint i, float dx, float invdx, gvec_X_loc[noBC_id(index)] += (w * v_X); gvec_Y_loc[noBC_id(index)] += (w * v_Y); barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w = theta(y); gvec_X_loc[noBC_id(index)] += (w * v_X); @@ -97,7 +92,19 @@ void remesh(uint i, float dx, float invdx, gvec_X_loc[noBC_id(index)] += (w * v_X); gvec_Y_loc[noBC_id(index)] += (w * v_Y); barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w = kappa(y); + gvec_X_loc[noBC_id(index)] += (w * v_X); + gvec_Y_loc[noBC_id(index)] += (w * v_Y); + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w = mu(y); + gvec_X_loc[noBC_id(index)] += (w * v_X); + gvec_Y_loc[noBC_id(index)] += (w * v_Y); + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl index cc029a05f1e5f22386e148f1039ade5e92aef0a8..531ee957922d3ce989c5adbfdbc248b14af0aba3 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_noVec_vector_3d.cl @@ -41,13 +41,7 @@ void remesh(uint i, float dx, float invdx, ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint((ind - REMESH_SHIFT + NB_I) % NB_I); w = alpha(y); gvec_X_loc[noBC_id(index)] += (w * v_X); @@ -76,7 +70,7 @@ void remesh(uint i, float dx, float invdx, gvec_Z_loc[noBC_id(index)] += (w * v_Z); barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w = eta(y); gvec_X_loc[noBC_id(index)] += (w * v_X); @@ -90,8 +84,9 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index)] += (w * v_Y); gvec_Z_loc[noBC_id(index)] += (w * v_Z); barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w = theta(y); gvec_X_loc[noBC_id(index)] += (w * v_X); @@ -105,7 +100,21 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index)] += (w * v_Y); gvec_Z_loc[noBC_id(index)] += (w * v_Z); barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w = kappa(y); + gvec_X_loc[noBC_id(index)] += (w * v_X); + gvec_Y_loc[noBC_id(index)] += (w * v_Y); + gvec_Z_loc[noBC_id(index)] += (w * v_Z); + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w = mu(y); + gvec_X_loc[noBC_id(index)] += (w * v_X); + gvec_Y_loc[noBC_id(index)] += (w * v_Y); + gvec_Z_loc[noBC_id(index)] += (w * v_Z); + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl index 03311f9e34f9881a3e070165509be170c8497f8b..d024d83847c63efce98a7f2e0bd6b11b464395a9 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_2d.cl @@ -41,13 +41,7 @@ void remesh(uint i, float dx, float invdx, ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint__N__((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint__N__((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint__N__((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); w__NN__ = alpha(y.s__NN__); gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); @@ -72,7 +66,7 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w__NN__ = eta(y.s__NN__); gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); @@ -84,8 +78,9 @@ void remesh(uint i, float dx, float invdx, gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w__NN__ = theta(y.s__NN__); gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); @@ -97,7 +92,19 @@ void remesh(uint i, float dx, float invdx, gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w__NN__ = kappa(y.s__NN__); + gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); + gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w__NN__ = mu(y.s__NN__); + gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); + gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl b/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl index 4d84a687abfa8d56256d8b4b8080881951d8c417..e7e4b5a320de4b13d0c91b960cc5190ddd8acbdf 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/basic_vector_3d.cl @@ -41,13 +41,7 @@ void remesh(uint i, float dx, float invdx, ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint__N__((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint__N__((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint__N__((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); w__NN__ = alpha(y.s__NN__); gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); @@ -76,7 +70,7 @@ void remesh(uint i, float dx, float invdx, gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w__NN__ = eta(y.s__NN__); gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); @@ -90,8 +84,9 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w__NN__ = theta(y.s__NN__); gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); @@ -105,7 +100,21 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w__NN__ = kappa(y.s__NN__); + gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); + gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); + gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w__NN__ = mu(y.s__NN__); + gvec_X_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_X.s__NN__); + gvec_Y_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Y.s__NN__); + gvec_Z_loc[noBC_id(index.s__NN__)] += (w__NN__ * v_Z.s__NN__); + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private.cl b/HySoP/hysop/gpu/cl_src/remeshing/private.cl index d00d5eac8d64fe40e0bdc880dbda9f323d719a06..2e1bca5a472e6414cd82e47913c658346e5eab7e 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/private.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/private.cl @@ -36,13 +36,7 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint__N__((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint__N__((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint__N__((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); temp = alpha(y) * s; gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; @@ -63,7 +57,7 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; temp = eta(y) * s; gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; @@ -73,8 +67,9 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f temp = zeta(y) * s; gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; temp = theta(y) * s; gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; @@ -84,7 +79,17 @@ void remesh(uint i, float dx, float invdx, float__N__ s, float__N__ p, __local f temp = iota(y) * s; gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + temp = kappa(y) * s; + gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + temp = mu(y) * s; + gscal_loc[noBC_id(index.s__NN__)] += temp.s__NN__; + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl index 72131935b859858142d6d125d37b8ccd98f31c77..366163fa2983c54c7ad040ebf049f1b72617ee7a 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/private_vector_2d.cl @@ -42,13 +42,7 @@ void remesh(uint i, float dx, float invdx, ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint__N__((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint__N__((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint__N__((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); w = alpha(y); gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; @@ -73,7 +67,7 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w = eta(y); gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; @@ -85,8 +79,9 @@ void remesh(uint i, float dx, float invdx, gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w = theta(y); gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; @@ -98,7 +93,19 @@ void remesh(uint i, float dx, float invdx, gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w = kappa(y); + gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; + gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w = mu(y); + gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; + gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl b/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl index 64fd45c178e8724649179100b9c310582ed8cb20..0c2c7fd0dfa18f6fe0e725d591ac283dc357796a 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/private_vector_3d.cl @@ -42,13 +42,7 @@ void remesh(uint i, float dx, float invdx, ind = convert_int__N___rtn(p * invdx); y = (p - convert_float__N__(ind) * dx) * invdx; -#if FORMULA == M4PRIME - index = convert_uint__N__((ind - 1 + NB_I) % NB_I); -#elif FORMULA == M6PRIME - index = convert_uint__N__((ind - 2 + NB_I) % NB_I); -#elif FORMULA == M8PRIME || FORMULA == L6STAR - index = convert_uint__N__((ind - 3 + NB_I) % NB_I); -#endif + index = convert_uint__N__((ind - REMESH_SHIFT + NB_I) % NB_I); w = alpha(y); gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; @@ -77,7 +71,7 @@ void remesh(uint i, float dx, float invdx, gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); -#if FORMULA == M6PRIME || FORMULA == M8PRIME +#if REMESH_SHIFT > 1 index = (index + 1) % NB_I; w = eta(y); gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; @@ -91,8 +85,9 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); +#endif -#if FORMULA == M8PRIME || FORMULA == L6STAR +#if REMESH_SHIFT > 2 index = (index + 1) % NB_I; w = theta(y); gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; @@ -106,7 +101,21 @@ void remesh(uint i, float dx, float invdx, gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; barrier(CLK_LOCAL_MEM_FENCE); - #endif + +#if REMESH_SHIFT > 3 + index = (index + 1) % NB_I; + w = kappa(y); + gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; + gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; + gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; + barrier(CLK_LOCAL_MEM_FENCE); + + index = (index + 1) % NB_I; + w = mu(y); + gvec_X_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_X.s__NN__; + gvec_Y_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Y.s__NN__; + gvec_Z_loc[noBC_id(index.s__NN__)] += w.s__NN__ * v_Z.s__NN__; + barrier(CLK_LOCAL_MEM_FENCE); #endif } diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl index 1d3e5f219ba5f7f8f2295d4aa2e7c9779b3535ad..fec13ff858f4f197961650192da85b5b68a3a6bb 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights.cl @@ -16,20 +16,90 @@ inline float__N__ delta(float__N__ y){ return ((y * y * (y - 1.0)) / 2.0);} -#elif FORMULA == M6PRIME +#elif FORMULA == L2STAR_C2 inline float__N__ alpha(float__N__ y){ - return (y * (y * (y * (y * (13.0 - 5.0 * y) - 9.0) - 1.0) + 2.0) / 24.0);} + return ((y * (y * (y * (y * (2.0 * y - 5.0) + 3.0) + 1.0) - 1.0)) / 2.0);} inline float__N__ beta(float__N__ y){ - return (y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0) / 24.0);} + return ((y * y * (y * (y * (-6.0 * y + 15.0) - 9.0) - 2.0) + 2.0) / 2.0);} inline float__N__ gamma(float__N__ y){ - return (y * y * (y * (y * (126.0 - 50.0 * y) - 70.0) - 30.0) / 24.0 + 1.0);} + return ((y * (y * (y * (y * (6.0 * y - 15.0) + 9.0) + 1.0) + 1.0)) / 2.0);} inline float__N__ delta(float__N__ y){ - return (y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0) / 24.0);} + return ((y * y * y * (y * (-2.0 * y + 5.0) - 3.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C3 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * y * (y * (y * (-6.0 * y + 21.0) - 25.0) + 10.0) + 1.0) - 1.0)) / 2.0);} +inline float__N__ beta(float__N__ y){ + return ((y * y * (y * y * (y * (y * (18.0 * y - 63.0) + 75.0) - 30.0) - 2.0) + 2.0) / 2.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (y * y * (y * (y * (-18.0 * y + 63.0) - 75.0) + 30.0) + 1.0) + 1.0)) / 2.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * y * y * (y * (y * (6.0 * y - 21.0) + 25.0) - 10.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C4 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * y * y * (y * (y * (y * (20.0 * y - 90.0) + 154.0) - 119.0) + 35.0) + 1.0) - 1.0)) / 2.0);} +inline float__N__ beta(float__N__ y){ + return ((y * y * (y * y * y * (y * (y * (y * (-60.0 * y + 270.0) - 462.0) + 357.0) - 105.0) - 2.0) + 2.0) / 2.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (y * y * y * (y * (y * (y * (60.0 * y - 270.0) + 462.0) - 357.0) + 105.0) + 1.0) + 1.0)) / 2.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * y * y * y * (y * (y * (y * (-20.0 * y + 90.0) - 154.0) + 119.0) - 35.0)) / 2.0);} + + + +#elif FORMULA == L4STAR + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) / 24.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) / 24.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) / 24.0);} +inline float__N__ delta(float__N__ y){ + return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) / 24.0);} inline float__N__ eta(float__N__ y){ - return (y * (y * (y * (y * (61.0 - 25.0 * y) - 33.0) - 1.0) - 2.0) / 24.0);} + return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) / 24.0);} inline float__N__ zeta(float__N__ y){ - return (y * y * y * (y * (5.0 * y - 12.0) + 7.0) / 24.0);} + return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C3 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (14.0 * y - 49.0) + 58.0) - 22.0) - 2.0) - 1.0) + 2.0)) / 24.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (-70.0 * y + 245.0) - 290.0) + 111.0) + 4.0) + 16.0) - 16.0)) / 24.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * y * (y * y * (y * (y * (140.0 * y - 490.0) + 580.0) - 224.0) - 30.0) + 24.0) / 24.0);} +inline float__N__ delta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (-140.0 * y + 490.0) - 580.0) + 226.0) - 4.0) + 16.0) + 16.0)) / 24.0);} +inline float__N__ eta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (70.0 * y - 245.0) + 290.0) - 114.0) + 2.0) - 1.0) - 2.0)) / 24.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * y * y * y * (y * (y * (-14.0 * y + 49.0) - 58.0) + 23.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C4 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) / 24.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) / 24.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) / 24.0);} +inline float__N__ delta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) / 24.0);} +inline float__N__ eta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) / 24.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) / 24.0);} + #elif FORMULA == M8PRIME @@ -55,19 +125,107 @@ inline float__N__ iota(float__N__ y){ #elif FORMULA == L6STAR inline float__N__ alpha(float__N__ y){ - return (((-12.0 + (4.0 + (15.0 + (140.0 + (-370.0 + (312.0 - 89.0 * y) * y) * y) * y) * y) * y) * y) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-89.0 * y + 312.0) - 370.0) + 140.0) + 15.0) + 4.0) - 12.0)) / 720.0);} inline float__N__ beta(float__N__ y){ - return (((108.0 + (-54.0 + (-120.0 + (-955.0 + (2581.0 + (-2183.0 + 623.0 * y) * y) * y) * y) * y) * y ) * y) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (623.0 * y - 2183.0) + 2581.0) - 955.0) - 120.0) - 54.0) + 108.0)) / 720.0);} inline float__N__ gamma(float__N__ y){ - return (((-180.0 + (180.0 + (65.0 + (950.0 + (-2574.0 + (2182.0 - 623.0 * y) * y) * y) * y) * y) * y) * y) / 240.0);} + return ((y * (y * (y * (y * (y * (y * (-1869.0 * y + 6546.0) - 7722.0) + 2850.0) + 195.0) + 540.0) - 540.0)) / 720.0);} inline float__N__ delta(float__N__ y){ - return ( 1.0 + ((-196.0 + (-959.0 + (2569.0 + (-2181.0 + 623.0 * y) * y) * y) * y * y ) * y * y) / 144.0);} + return ((y * y * (y * y * (y * (y * (3115.0 * y - 10905.0) + 12845.0) - 4795.0) - 980.0) + 720.0) / 720.0);} inline float__N__ eta(float__N__ y){ - return (((108.0 + (108.0 + (-39.0 + (976.0 + (-2566.0 + (2180.0 - 623.0 * y) * y) * y) * y) * y) * y) * y) / 144.0);} + return ((y * (y * (y * (y * (y * (y * (-3115.0 * y + 10900.0) - 12830.0) + 4880.0) - 195.0) + 540.0) + 540.0)) / 720.0);} inline float__N__ zeta(float__N__ y){ - return (((-36.0 + (-18.0 + (40.0 + (-995.0 + (2565.0 + (-2179.0 + 623.0 * y) * y) * y) * y) * y) * y) * y) / 240.0);} + return ((y * (y * (y * (y * (y * (y * (1869.0 * y - 6537.0) + 7695.0) - 2985.0) + 120.0) - 54.0) - 108.0)) / 720.0);} inline float__N__ theta(float__N__ y){ - return (((12.0 + (4.0 + (-15.0 + (1010.0 + (-2566.0 + (2178.0 - 623.0 * y) * y) * y) * y) * y) * y) * y) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (-623.0 * y + 2178.0) - 2566.0) + 1010.0) - 15.0) + 4.0) + 12.0)) / 720.0);} inline float__N__ iota(float__N__ y){ - return (((-145.0 + (367.0 + (-311.0 + 89.0 * y) * y) * y) * y * y * y * y) / 720.0);} + return ((y * y * y * y * (y * (y * (89.0 * y - 311.0) + 367.0) - 145.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C4 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (290.0 * y - 1305.0) + 2231.0) - 1718.0) + 500.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-2030.0 * y + 9135.0) - 15617.0) + 12027.0) - 3509.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (6090.0 * y - 27405.0) + 46851.0) - 36084.0) + 10548.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * (y * y * (y * (y * (y * (y * (-10150.0 * y + 45675.0) - 78085.0) + 60145.0) - 17605.0) + 280.0) - 980.0) + 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (10150.0 * y - 45675.0) + 78085.0) - 60150.0) + 17620.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-6090.0 * y + 27405.0) - 46851.0) + 36093.0) - 10575.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (2030.0 * y - 9135.0) + 15617.0) - 12032.0) + 3524.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * y * (y * (y * (y * (-290.0 * y + 1305.0) - 2231.0) + 1719.0) - 503.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C5 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-1006.0 * y + 5533.0) - 12285.0) + 13785.0) - 7829.0) + 1803.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (7042.0 * y - 38731.0) + 85995.0) - 96495.0) + 54803.0) - 12620.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-21126.0 * y + 116193.0) - 257985.0) + 289485.0) - 164409.0) + 37857.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (35210.0 * y - 193655.0) + 429975.0) - 482475.0) + 274015.0) - 63090.0) + 280.0) - 980.0) + 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-35210.0 * y + 193655.0) - 429975.0) + 482475.0) - 274015.0) + 63085.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (21126.0 * y - 116193.0) + 257985.0) - 289485.0) + 164409.0) - 37848.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-7042.0 * y + 38731.0) - 85995.0) + 96495.0) - 54803.0) + 12615.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * y * y * (y * (y * (y * (y * (1006.0 * y - 5533.0) + 12285.0) - 13785.0) + 7829.0) - 1802.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C6 + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (3604.0 * y - 23426.0) + 63866.0) - 93577.0) + 77815.0) - 34869.0) + 6587.0) + 1.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-25228.0 * y + 163982.0) - 447062.0) + 655039.0) - 544705.0) + 244083.0) - 46109.0) - 6.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (75684.0 * y - 491946.0) + 1341186.0) - 1965117.0) + 1634115.0) - 732249.0) + 138327.0) + 15.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (y * (y * (-126140.0 * y + 819910.0) - 2235310.0) + 3275195.0) - 2723525.0) + 1220415.0) - 230545.0) - 20.0) + 280.0) - 980.0) + 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (126140.0 * y - 819910.0) + 2235310.0) - 3275195.0) + 2723525.0) - 1220415.0) + 230545.0) + 15.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-75684.0 * y + 491946.0) - 1341186.0) + 1965117.0) - 1634115.0) + 732249.0) - 138327.0) - 6.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (25228.0 * y - 163982.0) + 447062.0) - 655039.0) + 544705.0) - 244083.0) + 46109.0) + 1.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * y * y * y * (y * (y * (y * (y * (y * (-3604.0 * y + 23426.0) - 63866.0) + 93577.0) - 77815.0) + 34869.0) - 6587.0)) / 720.0);} + + + +#elif FORMULA == L8STAR + +inline float__N__ alpha(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-3569.0 * y + 16061.0) - 27454.0) + 21126.0) - 6125.0) + 49.0) - 196.0) - 36.0) + 144.0)) / 40320.0);} +inline float__N__ beta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (32121.0 * y - 144548.0) + 247074.0) - 190092.0) + 55125.0) - 672.0) + 2016.0) + 512.0) - 1536.0)) / 40320.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-128484.0 * y + 578188.0) - 988256.0) + 760312.0) - 221060.0) + 4732.0) - 9464.0) - 4032.0) + 8064.0)) / 40320.0);} +inline float__N__ delta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (299796.0 * y - 1349096.0) + 2305856.0) - 1774136.0) + 517580.0) - 13664.0) + 13664.0) + 32256.0) - 32256.0)) / 40320.0);} +inline float__N__ eta(float__N__ y){ + return ((y * y * (y * y * (y * (y * (y * (y * (-449694.0 * y + 2023630.0) - 3458700.0) + 2661540.0) - 778806.0) + 19110.0) - 57400.0) + 40320.0) / 40320.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (449694.0 * y - 2023616.0) + 3458644.0) - 2662016.0) + 780430.0) - 13664.0) - 13664.0) + 32256.0) + 32256.0)) / 40320.0);} +inline float__N__ theta(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-299796.0 * y + 1349068.0) - 2305744.0) + 1775032.0) - 520660.0) + 4732.0) + 9464.0) - 4032.0) - 8064.0)) / 40320.0);} +inline float__N__ iota(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (128484.0 * y - 578168.0) + 988176.0) - 760872.0) + 223020.0) - 672.0) - 2016.0) + 512.0) + 1536.0)) / 40320.0);} +inline float__N__ kappa(float__N__ y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-32121.0 * y + 144541.0) - 247046.0) + 190246.0) - 55685.0) + 49.0) + 196.0) - 36.0) - 144.0)) / 40320.0);} +inline float__N__ mu(float__N__ y){ + return ((y * y * y * y * y * (y * (y * (y * (3569.0 * y - 16060.0) + 27450.0) - 21140.0) + 6181.0)) / 40320.0);} + + + #endif diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl index 38d792041b54671de9a163fb644646a32d2f5a88..910aafda309363be8718c965bde79b86fdc4057f 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_builtin.cl @@ -16,20 +16,88 @@ inline float__N__ delta(float__N__ y){ return ((y * y * fma(1.0, y, - 1.0)) / 2.0);} -#elif FORMULA == M6PRIME +#elif FORMULA == L2STAR_C2 inline float__N__ alpha(float__N__ y){ - return (y * fma(y , fma(y , fma(y , fma(-5.0, y, 13.0),- 9.0),- 1.0), 2.0) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 2.0, -5.0), 3.0), 1.0), -1.0)) / 2.0);} inline float__N__ beta(float__N__ y){ - return (y * fma(y , fma(y , fma(y , fma(25.0 , y ,- 64.0), 39.0) , 16.0), - 16.0) / 24.0);} + return (fma(y * y, fma(y, fma(y, fma(y, -6.0, 15.0), -9.0), -2.0), 2.0) / 2.0);} inline float__N__ gamma(float__N__ y){ - return (y * y * fma(y , fma(y , fma( - 50.0 , y, 126.0) ,- 70.0) ,- 30.0) / 24.0 + 1.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, 6.0, -15.0), 9.0), 1.0), 1.0)) / 2.0);} inline float__N__ delta(float__N__ y){ - return (y * fma(y , fma(y, fma(y, fma(50.0, y, - 124.0), 66.0) , 16.0) , 16.0) / 24.0);} + return ((y * y * y * fma(y, fma(y, -2.0, 5.0), -3.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C3 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -6.0, 21.0), -25.0), 10.0), 1.0), -1.0)) / 2.0);} +inline float__N__ beta(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 18.0, -63.0), 75.0), -30.0), -2.0), 2.0) / 2.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -18.0, 63.0), -75.0), 30.0), 1.0), 1.0)) / 2.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * y * y * fma(y, fma(y, fma(y, 6.0, -21.0), 25.0), -10.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C4 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 20.0, -90.0), 154.0), -119.0), 35.0), 1.0), -1.0)) / 2.0);} +inline float__N__ beta(float__N__ y){ + return (fma(y * y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, -60.0, 270.0), -462.0), 357.0), -105.0), -2.0), 2.0) / 2.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 60.0, -270.0), 462.0), -357.0), 105.0), 1.0), 1.0)) / 2.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -20.0, 90.0), -154.0), 119.0), -35.0)) / 2.0);} + + +#elif FORMULA == L4STAR + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, -5.0, 13.0), -9.0), -1.0), 2.0)) / 24.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, 25.0, -64.0), 39.0), 16.0), -16.0)) / 24.0);} +inline float__N__ gamma(float__N__ y){ + return (fma(y * y, fma(y, fma(y, fma(y, -50.0, 126.0), -70.0), -30.0), 24.0) / 24.0);} +inline float__N__ delta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, 50.0, -124.0), 66.0), 16.0), 16.0)) / 24.0);} inline float__N__ eta(float__N__ y){ - return (y * fma(y , fma(y , fma(y , fma(- 25.0 , y, 61.0), - 33.0), - 1.0), - 2.0) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, -25.0, 61.0), -33.0), -1.0), -2.0)) / 24.0);} inline float__N__ zeta(float__N__ y){ - return (y * y * y * fma(y , fma(5.0 , y ,- 12.0) , 7.0) / 24.0);} + return ((y * y * y * fma(y, fma(y, 5.0, -12.0), 7.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C3 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 14.0, -49.0), 58.0), -22.0), -2.0), -1.0), 2.0)) / 24.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -70.0, 245.0), -290.0), 111.0), 4.0), 16.0), -16.0)) / 24.0);} +inline float__N__ gamma(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 140.0, -490.0), 580.0), -224.0), -30.0), 24.0) / 24.0);} +inline float__N__ delta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -140.0, 490.0), -580.0), 226.0), -4.0), 16.0), 16.0)) / 24.0);} +inline float__N__ eta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 70.0, -245.0), 290.0), -114.0), 2.0), -1.0), -2.0)) / 24.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * y * y * y * fma(y, fma(y, fma(y, -14.0, 49.0), -58.0), 23.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C4 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -46.0, 207.0), -354.0), 273.0), -80.0), 1.0), -2.0), -1.0), 2.0)) / 24.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 230.0, -1035.0), 1770.0), -1365.0), 400.0), -4.0), 4.0), 16.0), -16.0)) / 24.0);} +inline float__N__ gamma(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -460.0, 2070.0), -3540.0), 2730.0), -800.0), 6.0), -30.0), 24.0) / 24.0);} +inline float__N__ delta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 460.0, -2070.0), 3540.0), -2730.0), 800.0), -4.0), -4.0), 16.0), 16.0)) / 24.0);} +inline float__N__ eta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -230.0, 1035.0), -1770.0), 1365.0), -400.0), 1.0), 2.0), -1.0), -2.0)) / 24.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 46.0, -207.0), 354.0), -273.0), 80.0)) / 24.0);} #elif FORMULA == M8PRIME @@ -55,19 +123,106 @@ inline float__N__ iota(float__N__ y){ #elif FORMULA == L6STAR inline float__N__ alpha(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-89.0, y, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C4 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 290.0, -1305.0), 2231.0), -1718.0), 500.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -2030.0, 9135.0), -15617.0), 12027.0), -3509.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 6090.0, -27405.0), 46851.0), -36084.0), 10548.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -10150.0, 45675.0), -78085.0), 60145.0), -17605.0), 280.0), -980.0), 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 10150.0, -45675.0), 78085.0), -60150.0), 17620.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -6090.0, 27405.0), -46851.0), 36093.0), -10575.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 2030.0, -9135.0), 15617.0), -12032.0), 3524.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -290.0, 1305.0), -2231.0), 1719.0), -503.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C5 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1006.0, 5533.0), -12285.0), 13785.0), -7829.0), 1803.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} inline float__N__ beta(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(623.0, y, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 7042.0, -38731.0), 85995.0), -96495.0), 54803.0), -12620.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} inline float__N__ gamma(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-623.0, y, 2182.0), -2574.0), 950.0), 65.0), 180.0), -180.0)) / 240.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -21126.0, 116193.0), -257985.0), 289485.0), -164409.0), 37857.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} inline float__N__ delta(float__N__ y){ - return ((y * y * fma(y * y, fma(y, fma(y, fma(623.0, y, -2181.0), 2569.0), -959.0), -196.0)) / 144.0 + 1.0);} + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, 35210.0, -193655.0), 429975.0), -482475.0), 274015.0), -63090.0), 280.0), -980.0), 720.0) / 720.0);} inline float__N__ eta(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-623.0, y, 2180.0), -2566.0), 976.0), -39.0), 108.0), 108.0)) / 144.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -35210.0, 193655.0), -429975.0), 482475.0), -274015.0), 63085.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} inline float__N__ zeta(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(623.0, y, -2179.0), 2565.0), -995.0), 40.0), -18.0), -36.0)) / 240.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 21126.0, -116193.0), 257985.0), -289485.0), 164409.0), -37848.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} inline float__N__ theta(float__N__ y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-623.0, y, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -7042.0, 38731.0), -85995.0), 96495.0), -54803.0), 12615.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} inline float__N__ iota(float__N__ y){ - return ((y * y * y * y * fma(y, fma(y, fma(89.0, y, -311.0), 367.0), -145.0)) / 720.0);} + return ((y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, 1006.0, -5533.0), 12285.0), -13785.0), 7829.0), -1802.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C6 + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 3604.0, -23426.0), 63866.0), -93577.0), 77815.0), -34869.0), 6587.0), 1.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -25228.0, 163982.0), -447062.0), 655039.0), -544705.0), 244083.0), -46109.0), -6.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 75684.0, -491946.0), 1341186.0), -1965117.0), 1634115.0), -732249.0), 138327.0), 15.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -126140.0, 819910.0), -2235310.0), 3275195.0), -2723525.0), 1220415.0), -230545.0), -20.0), 280.0), -980.0), 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 126140.0, -819910.0), 2235310.0), -3275195.0), 2723525.0), -1220415.0), 230545.0), 15.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -75684.0, 491946.0), -1341186.0), 1965117.0), -1634115.0), 732249.0), -138327.0), -6.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 25228.0, -163982.0), 447062.0), -655039.0), 544705.0), -244083.0), 46109.0), 1.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3604.0, 23426.0), -63866.0), 93577.0), -77815.0), 34869.0), -6587.0)) / 720.0);} + + + +#elif FORMULA == L8STAR + +inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3569.0, 16061.0), -27454.0), 21126.0), -6125.0), 49.0), -196.0), -36.0), 144.0)) / 40320.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 32121.0, -144548.0), 247074.0), -190092.0), 55125.0), -672.0), 2016.0), 512.0), -1536.0)) / 40320.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -128484.0, 578188.0), -988256.0), 760312.0), -221060.0), 4732.0), -9464.0), -4032.0), 8064.0)) / 40320.0);} +inline float__N__ delta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 299796.0, -1349096.0), 2305856.0), -1774136.0), 517580.0), -13664.0), 13664.0), 32256.0), -32256.0)) / 40320.0);} +inline float__N__ eta(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -449694.0, 2023630.0), -3458700.0), 2661540.0), -778806.0), 19110.0), -57400.0), 40320.0) / 40320.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 449694.0, -2023616.0), 3458644.0), -2662016.0), 780430.0), -13664.0), -13664.0), 32256.0), 32256.0)) / 40320.0);} +inline float__N__ theta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -299796.0, 1349068.0), -2305744.0), 1775032.0), -520660.0), 4732.0), 9464.0), -4032.0), -8064.0)) / 40320.0);} +inline float__N__ iota(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 128484.0, -578168.0), 988176.0), -760872.0), 223020.0), -672.0), -2016.0), 512.0), 1536.0)) / 40320.0);} +inline float__N__ kappa(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -32121.0, 144541.0), -247046.0), 190246.0), -55685.0), 49.0), 196.0), -36.0), -144.0)) / 40320.0);} +inline float__N__ mu(float__N__ y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 3569.0, -16060.0), 27450.0), -21140.0), 6181.0)) / 40320.0);} + + #endif diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl index 4e40adb5933c9d3efadd831fccfe9dd351548ab5..70782b26ad0474592d94f966a55e96e91995a029 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec.cl @@ -16,68 +16,90 @@ inline float delta(float y){ return ((y * y * (y - 1.0)) / 2.0);} -#elif FORMULA == M6PRIME +#elif FORMULA == L2STAR_C2 inline float alpha(float y){ - return (y * (y * (y * (y * (13.0 - 5.0 * y) - 9.0) - 1.0) + 2.0) / 24.0);} + return ((y * (y * (y * (y * (2.0 * y - 5.0) + 3.0) + 1.0) - 1.0)) / 2.0);} inline float beta(float y){ - return (y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0) / 24.0);} + return ((y * y * (y * (y * (-6.0 * y + 15.0) - 9.0) - 2.0) + 2.0) / 2.0);} inline float gamma(float y){ - return (y * y * (y * (y * (126.0 - 50.0 * y) - 70.0) - 30.0) / 24.0 + 1.0);} + return ((y * (y * (y * (y * (6.0 * y - 15.0) + 9.0) + 1.0) + 1.0)) / 2.0);} inline float delta(float y){ - return (y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0) / 24.0);} -inline float eta(float y){ - return (y * (y * (y * (y * (61.0 - 25.0 * y) - 33.0) - 1.0) - 2.0) / 24.0);} -inline float zeta(float y){ - return (y * y * y * (y * (5.0 * y - 12.0) + 7.0) / 24.0);} + return ((y * y * y * (y * (-2.0 * y + 5.0) - 3.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C3 + +inline float alpha(float y){ + return ((y * (y * (y * y * (y * (y * (-6.0 * y + 21.0) - 25.0) + 10.0) + 1.0) - 1.0)) / 2.0);} +inline float beta(float y){ + return ((y * y * (y * y * (y * (y * (18.0 * y - 63.0) + 75.0) - 30.0) - 2.0) + 2.0) / 2.0);} +inline float gamma(float y){ + return ((y * (y * (y * y * (y * (y * (-18.0 * y + 63.0) - 75.0) + 30.0) + 1.0) + 1.0)) / 2.0);} +inline float delta(float y){ + return ((y * y * y * y * (y * (y * (6.0 * y - 21.0) + 25.0) - 10.0)) / 2.0);} -#elif FORMULA == L4STAR -/* L4 star C2*/ +#elif FORMULA == L2STAR_C4 inline float alpha(float y){ - return ((2.0+(-1.0+(-9.0+(13.0-5.0*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * y * y * (y * (y * (y * (20.0 * y - 90.0) + 154.0) - 119.0) + 35.0) + 1.0) - 1.0)) / 2.0);} inline float beta(float y){ - return ((-16.0+(16.0+(39.0+(-64.0+25.0*y)*y)*y)*y)*y / 24.0);} + return ((y * y * (y * y * y * (y * (y * (y * (-60.0 * y + 270.0) - 462.0) + 357.0) - 105.0) - 2.0) + 2.0) / 2.0);} inline float gamma(float y){ - return ((-30.0+(-70.0+(126.0-50.0*y)*y)*y)*y*y / 24.0 + 1.0);} + return ((y * (y * (y * y * y * (y * (y * (y * (60.0 * y - 270.0) + 462.0) - 357.0) + 105.0) + 1.0) + 1.0)) / 2.0);} inline float delta(float y){ - return ((16.0+(16.0+(66.0+(-124.0+50.0*y)*y)*y)*y)*y / 24.0);} + return ((y * y * y * y * y * (y * (y * (y * (-20.0 * y + 90.0) - 154.0) + 119.0) - 35.0)) / 2.0);} + + + +#elif FORMULA == L4STAR + +inline float alpha(float y){ + return ((y * (y * (y * (y * (-5.0 * y + 13.0) - 9.0) - 1.0) + 2.0)) / 24.0);} +inline float beta(float y){ + return ((y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0)) / 24.0);} +inline float gamma(float y){ + return ((y * y * (y * (y * (-50.0 * y + 126.0) - 70.0) - 30.0) + 24.0) / 24.0);} +inline float delta(float y){ + return ((y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0)) / 24.0);} inline float eta(float y){ - return ((-2.0+(-1.0+(-33.0+(61.0-25.0*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (-25.0 * y + 61.0) - 33.0) - 1.0) - 2.0)) / 24.0);} inline float zeta(float y){ -return ((7.0+(-12.0+5.0*y)*y)*y*y*y / 24.0);} + return ((y * y * y * (y * (5.0 * y - 12.0) + 7.0)) / 24.0);} + +#elif FORMULA == L4STAR_C3 -/* L4 star C3*/ -/* inline float alpha(float y){ - return ((2.0+(-1.0+(-2.0+(-22.0+(58.0+(-49.0+14.0*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (14.0 * y - 49.0) + 58.0) - 22.0) - 2.0) - 1.0) + 2.0)) / 24.0);} inline float beta(float y){ - return ((-16.0+(16.0+(4.0+(111.0+(-290.0+(245.0-70.0*y)*y)*y)*y)*y)*y)*y/ 24.0);} + return ((y * (y * (y * (y * (y * (y * (-70.0 * y + 245.0) - 290.0) + 111.0) + 4.0) + 16.0) - 16.0)) / 24.0);} inline float gamma(float y){ - return ((-30.0+(-224.0+(580.0+(-490.0+140.0*y)*y)*y)*y*y)*y*y / 24.0 + 1.0);} + return ((y * y * (y * y * (y * (y * (140.0 * y - 490.0) + 580.0) - 224.0) - 30.0) + 24.0) / 24.0);} inline float delta(float y){ - return ((16.0+(16.0+(-4.0+(226.0+(-580.0+(490.0-140.0*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (-140.0 * y + 490.0) - 580.0) + 226.0) - 4.0) + 16.0) + 16.0)) / 24.0);} inline float eta(float y){ - return ((-2.0+(-1.0+(2.0+(-114.0+(290.0+(-245.0+70.0*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (70.0 * y - 245.0) + 290.0) - 114.0) + 2.0) - 1.0) - 2.0)) / 24.0);} inline float zeta(float y){ -return ((23.0+(-58.0+(49.0-14.0*y)*y)*y)*y*y*y*y / 24.0);}*/ + return ((y * y * y * y * (y * (y * (-14.0 * y + 49.0) - 58.0) + 23.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C4 -/* L4 star C4*/ -/* inline float alpha(float y){ - return ((2.0+(-1.0+(-2.0+(1.0+(-80.0+(273.0+(-354.0+(207.0-46.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-46.0 * y + 207.0) - 354.0) + 273.0) - 80.0) + 1.0) - 2.0) - 1.0) + 2.0)) / 24.0);} inline float beta(float y){ - return ((-16.0+(16.0+(4.0+(-4.0+(400.0+(-1365.0+(1770.0+(-1035.0+230.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (230.0 * y - 1035.0) + 1770.0) - 1365.0) + 400.0) - 4.0) + 4.0) + 16.0) - 16.0)) / 24.0);} inline float gamma(float y){ - return ((-30.0+(6.0+(-800.0+(2730.0+(-3540.0+(2070.0-460.0*y)*y)*y)*y)*y)*y*y)*y*y / 24.0 + 1.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-460.0 * y + 2070.0) - 3540.0) + 2730.0) - 800.0) + 6.0) - 30.0) + 24.0) / 24.0);} inline float delta(float y){ - return ((16.0+(16.0+(-4.0+(-4.0+(800.0+(-2730.0+(3540.0+(-2070.0+460.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (460.0 * y - 2070.0) + 3540.0) - 2730.0) + 800.0) - 4.0) - 4.0) + 16.0) + 16.0)) / 24.0);} inline float eta(float y){ - return ((-2.0+(-1.0+(2.0+(1.0+(-400.0+(1365.0+(-1770.0+(1035.0-230.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 24.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-230.0 * y + 1035.0) - 1770.0) + 1365.0) - 400.0) + 1.0) + 2.0) - 1.0) - 2.0)) / 24.0);} inline float zeta(float y){ -return ((80.0+(-273.0+(354.0+(-207.0+46.0*y)*y)*y)*y)*y*y*y*y*y / 24.0);}*/ + return ((y * y * y * y * y * (y * (y * (y * (46.0 * y - 207.0) + 354.0) - 273.0) + 80.0)) / 24.0);} + #elif FORMULA == M8PRIME @@ -101,83 +123,109 @@ inline float iota(float y){ #elif FORMULA == L6STAR -/*L6 star C3*/ -/* + +inline float alpha(float y){ + return ((y * (y * (y * (y * (y * (y * (-89.0 * y + 312.0) - 370.0) + 140.0) + 15.0) + 4.0) - 12.0)) / 720.0);} +inline float beta(float y){ + return ((y * (y * (y * (y * (y * (y * (623.0 * y - 2183.0) + 2581.0) - 955.0) - 120.0) - 54.0) + 108.0)) / 720.0);} +inline float gamma(float y){ + return ((y * (y * (y * (y * (y * (y * (-1869.0 * y + 6546.0) - 7722.0) + 2850.0) + 195.0) + 540.0) - 540.0)) / 720.0);} +inline float delta(float y){ + return ((y * y * (y * y * (y * (y * (3115.0 * y - 10905.0) + 12845.0) - 4795.0) - 980.0) + 720.0) / 720.0);} +inline float eta(float y){ + return ((y * (y * (y * (y * (y * (y * (-3115.0 * y + 10900.0) - 12830.0) + 4880.0) - 195.0) + 540.0) + 540.0)) / 720.0);} +inline float zeta(float y){ + return ((y * (y * (y * (y * (y * (y * (1869.0 * y - 6537.0) + 7695.0) - 2985.0) + 120.0) - 54.0) - 108.0)) / 720.0);} +inline float theta(float y){ + return ((y * (y * (y * (y * (y * (y * (-623.0 * y + 2178.0) - 2566.0) + 1010.0) - 15.0) + 4.0) + 12.0)) / 720.0);} +inline float iota(float y){ + return ((y * y * y * y * (y * (y * (89.0 * y - 311.0) + 367.0) - 145.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C4 + inline float alpha(float y){ - return (((-12.0 + (4.0 + (15.0 + (140.0 + (-370.0 + (312.0 - 89.0 * y) * y) * y) * y) * y) * y) * y) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (290.0 * y - 1305.0) + 2231.0) - 1718.0) + 500.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} inline float beta(float y){ - return (((108.0 + (-54.0 + (-120.0 + (-955.0 + (2581.0 + (-2183.0 + 623.0 * y) * y) * y) * y) * y) * y ) * y) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-2030.0 * y + 9135.0) - 15617.0) + 12027.0) - 3509.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} inline float gamma(float y){ - return (((-180.0 + (180.0 + (65.0 + (950.0 + (-2574.0 + (2182.0 - 623.0 * y) * y) * y) * y) * y) * y) * y) / 240.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (6090.0 * y - 27405.0) + 46851.0) - 36084.0) + 10548.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} inline float delta(float y){ - return ( 1.0 + ((-196.0 + (-959.0 + (2569.0 + (-2181.0 + 623.0 * y) * y) * y) * y * y ) * y * y) / 144.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-10150.0 * y + 45675.0) - 78085.0) + 60145.0) - 17605.0) + 280.0) - 980.0) + 720.0) / 720.0);} inline float eta(float y){ - return (((108.0 + (108.0 + (-39.0 + (976.0 + (-2566.0 + (2180.0 - 623.0 * y) * y) * y) * y) * y) * y) * y) / 144.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (10150.0 * y - 45675.0) + 78085.0) - 60150.0) + 17620.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} inline float zeta(float y){ - return (((-36.0 + (-18.0 + (40.0 + (-995.0 + (2565.0 + (-2179.0 + 623.0 * y) * y) * y) * y) * y) * y) * y) / 240.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-6090.0 * y + 27405.0) - 46851.0) + 36093.0) - 10575.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} inline float theta(float y){ - return (((12.0 + (4.0 + (-15.0 + (1010.0 + (-2566.0 + (2178.0 - 623.0 * y) * y) * y) * y) * y) * y) * y) / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (2030.0 * y - 9135.0) + 15617.0) - 12032.0) + 3524.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} inline float iota(float y){ -return (((-145.0 + (367.0 + (-311.0 + 89.0 * y) * y) * y) * y * y * y * y) / 720.0);}*/ + return ((y * y * y * y * y * (y * (y * (y * (-290.0 * y + 1305.0) - 2231.0) + 1719.0) - 503.0)) / 720.0);} + + +#elif FORMULA == L6STAR_C5 -/*L6 star C4*/ -/* inline float alpha(float y){ - return ((-12.0+(4.0+(15.0+(-5.0+(500.0+(-1718.0+(2231.0+(-1305.0+290.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-1006.0 * y + 5533.0) - 12285.0) + 13785.0) - 7829.0) + 1803.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} inline float beta(float y){ - return ((108.0+(-54.0+(-120.0+(60.0+(-3509.0+(12027.0+(-15617.0+(9135.0-2030.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (7042.0 * y - 38731.0) + 85995.0) - 96495.0) + 54803.0) - 12620.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} inline float gamma(float y){ - return ((-540.0+(540.0+(195.0+(-195.0+(10548.0+(-36084.0+(46851.0+(-27405.0+6090.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-21126.0 * y + 116193.0) - 257985.0) + 289485.0) - 164409.0) + 37857.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} inline float delta(float y){ - return ( 1.0 + (-980.0+(280.0+(-17605.0+(60145.0+(-78085.0+(45675.0-10150.0*y)*y)*y)*y)*y)*y*y)*y*y / 720.0);} + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (35210.0 * y - 193655.0) + 429975.0) - 482475.0) + 274015.0) - 63090.0) + 280.0) - 980.0) + 720.0) / 720.0);} inline float eta(float y){ - return ((540.0+(540.0+(-195.0+(-195.0+(17620.0+(-60150.0+(78085.0+(-45675.0+10150.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-35210.0 * y + 193655.0) - 429975.0) + 482475.0) - 274015.0) + 63085.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} inline float zeta(float y){ - return ((-108.0+(-54.0+(120.0+(60.0+(-10575.0+(36093.0+(-46851.0+(27405.0-6090.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (21126.0 * y - 116193.0) + 257985.0) - 289485.0) + 164409.0) - 37848.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} inline float theta(float y){ - return ((12.0+(4.0+(-15.0+(-5.0+(3524.0+(-12032.0+(15617.0+(-9135.0+2030.0*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-7042.0 * y + 38731.0) - 85995.0) + 96495.0) - 54803.0) + 12615.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} inline float iota(float y){ -return ((-503.0+(1719.0+(-2231.0+(1305.0-290.0*y)*y)*y)*y)*y*y*y*y*y / 720.0);}*/ + return ((y * y * y * y * y * y * (y * (y * (y * (y * (1006.0 * y - 5533.0) + 12285.0) - 13785.0) + 7829.0) - 1802.0)) / 720.0);} + +#elif FORMULA == L6STAR_C6 -/*L6 star C5*/ -/* inline float alpha(float y){ - return ((-12.0+(4.0+(15.0+(-5.0+(-3.0+(1803.0+(-7829.0+(13785.0+(-12285.0+(5533.0-1006.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (3604.0 * y - 23426.0) + 63866.0) - 93577.0) + 77815.0) - 34869.0) + 6587.0) + 1.0) - 3.0) - 5.0) + 15.0) + 4.0) - 12.0)) / 720.0);} inline float beta(float y){ - return ((108.0+(-54.0+(-120.0+(60.0+(12.0+(-12620.0+(54803.0+(-96495.0+(85995.0+(-38731.0+7042.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-25228.0 * y + 163982.0) - 447062.0) + 655039.0) - 544705.0) + 244083.0) - 46109.0) - 6.0) + 12.0) + 60.0) - 120.0) - 54.0) + 108.0)) / 720.0);} inline float gamma(float y){ - return ((-540.0+(540.0+(195.0+(-195.0+(-15.0+(37857.0+(-164409.0+(289485.0+(-257985.0+(116193.0-21126.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (75684.0 * y - 491946.0) + 1341186.0) - 1965117.0) + 1634115.0) - 732249.0) + 138327.0) + 15.0) - 15.0) - 195.0) + 195.0) + 540.0) - 540.0)) / 720.0);} inline float delta(float y){ - return ( 1.0 + (-980.0+(280.0+(-63090.0+(274015.0+(-482475.0+(429975.0+(-193655.0+35210.0*y)*y)*y)*y)*y)*y*y)*y*y)*y*y / 720.0);} + return ((y * y * (y * y * (y * y * (y * (y * (y * (y * (y * (y * (-126140.0 * y + 819910.0) - 2235310.0) + 3275195.0) - 2723525.0) + 1220415.0) - 230545.0) - 20.0) + 280.0) - 980.0) + 720.0) / 720.0);} inline float eta(float y){ - return ((540.0+(540.0+(-195.0+(-195.0+(15.0+(63085.0+(-274015.0+(482475.0+(-429975.0+(193655.0-35210.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (126140.0 * y - 819910.0) + 2235310.0) - 3275195.0) + 2723525.0) - 1220415.0) + 230545.0) + 15.0) + 15.0) - 195.0) - 195.0) + 540.0) + 540.0)) / 720.0);} inline float zeta(float y){ - return ((-108.0+(-54.0+(120.0+(60.0+(-12.0+(-37848.0+(164409.0+(-289485.0+(257985.0+(-116193.0+21126.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (-75684.0 * y + 491946.0) - 1341186.0) + 1965117.0) - 1634115.0) + 732249.0) - 138327.0) - 6.0) - 12.0) + 60.0) + 120.0) - 54.0) - 108.0)) / 720.0);} inline float theta(float y){ - return ((12.0+(4.0+(-15.0+(-5.0+(3.0+(12615.0+(-54803.0+(96495.0+(-85995.0+(38731.0-7042.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (y * (25228.0 * y - 163982.0) + 447062.0) - 655039.0) + 544705.0) - 244083.0) + 46109.0) + 1.0) + 3.0) - 5.0) - 15.0) + 4.0) + 12.0)) / 720.0);} inline float iota(float y){ -return ((-1802.0+(7829.0+(-13785.0+(12285.0+(-5533.0+1006.0*y)*y)*y)*y)*y)*y*y*y*y*y*y / 720.0);}*/ + return ((y * y * y * y * y * y * y * (y * (y * (y * (y * (y * (-3604.0 * y + 23426.0) - 63866.0) + 93577.0) - 77815.0) + 34869.0) - 6587.0)) / 720.0);} -/*L6 star C6*/ + +#elif FORMULA == L8STAR inline float alpha(float y){ - return ((-12.0+(4.0+(15.0+(-5.0+(-3.0+(1.0+(6587.0+(-34869.0+(77815.0+(-93577.0+(63866.0+(-23426.0+3604.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-3569.0 * y + 16061.0) - 27454.0) + 21126.0) - 6125.0) + 49.0) - 196.0) - 36.0) + 144.0)) / 40320.0);} inline float beta(float y){ - return ((108.0+(-54.0+(-120.0+(60.0+(12.0+(-6.0+(-46109.0+(244083.0+(-544705.0+(655039.0+(-447062.0+(163982.0-25228.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (32121.0 * y - 144548.0) + 247074.0) - 190092.0) + 55125.0) - 672.0) + 2016.0) + 512.0) - 1536.0)) / 40320.0);} inline float gamma(float y){ - return ((-540.0+(540.0+(195.0+(-195.0+(-15.0+(15.0+(138327.0+(-732249.0+(1634115.0+(-1965117.0+(1341186.0+(-491946.0+75684.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-128484.0 * y + 578188.0) - 988256.0) + 760312.0) - 221060.0) + 4732.0) - 9464.0) - 4032.0) + 8064.0)) / 40320.0);} inline float delta(float y){ - return ( 1.0 + (-980.0+(280.0+(-20.0+(-230545.0+(1220415.0+(-2723525.0+(3275195.0+(-2235310.0+(819910.0-126140.0*y)*y)*y)*y)*y)*y)*y)*y*y)*y*y)*y*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (299796.0 * y - 1349096.0) + 2305856.0) - 1774136.0) + 517580.0) - 13664.0) + 13664.0) + 32256.0) - 32256.0)) / 40320.0);} inline float eta(float y){ - return ((540.0+(540.0+(-195.0+(-195.0+(15.0+(15.0+(230545.0+(-1220415.0+(2723525.0+(-3275195.0+(2235310.0+(-819910.0+126140.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * y * (y * y * (y * (y * (y * (y * (-449694.0 * y + 2023630.0) - 3458700.0) + 2661540.0) - 778806.0) + 19110.0) - 57400.0) + 40320.0) / 40320.0);} inline float zeta(float y){ - return ((-108.0+(-54.0+(120.0+(60.0+(-12.0+(-6.0+(-138327.0+(732249.0+(-1634115.0+(1965117.0+(-1341186.0+(491946.0-75684.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (449694.0 * y - 2023616.0) + 3458644.0) - 2662016.0) + 780430.0) - 13664.0) - 13664.0) + 32256.0) + 32256.0)) / 40320.0);} inline float theta(float y){ - return ((12.0+(4.0+(-15.0+(-5.0+(3.0+(1.0+(46109.0+(-244083.0+(544705.0+(-655039.0+(447062.0+(-163982.0+25228.0*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y)*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (-299796.0 * y + 1349068.0) - 2305744.0) + 1775032.0) - 520660.0) + 4732.0) + 9464.0) - 4032.0) - 8064.0)) / 40320.0);} inline float iota(float y){ -return ((-6587.0+(34869.0+(-77815.0+(93577.0+(-63866.0+(23426.0-3604.0*y)*y)*y)*y)*y)*y)*y*y*y*y*y*y*y / 720.0);} + return ((y * (y * (y * (y * (y * (y * (y * (y * (128484.0 * y - 578168.0) + 988176.0) - 760872.0) + 223020.0) - 672.0) - 2016.0) + 512.0) + 1536.0)) / 40320.0);} +inline float kappa(float y){ + return ((y * (y * (y * (y * (y * (y * (y * (y * (-32121.0 * y + 144541.0) - 247046.0) + 190246.0) - 55685.0) + 49.0) + 196.0) - 36.0) - 144.0)) / 40320.0);} +inline float mu(float y){ + return ((y * y * y * y * y * (y * (y * (y * (3569.0 * y - 16060.0) + 27450.0) - 21140.0) + 6181.0)) / 40320.0);} + #endif diff --git a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl index ce6dc8532164bcae8a1e8e09b45e0e898516cf3f..69de73a5e041ff6031f79d44efd5b61afb4698a2 100644 --- a/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl +++ b/HySoP/hysop/gpu/cl_src/remeshing/weights_noVec_builtin.cl @@ -1,10 +1,11 @@ /** - * @file weights_noVec_builtin.cl - * Remeshing formulas, basic version, use of builtin OpenCL fma. + * @file weights_npVec_builtin.cl + * Remeshing formulas, vectorized version, use of builtin OpenCL fma. * Polynomials under Horner form. */ #if FORMULA == M4PRIME + inline float alpha(float y){ return (y*fma(y,fma(y,-1.0, 2.0), - 1.0)/2.0);} inline float beta(float y){ @@ -14,21 +15,93 @@ inline float gamma(float y){ inline float delta(float y){ return ((y * y * fma(1.0, y, - 1.0)) / 2.0);} -#elif FORMULA == M6PRIME + +#elif FORMULA == L2STAR_C2 + +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, 2.0, -5.0), 3.0), 1.0), -1.0)) / 2.0);} +inline float beta(float y){ + return (fma(y * y, fma(y, fma(y, fma(y, -6.0, 15.0), -9.0), -2.0), 2.0) / 2.0);} +inline float gamma(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, 6.0, -15.0), 9.0), 1.0), 1.0)) / 2.0);} +inline float delta(float y){ + return ((y * y * y * fma(y, fma(y, -2.0, 5.0), -3.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C3 + +inline float alpha(float y){ + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -6.0, 21.0), -25.0), 10.0), 1.0), -1.0)) / 2.0);} +inline float beta(float y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 18.0, -63.0), 75.0), -30.0), -2.0), 2.0) / 2.0);} +inline float gamma(float y){ + return ((y * fma(y, fma(y * y, fma(y, fma(y, fma(y, -18.0, 63.0), -75.0), 30.0), 1.0), 1.0)) / 2.0);} +inline float delta(float y){ + return ((y * y * y * y * fma(y, fma(y, fma(y, 6.0, -21.0), 25.0), -10.0)) / 2.0);} + + +#elif FORMULA == L2STAR_C4 + inline float alpha(float y){ - return (y * fma(y , fma(y , fma(y , fma(-5.0, y, 13.0),- 9.0),- 1.0), 2.0) / 24.0);} + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 20.0, -90.0), 154.0), -119.0), 35.0), 1.0), -1.0)) / 2.0);} inline float beta(float y){ - return (y * fma(y , fma(y , fma(y , fma(25.0 , y ,- 64.0), 39.0) , 16.0), - 16.0) / 24.0);} + return (fma(y * y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, -60.0, 270.0), -462.0), 357.0), -105.0), -2.0), 2.0) / 2.0);} inline float gamma(float y){ - return (y * y * fma(y , fma(y , fma( - 50.0 , y, 126.0) ,- 70.0) ,- 30.0) / 24.0 + 1.0);} + return ((y * fma(y, fma(y * y * y, fma(y, fma(y, fma(y, fma(y, 60.0, -270.0), 462.0), -357.0), 105.0), 1.0), 1.0)) / 2.0);} inline float delta(float y){ - return (y * fma(y , fma(y, fma(y, fma(50.0, y, - 124.0), 66.0) , 16.0) , 16.0) / 24.0);} + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -20.0, 90.0), -154.0), 119.0), -35.0)) / 2.0);} + + +#elif FORMULA == L4STAR + +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, -5.0, 13.0), -9.0), -1.0), 2.0)) / 24.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, 25.0, -64.0), 39.0), 16.0), -16.0)) / 24.0);} +inline float gamma(float y){ + return (fma(y * y, fma(y, fma(y, fma(y, -50.0, 126.0), -70.0), -30.0), 24.0) / 24.0);} +inline float delta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, 50.0, -124.0), 66.0), 16.0), 16.0)) / 24.0);} +inline float eta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, -25.0, 61.0), -33.0), -1.0), -2.0)) / 24.0);} +inline float zeta(float y){ + return ((y * y * y * fma(y, fma(y, 5.0, -12.0), 7.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C3 + +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 14.0, -49.0), 58.0), -22.0), -2.0), -1.0), 2.0)) / 24.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -70.0, 245.0), -290.0), 111.0), 4.0), 16.0), -16.0)) / 24.0);} +inline float gamma(float y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 140.0, -490.0), 580.0), -224.0), -30.0), 24.0) / 24.0);} +inline float delta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -140.0, 490.0), -580.0), 226.0), -4.0), 16.0), 16.0)) / 24.0);} inline float eta(float y){ - return (y * fma(y , fma(y , fma(y , fma(- 25.0 , y, 61.0), - 33.0), - 1.0), - 2.0) / 24.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 70.0, -245.0), 290.0), -114.0), 2.0), -1.0), -2.0)) / 24.0);} inline float zeta(float y){ - return (y * y * y * fma(y , fma(5.0 , y ,- 12.0) , 7.0) / 24.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, -14.0, 49.0), -58.0), 23.0)) / 24.0);} + + +#elif FORMULA == L4STAR_C4 + +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -46.0, 207.0), -354.0), 273.0), -80.0), 1.0), -2.0), -1.0), 2.0)) / 24.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 230.0, -1035.0), 1770.0), -1365.0), 400.0), -4.0), 4.0), 16.0), -16.0)) / 24.0);} +inline float gamma(float y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -460.0, 2070.0), -3540.0), 2730.0), -800.0), 6.0), -30.0), 24.0) / 24.0);} +inline float delta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 460.0, -2070.0), 3540.0), -2730.0), 800.0), -4.0), -4.0), 16.0), 16.0)) / 24.0);} +inline float eta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -230.0, 1035.0), -1770.0), 1365.0), -400.0), 1.0), 2.0), -1.0), -2.0)) / 24.0);} +inline float zeta(float y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 46.0, -207.0), 354.0), -273.0), 80.0)) / 24.0);} + #elif FORMULA == M8PRIME + inline float alpha(float y){ return (fma(y,fma(y,fma(y,fma(y,fma(y,fma(y,fma(-10.0,y, + 21.0), + 28.0), - 105.0), + 70.0), + 35.0), - 56.0), + 17.0) / 3360.0);} inline float beta(float y){ @@ -44,32 +117,112 @@ inline float zeta(float y){ inline float theta(float y){ return (fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(-70.0, y, 315.0), -280.0), -105.0), -70.0), 35.0), 56.0), 17.0) / 3360.0);} inline float iota(float y){ -return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0);} + return ((y * y * y * y * y * fma(y , fma(10.0 , y ,- 49.0) , 56.0)) / 3360.0);} + #elif FORMULA == L6STAR + inline float alpha(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-89.0, y, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} inline float beta(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(623.0, y, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} inline float gamma(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-623.0, y, 2182.0), -2574.0), 950.0), 65.0), 180.0), -180.0)) / 240.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) / 720.0);} inline float delta(float y){ - return ((y * y * fma(y * y, fma(y, fma(y, fma(623.0, y, -2181.0), 2569.0), -959.0), -196.0)) / 144.0 + 1.0);} + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) / 720.0);} inline float eta(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-623.0, y, 2180.0), -2566.0), 976.0), -39.0), 108.0), 108.0)) / 144.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) / 720.0);} inline float zeta(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(623.0, y, -2179.0), 2565.0), -995.0), 40.0), -18.0), -36.0)) / 240.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) / 720.0);} inline float theta(float y){ - return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(-623.0, y, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} inline float iota(float y){ - return ((y * y * y * y * fma(y, fma(y, fma(89.0, y, -311.0), 367.0), -145.0)) / 720.0);} + return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) / 720.0);} -#endif +#elif FORMULA == L6STAR_C4 +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 290.0, -1305.0), 2231.0), -1718.0), 500.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -2030.0, 9135.0), -15617.0), 12027.0), -3509.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float gamma(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 6090.0, -27405.0), 46851.0), -36084.0), 10548.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float delta(float y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -10150.0, 45675.0), -78085.0), 60145.0), -17605.0), 280.0), -980.0), 720.0) / 720.0);} +inline float eta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 10150.0, -45675.0), 78085.0), -60150.0), 17620.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float zeta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -6090.0, 27405.0), -46851.0), 36093.0), -10575.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float theta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 2030.0, -9135.0), 15617.0), -12032.0), 3524.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float iota(float y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, -290.0, 1305.0), -2231.0), 1719.0), -503.0)) / 720.0);} +#elif FORMULA == L6STAR_C5 +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1006.0, 5533.0), -12285.0), 13785.0), -7829.0), 1803.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 7042.0, -38731.0), 85995.0), -96495.0), 54803.0), -12620.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float gamma(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -21126.0, 116193.0), -257985.0), 289485.0), -164409.0), 37857.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float delta(float y){ + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, 35210.0, -193655.0), 429975.0), -482475.0), 274015.0), -63090.0), 280.0), -980.0), 720.0) / 720.0);} +inline float eta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -35210.0, 193655.0), -429975.0), 482475.0), -274015.0), 63085.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float zeta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 21126.0, -116193.0), 257985.0), -289485.0), 164409.0), -37848.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float theta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -7042.0, 38731.0), -85995.0), 96495.0), -54803.0), 12615.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float iota(float y){ + return ((y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, 1006.0, -5533.0), 12285.0), -13785.0), 7829.0), -1802.0)) / 720.0);} +#elif FORMULA == L6STAR_C6 +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 3604.0, -23426.0), 63866.0), -93577.0), 77815.0), -34869.0), 6587.0), 1.0), -3.0), -5.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -25228.0, 163982.0), -447062.0), 655039.0), -544705.0), 244083.0), -46109.0), -6.0), 12.0), 60.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float gamma(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 75684.0, -491946.0), 1341186.0), -1965117.0), 1634115.0), -732249.0), 138327.0), 15.0), -15.0), -195.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float delta(float y){ + return (fma(y * y, fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -126140.0, 819910.0), -2235310.0), 3275195.0), -2723525.0), 1220415.0), -230545.0), -20.0), 280.0), -980.0), 720.0) / 720.0);} +inline float eta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 126140.0, -819910.0), 2235310.0), -3275195.0), 2723525.0), -1220415.0), 230545.0), 15.0), 15.0), -195.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float zeta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -75684.0, 491946.0), -1341186.0), 1965117.0), -1634115.0), 732249.0), -138327.0), -6.0), -12.0), 60.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float theta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 25228.0, -163982.0), 447062.0), -655039.0), 544705.0), -244083.0), 46109.0), 1.0), 3.0), -5.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float iota(float y){ + return ((y * y * y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3604.0, 23426.0), -63866.0), 93577.0), -77815.0), 34869.0), -6587.0)) / 720.0);} + + + +#elif FORMULA == L8STAR + +inline float alpha(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3569.0, 16061.0), -27454.0), 21126.0), -6125.0), 49.0), -196.0), -36.0), 144.0)) / 40320.0);} +inline float beta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 32121.0, -144548.0), 247074.0), -190092.0), 55125.0), -672.0), 2016.0), 512.0), -1536.0)) / 40320.0);} +inline float gamma(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -128484.0, 578188.0), -988256.0), 760312.0), -221060.0), 4732.0), -9464.0), -4032.0), 8064.0)) / 40320.0);} +inline float delta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 299796.0, -1349096.0), 2305856.0), -1774136.0), 517580.0), -13664.0), 13664.0), 32256.0), -32256.0)) / 40320.0);} +inline float eta(float y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, fma(y, fma(y, -449694.0, 2023630.0), -3458700.0), 2661540.0), -778806.0), 19110.0), -57400.0), 40320.0) / 40320.0);} +inline float zeta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 449694.0, -2023616.0), 3458644.0), -2662016.0), 780430.0), -13664.0), -13664.0), 32256.0), 32256.0)) / 40320.0);} +inline float theta(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -299796.0, 1349068.0), -2305744.0), 1775032.0), -520660.0), 4732.0), 9464.0), -4032.0), -8064.0)) / 40320.0);} +inline float iota(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 128484.0, -578168.0), 988176.0), -760872.0), 223020.0), -672.0), -2016.0), 512.0), 1536.0)) / 40320.0);} +inline float kappa(float y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -32121.0, 144541.0), -247046.0), 190246.0), -55685.0), 49.0), 196.0), -36.0), -144.0)) / 40320.0);} +inline float mu(float y){ + return ((y * y * y * y * y * fma(y, fma(y, fma(y, fma(y, 3569.0, -16060.0), 27450.0), -21140.0), 6181.0)) / 40320.0);} + + +#endif diff --git a/HySoP/hysop/gpu/gpu_analytic.py b/HySoP/hysop/gpu/gpu_analytic.py index b8298b008c22b84ed61119196b38bc8a35be9524..c783c032862da0bde3b71d98be0350d8e93de384 100644 --- a/HySoP/hysop/gpu/gpu_analytic.py +++ b/HySoP/hysop/gpu/gpu_analytic.py @@ -68,7 +68,7 @@ class GPUAnalytic(DiscreteOperator): if __VERBOSE__: print "Work-items basic config : ", print self.workItemNumber, self.gwi, self.lwi - dim = self.field.topology.dim + dim = self.field.dimension ## Minimum field coordinate (as GPU float4) self.coord_min = np.ones(4, dtype=self.gpu_precision) ## Field mesh size (as GPU float4) @@ -85,7 +85,7 @@ class GPUAnalytic(DiscreteOperator): self.build_options += " -D WI_NB=" + str(self.workItemNumber) ## OpenCL Binaries self.prg = self.cl_env.build_src(self.usr_src, self.build_options) - kernel_name = 'analytic' + self.field.name.split('_D')[0] + kernel_name = 'analytic' + self.field.name.rsplit('_', 1)[0] self.numMethod = KernelLauncher(eval('self.prg.' + kernel_name), self.cl_env.queue, self.gwi, diff --git a/HySoP/hysop/gpu/gpu_discrete.py b/HySoP/hysop/gpu/gpu_discrete.py index 2c50893394da0f26d3b8d021d281ca47d1ba42ea..08e007365081f778a0034f814272f04c0d0c31d3 100644 --- a/HySoP/hysop/gpu/gpu_discrete.py +++ b/HySoP/hysop/gpu/gpu_discrete.py @@ -3,35 +3,35 @@ Contains class for discrete fields on GPU. """ -from abc import ABCMeta, abstractmethod - from parmepy import __VERBOSE__ from parmepy.constants import ORDER, np, debug -from parmepy.fields.scalar import ScalarField -from parmepy.fields.vector import VectorField +from parmepy.fields.discrete import DiscreteField from parmepy.gpu import cl, PARMES_REAL_GPU, CL_PROFILE from parmepy.tools.timers import timed_function from parmepy.gpu.gpu_kernel import KernelLauncher, KernelListLauncher -class GPUDiscreteField(object): +class GPUDiscreteField(DiscreteField): """ - GPU discrete field interface. + GPU Discrete vector field implementation. + Allocates OpenCL device memory for the field. """ - - __metaclass__ = ABCMeta - - @abstractmethod - def __init__(self, queue, precision): + def __init__(self, queue, topology=None, isVector=False, name="?", + precision=PARMES_REAL_GPU, layout=True): """ Constructor. @param queue : OpenCL queue - @param precision : Required numerical precision. + @param precision : Floating point precision + @param parent : Continuous field. + @param topology : Topology informations + @param name : Field name + @param idFromParent : Index in the parent's discrete fields + @param layout : Boolean indicating if components are arranged in memory + @see parmepy.fields.vector.VectorField.__init__ """ + super(GPUDiscreteField, self).__init__(topology, isVector, name) ## OpenCL queue self.queue = queue - ## OpenCL Buffer pointer - self.gpu_data = None ## Precision for the field self.precision = precision ## Memory used @@ -41,48 +41,11 @@ class GPUDiscreteField(object): ## Initialization OpenCL kernel as KernelLauncher self.init_kernel = None self._isReleased = False - - def setInitializationKernel(self, kernel): - """ - Set the initialization kernel - @param kernel : KernelLauncher to use for initialize field. - """ - self.init_kernel = kernel - - @abstractmethod - def toHost(self): - pass - - @abstractmethod - def toDevice(self): - pass - - -class GPUVectorField(VectorField, GPUDiscreteField): - """ - GPU Discrete vector field implementation. - Allocates OpenCL device memory for the field. - """ - def __init__(self, queue, parent, - topology=None, name="?", idFromParent=None, - precision=PARMES_REAL_GPU, layout=True): - """ - Constructor. - @param queue : OpenCL queue - @param precision : Floating point precision - @param parent : Continuous field. - @param topology : Topology informations - @param name : Field name - @param idFromParent : Index in the parent's discrete fields - @param layout : Boolean indicating if components are arranged in memory - @see parmepy.fields.vector.VectorField.__init__ - """ - super(GPUVectorField, self).__init__(self, queue, precision, - parent, topology, - name, idFromParent) - self.gpu_data = [None for d in xrange(self.dimension)] + ## OpenCL Buffer pointer + self.gpu_data = [None] * self.nbComponents + ## Data layout is direction dependant self.layout = layout - for d in xrange(self.dimension): + for d in xrange(self.nbComponents): self.data[d] = np.asarray(self.data[d], dtype=precision, order=ORDER) self.gpu_data[d] = cl.Buffer(self.queue.context, @@ -94,8 +57,8 @@ class GPUVectorField(VectorField, GPUDiscreteField): print self.mem_size / (1024 ** 2), "MB)" @classmethod - def fromVectorField(cls, queue, vfield, precision=PARMES_REAL_GPU, - layout=True): + def fromField(cls, queue, vfield, precision=PARMES_REAL_GPU, + layout=True): """ Contructor from a discrete vector field. Mutates the given VectorField to a GPUVectorField. @@ -107,10 +70,13 @@ class GPUVectorField(VectorField, GPUDiscreteField): """ if not isinstance(vfield, GPUDiscreteField): vfield.__class__ = cls - GPUDiscreteField.__init__(vfield, queue, precision) - vfield.gpu_data = [None for d in xrange(vfield.dimension)] + GPUDiscreteField.__init__( + vfield, queue, + vfield.topology, vfield.isVector, vfield.name, + precision) + vfield.gpu_data = [None] * vfield.nbComponents vfield.layout = layout - for d in xrange(vfield.dimension): + for d in xrange(vfield.nbComponents): vfield.data[d] = np.asarray(vfield.data[d], dtype=precision, order=ORDER) vfield.gpu_data[d] = cl.Buffer(vfield.queue.context, @@ -121,16 +87,23 @@ class GPUVectorField(VectorField, GPUDiscreteField): print vfield.name, vfield.mem_size, "Bytes (", print vfield.mem_size / (1024 ** 2), "MB)" + def setInitializationKernel(self, kernel): + """ + Set the initialization kernel + @param kernel : KernelLauncher to use for initialize field. + """ + self.init_kernel = kernel + @debug @timed_function def dump(self, filename, mode=None): self.toHost() - VectorField.dump(self, filename, mode) + DiscreteField.dump(self, filename, mode) @debug @timed_function def load(self, filename): - VectorField.load(self, filename) + DiscreteField.load(self, filename) self.toDevice() self.contains_data = False self.data_on_device = True @@ -150,30 +123,16 @@ class GPUVectorField(VectorField, GPUDiscreteField): isGPUKernel = isinstance(formula, KernelLauncher) \ or isinstance(formula, KernelListLauncher) if not isGPUKernel and self.init_kernel is None: - VectorField.initialize(self, formula, *args) + DiscreteField.initialize(self, formula, *args) if self.contains_data: - if self.dimension == 2: - self.data[0] = np.asarray( - self.data[0], - dtype=self.precision, order=ORDER) - self.data[1] = np.asarray( - self.data[1], - dtype=self.precision, order=ORDER) - else: - self.data[0] = np.asarray( - self.data[0], - dtype=self.precision, order=ORDER) - self.data[1] = np.asarray( - self.data[1], - dtype=self.precision, order=ORDER) - self.data[2] = np.asarray( - self.data[2], + for d in xrange(self.nbComponents): + self.data[d] = np.asarray( + self.data[d], dtype=self.precision, order=ORDER) self.toDevice() else: if isGPUKernel: self.init_kernel = formula - print "Using OpenCL kernel", self.init_kernel coord_min = np.ones(4, dtype=self.precision) mesh_size = np.ones(4, dtype=self.precision) coord_min[0:self.dimension] = np.asarray( @@ -182,15 +141,18 @@ class GPUVectorField(VectorField, GPUDiscreteField): mesh_size[0:self.dimension] = np.asarray( self.topology.mesh.space_step, dtype=self.precision) - if self.dimension == 2: + if self.nbComponents == 2: self.init_kernel(self.gpu_data[0], self.gpu_data[1], coord_min, mesh_size, *args) - else: + elif self.nbComponents == 3: self.init_kernel(self.gpu_data[0], self.gpu_data[1], self.gpu_data[2], coord_min, mesh_size, *args) + else: + self.init_kernel(self.gpu_data[0], + coord_min, mesh_size, *args) self.contains_data = False self.data_on_device = True @@ -199,8 +161,8 @@ class GPUVectorField(VectorField, GPUDiscreteField): if __VERBOSE__: print "deallocate :", self.name, print " (" + str(self.mem_size / (1024. ** 2)) + " MBytes)" - [self.gpu_data[d].release() - for d in xrange(len(self.resolution))] + for d in xrange(self.nbComponents): + self.gpu_data[d].release() if self.init_kernel is not None: for f_timer in self.init_kernel.f_timer: self.timer.addFunctionTimer(f_timer) @@ -228,30 +190,26 @@ class GPUVectorField(VectorField, GPUDiscreteField): if __VERBOSE__: print "host->device :", self.name, time = 0. - evt = [None, None, None] + evt = [None] * 3 + if self.layout: evt[0] = cl.enqueue_copy( self.queue, self.gpu_data[0], self.data[0].ravel(order=ORDER)) - evt[1] = cl.enqueue_copy( - self.queue, self.gpu_data[1], - self.data[1].swapaxes(0, 1).ravel(order=ORDER)) - if self.dimension == 3: + if self.nbComponents > 1: + evt[1] = cl.enqueue_copy( + self.queue, self.gpu_data[1], + self.data[1].swapaxes(0, 1).ravel(order=ORDER)) + if self.nbComponents > 2: evt[2] = cl.enqueue_copy( self.queue, self.gpu_data[2], self.data[2].swapaxes(0, 1).swapaxes(0, 2).ravel( order=ORDER)) else: - evt[0] = cl.enqueue_copy( - self.queue, self.gpu_data[0], - self.data[0].ravel(order=ORDER)) - evt[1] = cl.enqueue_copy( - self.queue, self.gpu_data[1], - self.data[1].ravel(order=ORDER)) - if self.dimension == 3: - evt[2] = cl.enqueue_copy( - self.queue, self.gpu_data[2], - self.data[2].ravel(order=ORDER)) + for d in xrange(self.nbComponents): + evt[d] = cl.enqueue_copy( + self.queue, self.gpu_data[d], + self.data[d].ravel(order=ORDER)) if CL_PROFILE: self.queue.finish() for e in evt: @@ -280,53 +238,33 @@ class GPUVectorField(VectorField, GPUDiscreteField): self.queue.finish() shape = self.data[0].shape time = 0. - evt = [None, None, None] + evt = [None] * 3 if self.layout: - if self.dimension == 3: - # Get ZDIR component - temp = np.empty_like(self.data[2]).resize( - (shape[2], shape[0], shape[1])) - evt[2] = cl.enqueue_copy(self.queue, temp, - self.gpu_data[2]) - self.data[2] = temp.swapaxes(0, 2).swapaxes(0, 1) - # Get YDIR component - temp = np.empty_like(self.data[1]).resize( - (shape[1], shape[0], shape[2])) - evt[1] = cl.enqueue_copy(self.queue, temp, - self.gpu_data[1]) - self.data[1] = temp.swapaxes(0, 1) - # Get XDIR component - evt[0] = cl.enqueue_copy(self.queue, self.data[0], - self.gpu_data[0]) - else: + # Get XDIR component + evt[0] = cl.enqueue_copy(self.queue, self.data[0], + self.gpu_data[0]) + if self.nbComponents > 1: # Get YDIR component - temp = np.empty_like(self.data[1]).resize( - (shape[1], shape[0])) - print "ICICICICIIC ", temp.shape + temp = np.empty_like(self.data[1]) + if self.dimension == 3: + temp.resize((shape[1], shape[0], shape[2])) + else: + temp.resize((shape[1], shape[0])) evt[1] = cl.enqueue_copy(self.queue, temp, self.gpu_data[1]) self.data[1] = temp.swapaxes(0, 1) - # Get XDIR component - evt[0] = cl.enqueue_copy(self.queue, self.data[0], - self.gpu_data[0]) - else: - if self.dimension == 3: + if self.nbComponents > 2: # Get ZDIR component - evt[2] = cl.enqueue_copy(self.queue, self.data[2], + temp = np.empty_like(self.data[2]) + temp.resize((shape[2], shape[0], shape[1])) + evt[2] = cl.enqueue_copy(self.queue, temp, self.gpu_data[2]) - # Get YDIR component - evt[1] = cl.enqueue_copy(self.queue, self.data[1], - self.gpu_data[1]) - # Get XDIR component - evt[0] = cl.enqueue_copy(self.queue, self.data[0], - self.gpu_data[0]) - else: - # Get YDIR component - evt[1] = cl.enqueue_copy(self.queue, self.data[1], - self.gpu_data[1]) - # Get XDIR component - evt[0] = cl.enqueue_copy(self.queue, self.data[0], - self.gpu_data[0]) + self.data[2] = temp.swapaxes(0, 2).swapaxes(0, 1) + else: + for d in xrange(self.nbComponents): + # Get d component + evt[d] = cl.enqueue_copy(self.queue, self.data[d], + self.gpu_data[d]) if CL_PROFILE: self.queue.finish() for e in evt: @@ -337,180 +275,3 @@ class GPUVectorField(VectorField, GPUDiscreteField): print "{0:.3f} GBytes/sec".format( self.mem_size / (time * 1024 ** 3)) print 'Transfer device->host' - - -class GPUScalarField(ScalarField, GPUDiscreteField): - """ - GPU Discrete scalar field implementation. - """ - def __init__(self, queue, parent, - topology=None, name="?", idFromParent=None, - precision=PARMES_REAL_GPU): - """ - Constructor. - @param queue : OpenCL queue - @param precision : Floating point precision - @param parent : Continuous field. - @param topology : Topology informations - @param name : Field name - @param idFromParent : Index in the parent's discrete fields - @see parmepy.fields.vector.ScalarField.__init__ - """ - super(GPUScalarField, self).__init__(self, queue, precision, - parent, topology, - name, idFromParent) - self.data = np.asarray(self.data, - dtype=precision, order=ORDER) - self.gpu_data = cl.Buffer(self.queue.context, - cl.mem_flags.READ_WRITE, - size=self.data.nbytes) - self.mem_size = self.gpu_data.size - if __VERBOSE__: - print self.name, self.mem_size, "Bytes (", - self.mem_size / (1024 ** 2), "MB)" - - @classmethod - def fromScalarField(cls, queue, sfield, precision=PARMES_REAL_GPU): - """ - Contructor from a discrete scalar field. - - Mutates the given ScalarField to a GPUScalarField. - - @param cls : Class - @param queue : OpenCL queue - @param sfield : ScalarField - @param precision : Floating point precision - """ - if not isinstance(sfield, GPUDiscreteField): - sfield.__class__ = cls - GPUDiscreteField.__init__(sfield, queue, precision) - sfield.data = np.asarray(sfield.data, - dtype=precision, order=ORDER) - sfield.gpu_data = cl.Buffer(sfield.queue.context, - cl.mem_flags.READ_WRITE, - size=sfield.data.nbytes) - sfield.mem_size = sfield.gpu_data.size - if __VERBOSE__: - print sfield.name, sfield.mem_size, "Bytes (", - print sfield.mem_size / (1024 ** 2), "MB)" - - @debug - def dump(self, filename, mode=None): - self.toHost() - ScalarField.dump(self, filename, mode) - - @debug - def load(self, filename): - ScalarField.load(self, filename) - self.toDevice() - self.contains_data = False - self.data_on_device = True - - @debug - @timed_function - def initialize(self, formula=None, *args): - """ - GPU data initialization. - @param formula : Python formula to use - @param args : Extra parameters - Performs the initialization from different ways if device not already - contains up-to-date data: - - with an OpenCL kernel, - - with a python formula (as VectorField) and the copy data to device. - """ - isGPUKernel = isinstance(formula, KernelLauncher) \ - or isinstance(formula, KernelListLauncher) - if not isGPUKernel and self.init_kernel is None: - ScalarField.initialize(self, formula, *args) - if self.contains_data: - self.data = np.asarray(self.data, - dtype=self.precision, order=ORDER) - self.toDevice() - else: - if isGPUKernel: - self.init_kernel = formula - coord_min = np.ones(4, dtype=self.precision) - mesh_size = np.ones(4, dtype=self.precision) - coord_min[0:self.dimension] = np.asarray( - self.topology.mesh.origin, - dtype=self.precision) - mesh_size[0:self.dimension] = np.asarray( - self.topology.mesh.space_step, - dtype=self.precision) - self.init_kernel(self.gpu_data, coord_min, mesh_size, *args) - self.contains_data = False - self.data_on_device = True - - def release(self): - if not self._isReleased: - if __VERBOSE__: - print "deallocate :", self.name, - print " (" + str(self.mem_size / (1024. ** 2)) + " MBytes)" - self.gpu_data.release() - if self.init_kernel is not None: - for f_timer in self.init_kernel.f_timer: - self.timer.addFunctionTimer(f_timer) - print self.timer - self._isReleased = True - - @timed_function - def toDevice(self): - """ - Host to device method. - - Performs a direct OpenCL copy from numpy arrays - to OpenCL Buffers.\n - Arrange memory on device so that vector components are - contiguous in the direction of the component.\n - Example : A 3D vector field F(x,y,z) is made up of 3 - OpenCL Buffers Fx, Fy, Fz. The memory layout is : - - Fx : x-major ordering. On device, - Fx[i + j*WIDTH + k*WIDTH*HEIGHT] access to Fx(i,j,k) - - Fy : y-major ordering. On device, - Fy[i + j*WIDTH + k*WIDTH*HEIGHT] access to Fy(j,i,k) - - Fz : z-major ordering. On device, - Fz[i + j*WIDTH + k*WIDTH*HEIGHT] access to Fz(k,i,j) - """ - if __VERBOSE__: - print "host->device :", self.name, - time = 0. - evt = [None, None, None] - evt[0] = cl.enqueue_copy(self.queue, self.gpu_data, - self.data.ravel(order=ORDER)) - if CL_PROFILE: - self.queue.finish() - for e in evt: - if e is not None: - time += (e.profile.end - e.profile.start) * 1e-9 - if __VERBOSE__ and CL_PROFILE: - print self.mem_size, "Bytes transfered at ", - print "{0:.3f} GBytes/sec".format( - self.mem_size / (time * 1024 ** 3)) - - @timed_function - def toHost(self): - """ - Device to host method. - - Performs a direct OpenCL copy from OpenCL Buffers - to numpy arrays.\n - As memory layout is arranged on device, not only a - copy is performed but also transpositions to have numpy - arrays consistent to each other. - """ - if not self.contains_data: - if __VERBOSE__: - print "device->host :", self.name, - time = 0. - evt = [None, None, None] - evt[0] = cl.enqueue_copy(self.queue, self.data, - self.gpu_data) - if CL_PROFILE: - self.queue.finish() - for e in evt: - if e is not None: - time += (e.profile.end - e.profile.start) * 1e-9 - if __VERBOSE__ and CL_PROFILE: - print self.mem_size, "Bytes transfered at ", - print "{0:.3f} GBytes/sec".format( - self.mem_size / (time * 1024 ** 3)) diff --git a/HySoP/hysop/gpu/gpu_particle_advection.py b/HySoP/hysop/gpu/gpu_particle_advection.py index 8d23b5798dfe5c0f0471d677816c768bfc5e07b6..2d218b7e59000ecdeb340bc2dd50d8820efec542 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection.py +++ b/HySoP/hysop/gpu/gpu_particle_advection.py @@ -10,8 +10,6 @@ from parmepy.operator.discrete.particle_advection import ParticleAdvection from parmepy.gpu import PARMES_REAL_GPU, cl from parmepy.gpu.tools import get_opencl_environment from parmepy.gpu.gpu_kernel import KernelLauncher -#from parmepy.gpu.gpu_kernel import KernelListLauncher -#from parmepy.numerics.splitting import Splitting class GPUParticleAdvection(ParticleAdvection): @@ -51,19 +49,16 @@ class GPUParticleAdvection(ParticleAdvection): """ ParticleAdvection.__init__(self, velocity, advectedFields, d, part_position, part_advectedFields, method) - if method.find('m4prime') >= 0: - self.method = "m4prime" - elif method.find('m6prime') >= 0: - self.method = "m6prime" - elif method.find('m8prime') >= 0: - self.method = "m8prime" - elif method.find('l6star') >= 0: - self.method = "l6star" - elif method.find('l4star') >= 0: - self.method = "l4star" - else: - print 'No remeshing fromula specified, use default m6prime' - self.method = "m6prime" + self.available_methods = [ + 'l2star_c4', 'l2star_c3', 'l2star_c2', 'm4prime', + 'l4star_c4', 'l4star_c3', 'l4star', + 'l6star_c6', 'l6star_c5', 'l6star_c4', 'l6star', 'm8prime', + 'l8star'] + self.method = 'l4star' + for m in self.available_methods: + if method.find(m) >= 0: + self.method = m + break self.name += ' GPU ' + self.method self.user_gpu_src = src self.num_method = None @@ -71,9 +66,6 @@ class GPUParticleAdvection(ParticleAdvection): # Compute resolutions for kernels for each direction. resol = self.advectedFields[0].topology.mesh.resolution self.resolution = resol.copy() - # resolution[0] = NB_X, NB_Y (, NB_Z) for X direction - # resolution[1] = NB_Y, NB_X (, NB_Z) for Y direction - # resolution[2] = NB_Z, NB_X, NB_Y for Z direction if len(resol) == 2: if self.dir == 1: self.resolution[0] = resol[1] @@ -111,7 +103,7 @@ class GPUParticleAdvection(ParticleAdvection): if __VERBOSE__: print "Work-items basic config : ", print self.workItemNumber, self.gwi, self.lwi - dim = self.advectedFields[0].topology.dim + dim = self.advectedFields[0].dimension self.coord_min = np.ones(4, dtype=self.gpu_precision) self.mesh_size = np.ones(4, dtype=self.gpu_precision) self.coord_min[0:dim] = np.asarray( @@ -135,22 +127,6 @@ class GPUParticleAdvection(ParticleAdvection): self._buffer_initialisations() if __VERBOSE__: print "===\n" - if self.advectedFields[0].isVector: - if dim == 2: - self.apply = self._apply_in_dir_vector_2d - #self.numMethod = Splitting(self._apply_in_dir_vector_2d, - # self.advectedFields[0].dimension, - # config=self.splittingConfig) - else: - self.apply = self._apply_in_dir_vector_3d - #self.numMethod = Splitting(self._apply_in_dir_vector_3d, - # self.advectedFields[0].dimension, - # config=self.splittingConfig) - else: - self.apply = self._apply_in_dir - #self.numMethod = Splitting(self._apply_in_dir, - # self.advectedFields[0].dimension, - # config=self.splittingConfig) def _buffer_initialisations(self): """ @@ -410,7 +386,7 @@ class GPUParticleAdvection(ParticleAdvection): self.prg = self.cl_env.build_src(usr_src, build_options) @debug - def _apply_in_dir(self, simulation, dtCoeff, split_id, old_dir): + def apply(self, simulation, dtCoeff, split_id, old_dir): """ Apply operator along specified splitting direction. @param t : Current time @@ -426,95 +402,23 @@ class GPUParticleAdvection(ParticleAdvection): self.cl_env.queue.finish() # Particle init if old_dir == self.dir: - self.init_copy(self.advectedFields[0].gpu_data, - self.part_advectedFields[0].gpu_data) + for p, g in zip(self.advectedFields[0].gpu_data, + self.part_advectedFields[0].gpu_data): + self.init_copy(p, g) else: if dim == 2: - self.init_transpose_xy(self.advectedFields[0].gpu_data, - self.part_advectedFields[0].gpu_data) + for p, g in zip(self.advectedFields[0].gpu_data, + self.part_advectedFields[0].gpu_data): + self.init_transpose_xy(p, g) else: if min(old_dir, self.dir) == 0: - self.init_transpose_xy( - self.advectedFields[0].gpu_data, - self.part_advectedFields[0].gpu_data) + for p, g in zip(self.advectedFields[0].gpu_data, + self.part_advectedFields[0].gpu_data): + self.init_transpose_xy(p, g) else: - self.init_transpose_xz( - self.advectedFields[0].gpu_data, - self.part_advectedFields[0].gpu_data) - - @debug - def _apply_in_dir_vector_2d(self, simulation, dtCoeff, split_id, old_dir): - """ - Apply operator along specified splitting direction. For 2D vector - advection. - @param t : Current time - @param dt : Time step - @param d : Splitting direction - @param split_id : Splitting step id - """ - # Check new data on host - for df in self.input: - if df.contains_data and not df.data_on_device: - df.toDevice() - self.cl_env.queue.finish() - # Particle init - if old_dir == self.dir: - self.init_copy(self.advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[0]) - self.init_copy(self.advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[1]) - else: - self.init_transpose_xy( - self.advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[0]) - self.init_transpose_xy( - self.advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[1]) - - @debug - def _apply_in_dir_vector_3d(self, simulation, dtCoeff, split_id, old_dir): - """ - Apply operator along specified splitting direction. For 3D vector - advection. - @param t : Current time - @param dt : Time step - @param d : Splitting direction - @param split_id : Splitting step id - """ - # Check new data on host - for df in self.input: - if df.contains_data and not df.data_on_device: - df.toDevice() - self.cl_env.queue.finish() - # Particle init - if old_dir == self.dir: - self.init_copy(self.advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[0]) - self.init_copy(self.advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[1]) - self.init_copy(self.advectedFields[0].gpu_data[2], - self.part_advectedFields[0].gpu_data[2]) - else: - if min(old_dir, self.dir) == 0: - self.init_transpose_xy( - self.advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[0]) - self.init_transpose_xy( - self.advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[1]) - self.init_transpose_xy( - self.advectedFields[0].gpu_data[2], - self.part_advectedFields[0].gpu_data[2]) - else: - self.init_transpose_xz( - self.advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[0]) - self.init_transpose_xz( - self.advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[1]) - self.init_transpose_xz( - self.advectedFields[0].gpu_data[2], - self.part_advectedFields[0].gpu_data[2]) + for p, g in zip(self.advectedFields[0].gpu_data, + self.part_advectedFields[0].gpu_data): + self.init_transpose_xz(p, g) @debug def finalize(self): diff --git a/HySoP/hysop/gpu/gpu_particle_advection_1k.py b/HySoP/hysop/gpu/gpu_particle_advection_1k.py index 88592e9de1aa7a6110f0d74d720ed8244bafdb3d..68b0456553d1890278382b958ef0b6601aa2376a 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection_1k.py +++ b/HySoP/hysop/gpu/gpu_particle_advection_1k.py @@ -6,10 +6,10 @@ Discrete advection representation. from parmepy.constants import debug from parmepy.gpu.gpu_particle_advection import GPUParticleAdvection from parmepy.fields.continuous import Field -from parmepy.gpu.gpu_discrete import GPUVectorField -from parmepy.gpu.gpu_discrete import GPUScalarField +from parmepy.gpu.gpu_discrete import GPUDiscreteField from parmepy.gpu import PARMES_REAL_GPU from parmepy.gpu.gpu_kernel import KernelLauncher +from parmepy.tools.timers import timed_function class GPUParticleAdvection1k(GPUParticleAdvection): @@ -62,18 +62,13 @@ class GPUParticleAdvection1k(GPUParticleAdvection): grid values and the other for particles. """ ## Velocity. - GPUVectorField.fromVectorField(self.cl_env.queue, self.velocity, - self.gpu_precision) - ## Transported scalar. - if self.advectedFields[0].isVector: - GPUVectorField.fromVectorField(self.cl_env.queue, - self.advectedFields[0], - self.gpu_precision, - layout=False) - else: - GPUScalarField.fromScalarField(self.cl_env.queue, - self.advectedFields[0], - self.gpu_precision) + GPUDiscreteField.fromField(self.cl_env.queue, self.velocity, + self.gpu_precision) + ## Transported field. + GPUDiscreteField.fromField(self.cl_env.queue, + self.advectedFields[0], + self.gpu_precision, + layout=False) ## Result scalar if self.part_advectedFields is None: self.part_advectedFields = [ @@ -81,14 +76,10 @@ class GPUParticleAdvection1k(GPUParticleAdvection): name="Particle_AdvectedFields", isVector=self.advectedFields[0].isVector ).discretize(self.advectedFields[0].topology)] - if self.part_advectedFields[0].isVector: - GPUVectorField.fromVectorField( - self.cl_env.queue, self.part_advectedFields[0], - self.gpu_precision, layout=False) - else: - GPUScalarField.fromScalarField( - self.cl_env.queue, self.part_advectedFields[0], - self.gpu_precision) + GPUDiscreteField.fromField( + self.cl_env.queue, self.part_advectedFields[0], + self.gpu_precision, + layout=False) self.variables = [self.advectedFields[0], self.velocity, self.part_advectedFields[0]] @@ -159,8 +150,7 @@ class GPUParticleAdvection1k(GPUParticleAdvection): if self.gpu_precision is PARMES_REAL_GPU: with_noBC = False else: - #file_list.append("remeshing/weights_noVec_builtin.cl") - file_list.append("remeshing/weights_noVec.cl") + file_list.append("remeshing/weights_noVec_builtin.cl") if self.advectedFields[0].isVector: file_list.append("remeshing/basic_vector_2d.cl") else: @@ -168,13 +158,10 @@ class GPUParticleAdvection1k(GPUParticleAdvection): if self.gpu_precision is PARMES_REAL_GPU: vec = 8 file_list.append("advection/builtin.cl") - if self.advectedFields[0].isVector: - if len(self.resolution) == 3: - file_list.append( - "kernels/advection_and_remeshing_vector_3d.cl") - else: - file_list.append( - "kernels/advection_and_remeshing_vector_2d.cl") + if self.advectedFields[0].nbComponents == 3: + file_list.append("kernels/advection_and_remeshing_vector_3d.cl") + elif self.advectedFields[0].nbComponents == 2: + file_list.append("kernels/advection_and_remeshing_vector_2d.cl") else: file_list.append("kernels/advection_and_remeshing.cl") build_options += " -D FORMULA=" + self.method.upper() @@ -192,7 +179,8 @@ class GPUParticleAdvection1k(GPUParticleAdvection): prg.advection_and_remeshing, self.cl_env.queue, gwi, lwi) @debug - def _apply_in_dir(self, simulation, dtCoeff, split_id, old_dir): + @timed_function + def apply(self, simulation, dtCoeff, split_id, old_dir): """ Apply advection operator. @@ -210,71 +198,41 @@ class GPUParticleAdvection1k(GPUParticleAdvection): scalar. Performs a RK2 resolution of dx_p/dt = a_p. @li 3. Profile timings of OpenCL kernels. """ - GPUParticleAdvection._apply_in_dir(self, simulation, dtCoeff, - split_id, old_dir) - dt = simulation.timeStep * dtCoeff - # Advection - self.cl_env.queue.finish() - self.num_advec_and_remesh(self.velocity.gpu_data[self.dir], - self.part_advectedFields[0].gpu_data, - self.advectedFields[0].gpu_data, - self.gpu_precision(dt), - self.coord_min[self.dir], - self.mesh_size[self.dir]) - for df in self.output: - df.data_on_device = True - df.contains_data = False - - @debug - def _apply_in_dir_vector_2d(self, simulation, dtCoeff, split_id, old_dir): - """ - Apply advection operator for 2D vector advection. - @param t : current time. - @param dt : time step. - @param d : Direction of splitting. - @see _apply_in_dir - """ - GPUParticleAdvection._apply_in_dir_vector_2d(self, simulation, dtCoeff, - split_id, old_dir) + GPUParticleAdvection.apply(self, simulation, dtCoeff, + split_id, old_dir) dt = simulation.timeStep * dtCoeff # Advection self.cl_env.queue.finish() - self.num_advec_and_remesh(self.velocity.gpu_data[self.dir], - self.part_advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[1], - self.advectedFields[0].gpu_data[0], - self.advectedFields[0].gpu_data[1], - self.gpu_precision(dt), - self.coord_min[self.dir], - self.mesh_size[self.dir]) - for df in self.output: - df.data_on_device = True - df.contains_data = False - - @debug - def _apply_in_dir_vector_3d(self, simulation, dtCoeff, split_id, old_dir): - """ - Apply advection operator for 3D vector advection. - @param t : current time. - @param dt : time step. - @param d : Direction of splitting. - @see _apply_in_dir - """ - GPUParticleAdvection._apply_in_dir_vector_3d(self, simulation, dtCoeff, - split_id, old_dir) - dt = simulation.timeStep * dtCoeff - # Advection - self.cl_env.queue.finish() - self.num_advec_and_remesh(self.velocity.gpu_data[self.dir], - self.part_advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[2], - self.advectedFields[0].gpu_data[0], - self.advectedFields[0].gpu_data[1], - self.advectedFields[0].gpu_data[2], - self.gpu_precision(dt), - self.coord_min[self.dir], - self.mesh_size[self.dir]) + if self.part_advectedFields[0].nbComponents == 3: + self.num_advec_and_remesh( + self.velocity.gpu_data[self.dir], + self.part_advectedFields[0].gpu_data[0], + self.part_advectedFields[0].gpu_data[1], + self.part_advectedFields[0].gpu_data[2], + self.advectedFields[0].gpu_data[0], + self.advectedFields[0].gpu_data[1], + self.advectedFields[0].gpu_data[2], + self.gpu_precision(dt), + self.coord_min[self.dir], + self.mesh_size[self.dir]) + elif self.part_advectedFields[0].nbComponents == 2: + self.num_advec_and_remesh( + self.velocity.gpu_data[self.dir], + self.part_advectedFields[0].gpu_data[0], + self.part_advectedFields[0].gpu_data[1], + self.advectedFields[0].gpu_data[0], + self.advectedFields[0].gpu_data[1], + self.gpu_precision(dt), + self.coord_min[self.dir], + self.mesh_size[self.dir]) + else: + self.num_advec_and_remesh( + self.velocity.gpu_data[self.dir], + self.part_advectedFields[0].gpu_data[0], + self.advectedFields[0].gpu_data[0], + self.gpu_precision(dt), + self.coord_min[self.dir], + self.mesh_size[self.dir]) for df in self.output: df.data_on_device = True df.contains_data = False diff --git a/HySoP/hysop/gpu/gpu_particle_advection_2k.py b/HySoP/hysop/gpu/gpu_particle_advection_2k.py index c4fdab9067d09ddf94cab3bff2c50cc9bf28d40f..f3ec479227cdcdb4085c97475d62de99ac9652a5 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection_2k.py +++ b/HySoP/hysop/gpu/gpu_particle_advection_2k.py @@ -1,15 +1,15 @@ """ @file gpu_particle_advection_2k.py -Discrete advection representation +Discrete advection representation. """ from parmepy.constants import debug from parmepy.gpu.gpu_particle_advection import GPUParticleAdvection from parmepy.fields.continuous import Field -from parmepy.gpu.gpu_discrete import GPUVectorField -from parmepy.gpu.gpu_discrete import GPUScalarField +from parmepy.gpu.gpu_discrete import GPUDiscreteField from parmepy.gpu import PARMES_REAL_GPU from parmepy.gpu.gpu_kernel import KernelLauncher +from parmepy.tools.timers import timed_function class GPUParticleAdvection2k(GPUParticleAdvection): @@ -62,18 +62,13 @@ class GPUParticleAdvection2k(GPUParticleAdvection): Allocates a buffer for storing particle positions. """ ## Velocity. - GPUVectorField.fromVectorField(self.cl_env.queue, self.velocity, - self.gpu_precision) + GPUDiscreteField.fromField(self.cl_env.queue, self.velocity, + self.gpu_precision) ## Transported field. - if self.advectedFields[0].isVector: - GPUVectorField.fromVectorField(self.cl_env.queue, - self.advectedFields[0], - self.gpu_precision, - layout=False) - else: - GPUScalarField.fromScalarField(self.cl_env.queue, - self.advectedFields[0], - self.gpu_precision) + GPUDiscreteField.fromField(self.cl_env.queue, + self.advectedFields[0], + self.gpu_precision, + layout=False) ## Particle position if self.part_position is None: self.part_position = \ @@ -81,7 +76,7 @@ class GPUParticleAdvection2k(GPUParticleAdvection): name="Particle_Position", isVector=False ).discretize(self.advectedFields[0].topology) - GPUScalarField.fromScalarField( + GPUDiscreteField.fromField( self.cl_env.queue, self.part_position, self.gpu_precision) ## Result scalar @@ -91,14 +86,10 @@ class GPUParticleAdvection2k(GPUParticleAdvection): name="Particle_AdvectedFields", isVector=self.advectedFields[0].isVector ).discretize(self.advectedFields[0].topology)] - if self.part_advectedFields[0].isVector: - GPUVectorField.fromVectorField( - self.cl_env.queue, self.part_advectedFields[0], - self.gpu_precision, layout=False) - else: - GPUScalarField.fromScalarField( - self.cl_env.queue, self.part_advectedFields[0], - self.gpu_precision) + GPUDiscreteField.fromField( + self.cl_env.queue, self.part_advectedFields[0], + self.gpu_precision, + layout=False) self.variables = [self.advectedFields[0], self.velocity, self.part_position, self.part_advectedFields[0]] @@ -240,11 +231,10 @@ class GPUParticleAdvection2k(GPUParticleAdvection): else: file_list.append("remeshing/basic.cl") with_noBC = True - if self.advectedFields[0].isVector: - if len(self.resolution) == 3: - file_list.append("kernels/remeshing_vector_3d.cl") - else: - file_list.append("kernels/remeshing_vector_2d.cl") + if self.advectedFields[0].nbComponents == 3: + file_list.append("kernels/remeshing_vector_3d.cl") + elif self.advectedFields[0].nbComponents == 2: + file_list.append("kernels/remeshing_vector_2d.cl") else: file_list.append("kernels/remeshing.cl") build_options += " -D FORMULA=" + self.method.upper() @@ -263,7 +253,8 @@ class GPUParticleAdvection2k(GPUParticleAdvection): prg.remeshing_kernel, self.cl_env.queue, gwi, lwi) @debug - def _apply_in_dir(self, simulation, dtCoeff, split_id, old_dir): + @timed_function + def apply(self, simulation, dtCoeff, split_id, old_dir): """ Apply advection operator. @@ -281,86 +272,42 @@ class GPUParticleAdvection2k(GPUParticleAdvection): scalar. Performs a RK2 resolution of dx_p/dt = a_p. @li 3. Profile timings of OpenCL kernels. """ - GPUParticleAdvection._apply_in_dir(self, simulation, dtCoeff, - split_id, old_dir) + GPUParticleAdvection.apply(self, simulation, dtCoeff, + split_id, old_dir) dt = simulation.timeStep * dtCoeff # Advection self.num_advec(self.velocity.gpu_data[self.dir], - self.part_position.gpu_data, + self.part_position.gpu_data[0], self.gpu_precision(dt), self.coord_min[self.dir], self.mesh_size[self.dir]) self.cl_env.queue.finish() - # remeshing - self.num_remesh(self.part_position.gpu_data, - self.part_advectedFields[0].gpu_data, - self.advectedFields[0].gpu_data, - self.coord_min[self.dir], - self.mesh_size[self.dir]) - for df in self.output: - df.data_on_device = True - df.contains_data = False + # Remeshing + if self.part_advectedFields[0].nbComponents == 3: + self.num_remesh(self.part_position.gpu_data[0], + self.part_advectedFields[0].gpu_data[0], + self.part_advectedFields[0].gpu_data[1], + self.part_advectedFields[0].gpu_data[2], + self.advectedFields[0].gpu_data[0], + self.advectedFields[0].gpu_data[1], + self.advectedFields[0].gpu_data[2], + self.coord_min[self.dir], + self.mesh_size[self.dir]) - @debug - def _apply_in_dir_vector_2d(self, simulation, dtCoeff, split_id, old_dir): - """ - Apply advection operator for 2D vector advection. - @param t : current time. - @param dt : time step. - @param d : Direction of splitting. - @see _apply_in_dir - """ - GPUParticleAdvection._apply_in_dir_vector_2d(self, simulation, dtCoeff, - split_id, old_dir) - dt = simulation.timeStep * dtCoeff - # Advection - self.num_advec(self.velocity.gpu_data[self.dir], - self.part_position.gpu_data, - self.gpu_precision(dt), - self.coord_min[self.dir], - self.mesh_size[self.dir]) - self.cl_env.queue.finish() - # remeshing - self.num_remesh(self.part_position.gpu_data, - self.part_advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[1], - self.advectedFields[0].gpu_data[0], - self.advectedFields[0].gpu_data[1], - self.coord_min[self.dir], - self.mesh_size[self.dir]) - for df in self.output: - df.data_on_device = True - df.contains_data = False - - @debug - def _apply_in_dir_vector_3d(self, simulation, dtCoeff, split_id, old_dir): - """ - Apply advection operator for 3D vector advection. - @param t : current time. - @param dt : time step. - @param d : Direction of splitting. - @see _apply_in_dir - """ - GPUParticleAdvection._apply_in_dir_vector_3d(self, simulation, dtCoeff, - split_id, old_dir) - dt = simulation.timeStep * dtCoeff - # Advection - self.num_advec(self.velocity.gpu_data[self.dir], - self.part_position.gpu_data, - self.gpu_precision(dt), - self.coord_min[self.dir], - self.mesh_size[self.dir]) - self.cl_env.queue.finish() - # remeshing - self.num_remesh(self.part_position.gpu_data, - self.part_advectedFields[0].gpu_data[0], - self.part_advectedFields[0].gpu_data[1], - self.part_advectedFields[0].gpu_data[2], - self.advectedFields[0].gpu_data[0], - self.advectedFields[0].gpu_data[1], - self.advectedFields[0].gpu_data[2], - self.coord_min[self.dir], - self.mesh_size[self.dir]) + elif self.part_advectedFields[0].nbComponents == 2: + self.num_remesh(self.part_position.gpu_data[0], + self.part_advectedFields[0].gpu_data[0], + self.part_advectedFields[0].gpu_data[1], + self.advectedFields[0].gpu_data[0], + self.advectedFields[0].gpu_data[1], + self.coord_min[self.dir], + self.mesh_size[self.dir]) + else: + self.num_remesh(self.part_position.gpu_data[0], + self.part_advectedFields[0].gpu_data[0], + self.advectedFields[0].gpu_data[0], + self.coord_min[self.dir], + self.mesh_size[self.dir]) for df in self.output: df.data_on_device = True df.contains_data = False @@ -373,10 +320,6 @@ class GPUParticleAdvection2k(GPUParticleAdvection): self.timer.addFunctionTimer(f_timer) GPUParticleAdvection.finalize(self) - def __str__(self): - s = "GPUParticleAdvection2k (GPUParticleAdvection). " - s += DiscreteOperator.__str__(self) - return s if __name__ == "__main__": print __doc__ diff --git a/HySoP/hysop/gpu/tests/test_advection_nullVelocity.py b/HySoP/hysop/gpu/tests/test_advection_nullVelocity.py index a99f03cfb3bcf0cacabec5dd1065748fd5780532..c053293d93b31765c6e5ea529abfc8fc9307a1fd 100644 --- a/HySoP/hysop/gpu/tests/test_advection_nullVelocity.py +++ b/HySoP/hysop/gpu/tests/test_advection_nullVelocity.py @@ -29,13 +29,13 @@ def assertion_2D(scal, velo, advec): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), - dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[0][...] = np.zeros_like(scal_d.data, + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), + dtype=PARMES_REAL_GPU, order=ORDER) + velo_d.data[0][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[1][...] = np.zeros_like(scal_d.data, + velo_d.data[1][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - scal_init = scal_d.data.copy() + scal_init = scal_d.data[0].copy() scal_d.toDevice() velo_d.toDevice() @@ -44,7 +44,7 @@ def assertion_2D(scal, velo, advec): scal_d.toHost() advec.finalize() - return np.allclose(scal_init, scal_d.data) + return np.allclose(scal_init, scal_d.data[0]) def assertion_2D_withPython(scal, velo, advec, advec_py): @@ -53,11 +53,11 @@ def assertion_2D_withPython(scal, velo, advec, advec_py): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), - dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[0][...] = np.zeros_like(scal_d.data, + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), + dtype=PARMES_REAL_GPU, order=ORDER) + velo_d.data[0][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[1][...] = np.zeros_like(scal_d.data, + velo_d.data[1][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) scal_d.toDevice() velo_d.toDevice() @@ -65,11 +65,11 @@ def assertion_2D_withPython(scal, velo, advec, advec_py): advec_py.apply(Simulation(0., 0.01, 1)) advec.apply(Simulation(0., 0.01, 1)) - py_res = scal_d.data.copy() + py_res = scal_d.data[0].copy() scal_d.toHost() advec.finalize() - return np.allclose(py_res, scal_d.data) + return np.allclose(py_res, scal_d.data[0]) def assertion_3D(scal, velo, advec): @@ -77,15 +77,15 @@ def assertion_3D(scal, velo, advec): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[0][...] = np.zeros_like(scal_d.data, + velo_d.data[0][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[1][...] = np.zeros_like(scal_d.data, + velo_d.data[1][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[2][...] = np.zeros_like(scal_d.data, + velo_d.data[2][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - scal_init = scal_d.data.copy() + scal_init = scal_d.data[0].copy() scal_d.toDevice() velo_d.toDevice() @@ -94,7 +94,7 @@ def assertion_3D(scal, velo, advec): scal_d.toHost() advec.finalize() - return np.allclose(scal_init, scal_d.data) + return np.allclose(scal_init, scal_d.data[0]) def assertion_3D_withPython(scal, velo, advec, advec_py): @@ -103,13 +103,13 @@ def assertion_3D_withPython(scal, velo, advec, advec_py): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[0][...] = np.zeros_like(scal_d.data, + velo_d.data[0][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[1][...] = np.zeros_like(scal_d.data, + velo_d.data[1][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[2][...] = np.zeros_like(scal_d.data, + velo_d.data[2][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) scal_d.toDevice() velo_d.toDevice() @@ -117,11 +117,11 @@ def assertion_3D_withPython(scal, velo, advec, advec_py): advec_py.apply(Simulation(0., 0.01, 1)) advec.apply(Simulation(0., 0.01, 1)) - py_res = scal_d.data.copy() + py_res = scal_d.data[0].copy() scal_d.toHost() advec.finalize() - return np.allclose(py_res, scal_d.data) + return np.allclose(py_res, scal_d.data[0]) # M6 tests @@ -754,24 +754,24 @@ def test_rectangular_domain2D(): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[0][...] = np.zeros_like(scal_d.data, + velo_d.data[0][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[1][...] = np.zeros_like(scal_d.data, + velo_d.data[1][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - scal_init = scal_d.data.copy() + scal_init = scal_d.data[0].copy() scal_d.toDevice() velo_d.toDevice() advec.apply(Simulation(0., 0.01, 1)) advec_py.apply(Simulation(0., 0.01, 1)) - scal_py_res = scal_d.data.copy() + scal_py_res = scal_d.data[0].copy() scal_d.toHost() advec.finalize() - assert np.allclose(scal_init, scal_d.data) + assert np.allclose(scal_init, scal_d.data[0]) assert np.allclose(scal_init, scal_py_res) @@ -796,26 +796,26 @@ def test_rectangular_domain3D(): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[0][...] = np.zeros_like(scal_d.data, + velo_d.data[0][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[1][...] = np.zeros_like(scal_d.data, + velo_d.data[1][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - velo_d.data[2][...] = np.zeros_like(scal_d.data, + velo_d.data[2][...] = np.zeros_like(scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) - scal_init = scal_d.data.copy() + scal_init = scal_d.data[0].copy() scal_d.toDevice() velo_d.toDevice() advec.apply(Simulation(0., 0.01, 1)) advec_py.apply(Simulation(0., 0.01, 1)) - scal_py_res = scal_d.data.copy() + scal_py_res = scal_d.data[0].copy() scal_d.toHost() advec.finalize() - assert np.allclose(scal_init, scal_d.data) + assert np.allclose(scal_init, scal_d.data[0]) assert np.allclose(scal_init, scal_py_res) diff --git a/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py b/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py index d4e198297fea34b66502f8e6053860d6fe959758..746cdd510aac1ad06f912469bff86fc39ddb4f35 100644 --- a/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py +++ b/HySoP/hysop/gpu/tests/test_advection_randomVelocity.py @@ -30,14 +30,14 @@ def assertion_2D_withPython(scal, velo, advec, advec_py): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray( - np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray( + np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) velo_d.data[0][...] = np.asarray( - np.random.random(scal_d.data.shape), + np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) / (2. * scal_d.resolution[0]) velo_d.data[1][...] = np.asarray( - np.random.random(scal_d.data.shape), + np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) / (2. * scal_d.resolution[1]) scal_d.toDevice() velo_d.toDevice() @@ -45,11 +45,11 @@ def assertion_2D_withPython(scal, velo, advec, advec_py): advec_py.apply(Simulation(0., 0.01, 1)) advec.apply(Simulation(0., 0.01, 1)) - py_res = scal_d.data.copy() + py_res = scal_d.data[0].copy() scal_d.toHost() advec.finalize() - return np.allclose(py_res, scal_d.data, rtol=5e-02, atol=5e-05) + return np.allclose(py_res, scal_d.data[0], rtol=5e-02, atol=5e-05) def assertion_3D_withPython(scal, velo, advec, advec_py): @@ -58,17 +58,17 @@ def assertion_3D_withPython(scal, velo, advec, advec_py): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray( - np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray( + np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) velo_d.data[0][...] = np.zeros_like( - scal_d.data, + scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) / (2. * scal_d.resolution[0]) velo_d.data[1][...] = np.zeros_like( - scal_d.data, + scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) / (2. * scal_d.resolution[1]) velo_d.data[2][...] = np.zeros_like( - scal_d.data, + scal_d.data[0], dtype=PARMES_REAL_GPU, order=ORDER) / (2. * scal_d.resolution[2]) scal_d.toDevice() velo_d.toDevice() @@ -76,11 +76,11 @@ def assertion_3D_withPython(scal, velo, advec, advec_py): advec_py.apply(Simulation(0., 0.01, 1)) advec.apply(Simulation(0., 0.01, 1)) - py_res = scal_d.data.copy() + py_res = scal_d.data[0].copy() scal_d.toHost() advec.finalize() - return np.allclose(py_res, scal_d.data, rtol=5e-02, atol=5e-05) + return np.allclose(py_res, scal_d.data[0], rtol=5e-02, atol=5e-05) # M6 testing @@ -635,7 +635,7 @@ def test_rectangular_domain2D(): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) velo_d.data[0][...] = np.asarray( np.random.random(velo_d.data[0].shape), @@ -649,10 +649,10 @@ def test_rectangular_domain2D(): advec_py.apply(Simulation(0., 0.01, 1)) advec.apply(Simulation(0., 0.01, 1)) - py_res = scal_d.data.copy() + py_res = scal_d.data[0].copy() scal_d.toHost() - assert np.allclose(py_res, scal_d.data, rtol=5e-02, atol=5e-05) + assert np.allclose(py_res, scal_d.data[0], rtol=5e-02, atol=5e-05) advec.finalize() @@ -677,7 +677,7 @@ def test_rectangular_domain3D(): scal_d = scal.discreteFields.values()[0] velo_d = velo.discreteFields.values()[0] - scal_d.data[...] = np.asarray(np.random.random(scal_d.data.shape), + scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) velo_d.data[0][...] = np.asarray( np.random.random(velo_d.data[0].shape), @@ -694,10 +694,10 @@ def test_rectangular_domain3D(): advec_py.apply(Simulation(0., 0.01, 1)) advec.apply(Simulation(0., 0.01, 1)) - py_res = scal_d.data.copy() + py_res = scal_d.data[0].copy() scal_d.toHost() - assert np.allclose(py_res, scal_d.data, rtol=5e-02, atol=5e-05) + assert np.allclose(py_res, scal_d.data[0], rtol=5e-02, atol=5e-05) advec.finalize() @@ -724,7 +724,7 @@ def test_vector_2D(): velo_d = velo.discreteFields.values()[0] scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL_GPU, order=ORDER) - scal_d.data[1][...] = np.asarray(np.random.random(scal_d.data[0].shape), + scal_d.data[1][...] = np.asarray(np.random.random(scal_d.data[1].shape), dtype=PARMES_REAL_GPU, order=ORDER) velo_d.data[0][...] = np.asarray( np.random.random(velo_d.data[0].shape), @@ -742,6 +742,7 @@ def test_vector_2D(): py_res_Y = scal_d.data[1].copy() scal_d.toHost() + print np.max(py_res_X- scal_d.data[0]), np.max(py_res_Y- scal_d.data[1]) assert np.allclose(py_res_X, scal_d.data[0], rtol=5e-02, atol=5e-05) assert np.allclose(py_res_Y, scal_d.data[1], rtol=5e-02, atol=5e-05) advec.finalize() diff --git a/HySoP/hysop/gpu/tests/test_transposition.py b/HySoP/hysop/gpu/tests/test_transposition.py index 3552c3dd8468022c705ae0fc849e77186964fb43..0e93f41a502f5720e2f1c56eb93d15e369c30db6 100644 --- a/HySoP/hysop/gpu/tests/test_transposition.py +++ b/HySoP/hysop/gpu/tests/test_transposition.py @@ -2,14 +2,10 @@ @file parmepy.gpu.tests.test_transposition Testing copy kernels. """ -from parmepy.domain.box import Box -from parmepy.fields.continuous import Field -from parmepy.operator.advection import Advection from parmepy.gpu import PARMES_REAL_GPU, cl -from parmepy.constants import ORDER, np, S_DIR +from parmepy.constants import ORDER, np from parmepy.gpu.tools import get_opencl_environment -from parmepy.gpu.gpu_kernel import KernelListLauncher, KernelLauncher -from parmepy.gpu.gpu_discrete import GPUScalarField +from parmepy.gpu.gpu_kernel import KernelLauncher def test_transposition_xy2D(): @@ -370,6 +366,5 @@ def test_transposition_xz3D_rect(): data_gpu_out2.release() - if __name__ == '__main__': test_transposition_xz3D_rect() diff --git a/HySoP/hysop/mpi/topology.py b/HySoP/hysop/mpi/topology.py index 5f3b56c613f9196d42ecd89d2032c67bfb3e062e..4e6b33a872bcc8c99efa358a70bca535be55a202 100644 --- a/HySoP/hysop/mpi/topology.py +++ b/HySoP/hysop/mpi/topology.py @@ -15,9 +15,9 @@ exchanged between two topologies and how. To get more details try : \code ->>> import parmepy.mpi.topology as topo ->>> help(topo.Cartesian) ->>> help(topo.Bridge) +>> import parmepy.mpi.topology as topo +>> help(topo.Cartesian) +>> help(topo.Bridge) \endcode """ diff --git a/HySoP/hysop/numerics/integrators/runge_kutta2.py b/HySoP/hysop/numerics/integrators/runge_kutta2.py index 80471ce9f434d2efbedea22836f35469e45139be..3bad5042704a303f136f837ea835211f9deb3e64 100755 --- a/HySoP/hysop/numerics/integrators/runge_kutta2.py +++ b/HySoP/hysop/numerics/integrators/runge_kutta2.py @@ -74,14 +74,14 @@ class RK2_scalar(ODESolver): ODESolver.__init__(self, f) self.name = 'RK2' - def __call__(self, f, t, dt, y, res=None, work=None, yprime=None, - **f_kwargs): + def __call__(self, t, dt, y, res=None, + work=None, y_temp=None, + yprime=None, **f_kwargs): """ Integration step for RK2 Method. up = f[t, Y(t,space)]. - @param f : right-hand side. @param t : current time. @param dt : time step. @param y : function Y(t,space). @@ -95,12 +95,14 @@ class RK2_scalar(ODESolver): if work is None: work = np.empty_like(y) if yprime is None: - work[...] = f(t, y, res=work, **f_kwargs) - else: - work[...] = yprime - work[...] *= 0.5 * dt + yprime = np.empty_like(y) + yprime[...] = self.f(t, y, res=yprime, **f_kwargs) + + y_temp[...] = y + 0.5 * dt * yprime + work[...] = self.f(t + 0.5 * dt, y_temp, res=work, **f_kwargs) + res[...] = y - res[...] += dt * f(t + 0.5 * dt, y + work, **f_kwargs) + res[...] += dt * work def __str__(self): """ToString method""" diff --git a/HySoP/hysop/numerics/integrators/runge_kutta4.py b/HySoP/hysop/numerics/integrators/runge_kutta4.py index 679352b67577a378c6ec8324e354b66e0ad7685e..c75c5545ff613d9f6ccd56b8e49d685c8105e3ad 100755 --- a/HySoP/hysop/numerics/integrators/runge_kutta4.py +++ b/HySoP/hysop/numerics/integrators/runge_kutta4.py @@ -29,7 +29,7 @@ class RK4(ODESolver): def __call__(self, f, t, dt, y): """ - Integration step for RK2 Method. + Integration step for RK4 Method. up = f[t, Y(t,space)]. @param f : right-hand side. @@ -73,6 +73,66 @@ class RK4(ODESolver): return "RK4 Method" +class RK4_scalar(ODESolver): + """ + ODESolver implementation for solving an equation system with RK4 method + for scalar quantities. + y'(t) = f(t,y) + + y(t_n+1)= y(t_n) + dt*y'[y(t_n)+dt/2*y'(y(t_n))]. + + """ + def __init__(self, f=None): + """ + Constructor. + + @param f function f(t,y) : Right hand side of the equation to solve. + @param dim : dimensions + """ + ODESolver.__init__(self, f) + self.name = 'RK4' + + def __call__(self, t, dt, y, res=None, work=None, y_temp=None, + yprime=None, **f_kwargs): + + """ + Integration step for RK4 Method. + up = f[t, Y(t,space)]. + + @param t : current time. + @param dt : time step. + @param y : function Y(t,space). + @param res : result (may be allocated when not None). + @param work : allocated working array (list of 3 like y arrays). + @param yprime : y'(t) already computed (avoid 1st call to f) + @param f_kwargs : Parameters for right-hand side computing. + """ + if res is None: + res = np.empty_like(y) + if work is None: + work = [np.empty_like(y) for i in xrange(3)] + if y_temp is None: + y_temp = np.empty_like(y) + if yprime is None: + yprime = np.empty_like(y) + yprime[...] = self.f(t, y, res=yprime, **f_kwargs) + + y_temp[...] = y + 0.5 * dt * yprime + work[0][...] = self.f(t + 0.5 * dt, y_temp, res=work[0], **f_kwargs) + + y_temp[...] = y + 0.5 * dt * work[0] + work[1][...] = self.f(t + 0.5 * dt, y_temp, res=work[1], **f_kwargs) + + y_temp[...] = y + dt * work[1] + work[2][...] = self.f(t + dt, y_temp, res=work[2], **f_kwargs) + res[...] = y + res[...] += (yprime + 2. * work[0] + 2. * work[1] + work[2]) * dt / 6. + + def __str__(self): + """ToString method""" + return "RK4 Method" + + if __name__ == "__main__": print __doc__ print "- Provided class : RK4" diff --git a/HySoP/hysop/numerics/interpolation.py b/HySoP/hysop/numerics/interpolation.py index 246e72f54336b586e0a3c9ae3c09fc169ab6d386..05cbce440352bf478ccd5b9679ce7b15fb6d0cfd 100644 --- a/HySoP/hysop/numerics/interpolation.py +++ b/HySoP/hysop/numerics/interpolation.py @@ -114,7 +114,7 @@ class Linear(NumMethod): else: if i_row is None: - i_row = np.indices(x.shape) + i_row = np.indices(x.shape)[0] floor = res floor[...] = (x - self.origin[self.d]) / self.dx[self.d] i_y[...] = floor diff --git a/HySoP/hysop/numerics/remeshing/remeshing.py b/HySoP/hysop/numerics/remeshing.py similarity index 79% rename from HySoP/hysop/numerics/remeshing/remeshing.py rename to HySoP/hysop/numerics/remeshing.py index ec05f98c21392ab048ae15b35163d275b0205458..ebdc1de694e9adaa1332cf1f85adad45e2d7bad6 100644 --- a/HySoP/hysop/numerics/remeshing/remeshing.py +++ b/HySoP/hysop/numerics/remeshing.py @@ -35,6 +35,30 @@ class Remeshing(NumMethod): lambda y, s: s * ((y * (y * (-3. * y + 4.) + 1.)) / 2.), lambda y, s: s * ((y * y * (y - 1.)) / 2.) ] + elif self.name == 'l2star': + self._shift = 1 + self._weights = [ + lambda y, s: s * ((y * (y * (y * (y * (2. * y - 5.) + 3.) + 1.) - 1.)) / 2.), + lambda y, s: s * ((y * y * (y * (y * (-6. * y + 15.) - 9.) - 2.) + 2.) / 2.), + lambda y, s: s * ((y * (y * (y * (y * (6. * y - 15.) + 9.) + 1.) + 1.)) / 2.), + lambda y, s: s * ((y * y * y * (y * (-2. * y + 5.) - 3.)) / 2.) + ] + elif self.name == 'l2star_c3': + self._shift = 1 + self._weights = [ + lambda y, s: s * ((y * (y * (y * y * (y * (y * (-6. * y + 21.) - 25.) + 10.) + 1.) - 1.)) / 2.), + lambda y, s: s * ((y * y * (y * y * (y * (y * (18. * y - 63.) + 75.) - 30.) - 2.) + 2.) / 2.), + lambda y, s: s * ((y * (y * (y * y * (y * (y * (-18. * y + 63.) - 75.) + 30.) + 1.) + 1.)) / 2.), + lambda y, s: s * ((y * y * y * y * (y * (y * (6. * y - 21.) + 25.) - 10.)) / 2.) + ] + elif self.name == 'l2star_c4': + self._shift = 1 + self._weights = [ + lambda y, s: s * ((y * (y * (y * y * y * (y * (y * (y * (20. * y - 90.) + 154.) - 119.) + 35.) + 1.) - 1.)) / 2.), + lambda y, s: s * ((y * y * (y * y * y * (y * (y * (y * (-60. * y + 270.) - 462.) + 357.) - 105.) - 2.) + 2.) / 2.), + lambda y, s: s * ((y * (y * (y * y * y * (y * (y * (y * (60. * y - 270.) + 462.) - 357.) + 105.) + 1.) + 1.)) / 2.), + lambda y, s: s * ((y * y * y * y * y * (y * (y * (y * (-20. * y + 90.) - 154.) + 119.) - 35.)) / 2.) + ] elif self.name == 'l4star': self._shift = 2 self._weights = [ @@ -126,6 +150,20 @@ class Remeshing(NumMethod): lambda y, s: s * (12. + (4. + (-15. + (-5. + (3. + (1. + (46109. + (-244083. + (544705. + (-655039. + (447062. + (-163982. + 25228. * y) * y) * y) * y) * y) * y) * y) * y) * y) * y) * y) * y) * y / 720., lambda y, s: s * (-6587. + (34869. + (-77815. + (93577. + (-63866. + (23426. - 3604. * y) * y) * y) * y) * y) * y) * y * y * y * y * y * y * y / 720. ] + elif self.name == 'l8star': + self._shift = 4 + self._weights = [ + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (-3569. * y + 16061.) - 27454.) + 21126.) - 6125.) + 49.) - 196.) - 36.) + 144.)) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (32121. * y - 144548.) + 247074.) - 190092.) + 55125.) - 672.) + 2016.) + 512.) - 1536.)) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (-128484. * y + 578188.) - 988256.) + 760312.) - 221060.) + 4732.) - 9464.) - 4032.) + 8064.)) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (299796. * y - 1349096.) + 2305856.) - 1774136.) + 517580.) - 13664.) + 13664.) + 32256.) - 32256.)) / 40320.), + lambda y, s: s * ((y * y * (y * y * (y * (y * (y * (y * (-449694. * y + 2023630.) - 3458700.) + 2661540.) - 778806.) + 19110.) - 57400.) + 40320.) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (449694. * y - 2023616.) + 3458644.) - 2662016.) + 780430.) - 13664.) - 13664.) + 32256.) + 32256.)) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (-299796. * y + 1349068.) - 2305744.) + 1775032.) - 520660.) + 4732.) + 9464.) - 4032.) - 8064.)) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (128484. * y - 578168.) + 988176.) - 760872.) + 223020.) - 672.) - 2016.) + 512.) + 1536.)) / 40320.), + lambda y, s: s * ((y * (y * (y * (y * (y * (y * (y * (y * (-32121. * y + 144541.) - 247046.) + 190246.) - 55685.) + 49.) + 196.) - 36.) - 144.)) / 40320.), + lambda y, s: s * ((y * y * y * y * y * (y * (y * (y * (3569. * y - 16060.) + 27450.) - 21140.) + 6181.)) / 40320.) + ] else: raise ValueError("Unknown formula : " + self.name) diff --git a/HySoP/hysop/numerics/remeshing/__init__.py b/HySoP/hysop/numerics/remeshing/__init__.py deleted file mode 100644 index 3ae374d3bc7dc977e39288ab8c6dd0930c993659..0000000000000000000000000000000000000000 --- a/HySoP/hysop/numerics/remeshing/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -@package parmepy.fields - -Everything concerning Fields. -""" diff --git a/HySoP/hysop/operator/advection.py b/HySoP/hysop/operator/advection.py index da807c94ed7f3fec8a03e3b1dc483c350657dcff..c5a3e3dfec040f79314252087a9d5e3c3b5721a3 100644 --- a/HySoP/hysop/operator/advection.py +++ b/HySoP/hysop/operator/advection.py @@ -238,11 +238,12 @@ class Advection(Operator): ).discretize(advectedFields[0].topology) self.discreteOperator = [ - ParticleAdvection(self.discreteFields[self.velocity], - advectedFields, d, method=self.method, - part_position=particles_positions, - part_advectedFields=particles_advectedFields, - **self.config) + ParticleAdvection( + self.discreteFields[self.velocity], + advectedFields, d, method=self.method, + part_position=particles_positions, + part_advectedFields=particles_advectedFields, + **self.config) for d in xrange(self.velocity.dimension)] # -- Final set up -- for dop in self.discreteOperator: diff --git a/HySoP/hysop/operator/discrete/particle_advection.py b/HySoP/hysop/operator/discrete/particle_advection.py index 8ba2b482790f0fd593749fed8077e391e9dd4585..2f9cb3b9c59ce4ae5fac16c00562579292cdaa61 100644 --- a/HySoP/hysop/operator/discrete/particle_advection.py +++ b/HySoP/hysop/operator/discrete/particle_advection.py @@ -8,7 +8,6 @@ from parmepy.constants import debug, np, S_DIR from discrete import DiscreteOperator from parmepy.fields.continuous import Field from parmepy.numerics.interpolation import Linear -#from parmepy.numerics.splitting import Splitting from parmepy.tools.timers import timed_function @@ -54,9 +53,10 @@ class ParticleAdvection(DiscreteOperator): self.part_advectedFields = part_advectedFields self.num_method = None self.available_methods = [ - 'm4prime', 'm8prime', + 'l2star_c4', 'l2star_c3', 'l2star_c2', 'm4prime', 'l4star_c4', 'l4star_c3', 'l4star', - 'l6star_c6', 'l6star_c5', 'l6star_c4','l6star'] + 'l6star_c6', 'l6star_c5', 'l6star_c4', 'l6star', 'm8prime', + 'l8star'] @debug def setUp(self): @@ -73,34 +73,43 @@ class ParticleAdvection(DiscreteOperator): Field(adF.topology.domain, "Particle_AdvectedFields", isVector=adF.isVector).discretize(adF.topology) for adF in self.advectedFields] - if self.method.find('rk2') >= 0: - self.name += '_rk2' - from parmepy.numerics.integrators.runge_kutta2 import RK2_scalar - self.num_advec = RK2_scalar() + + self.interpolation = \ + Linear(self.velocity, self.dir, + self.advectedFields[0].topology.mesh.space_step, + self.advectedFields[0].topology.mesh.origin) + + if self.method.find('rk4') >= 0: + self.name += '_rk4' + from parmepy.numerics.integrators.runge_kutta4 \ + import RK4_scalar as RKn else: self.name += '_rk2' - from parmepy.numerics.integrators.runge_kutta2 import RK2_scalar - self.num_advec = RK2_scalar() + from parmepy.numerics.integrators.runge_kutta2 \ + import RK2_scalar as RKn + self.num_advec = RKn(self.interpolation) + remesh = 'l4star' for m in self.available_methods: if self.method.find(m) >= 0: remesh = m break self.name += '_' + remesh - from parmepy.numerics.remeshing.remeshing import Remeshing + from parmepy.numerics.remeshing import Remeshing self.num_remesh = Remeshing( self.velocity.dimension, self.advectedFields[0].topology.mesh.space_step, self.advectedFields[0].topology.mesh.origin, remesh) - self.interpolation = [ - Linear(self.velocity, d, - self.advectedFields[0].topology.mesh.space_step, - self.advectedFields[0].topology.mesh.origin) - for d in xrange(self.velocity.dimension)] - # integration work variable - self.work = np.empty_like(self.part_position.data[0]) + # integration work variable (interpolated velocities) + if self.method.find('rk4') >= 0: + self.work = [np.empty_like(self.part_position.data[0]) + for i in xrange(3)] + else: + self.work = np.empty_like(self.part_position.data[0]) + # integration work variable (temporary positions) + self.y_temp = np.empty_like(self.part_position.data[0]) # interpolation work variable (distance to nearest grid point) self.i_y = np.empty_like(self.part_position.data[0]) # interpolation work variables (indices arrays) @@ -149,10 +158,11 @@ class ParticleAdvection(DiscreteOperator): self.part_position.topology.mesh.coords[self.dir] # Advect particles + velocity = self.velocity.data[self.dir] if self.velocity.isVector \ + else self.velocity.data self.num_advec( - self.interpolation[self.dir], t, dt, self.part_position.data[0], res=self.part_position.data[0], - work=self.work, yprime=self.velocity.data[self.dir], + work=self.work, y_temp=self.y_temp, yprime=velocity, i_row=self.i_row, i_col=self.i_col, i_dep=self.i_dep, i_y=self.i_y) diff --git a/HySoP/hysop/operator/tests/test_advec_scales.py b/HySoP/hysop/operator/tests/test_advec_scales.py index edbb5edadc790791a27ef8a932e9fbefd1b48e4e..e2a58695815dfd4079ed48955667d82020871311 100755 --- a/HySoP/hysop/operator/tests/test_advec_scales.py +++ b/HySoP/hysop/operator/tests/test_advec_scales.py @@ -41,7 +41,7 @@ def test_nullVelocity_m4(): advec.apply(Simulation(0., 0.1, 1)) advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0], rtol=1e-05, atol=5e-05) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) def test_nullVelocity_vec_m4(): @@ -84,12 +84,9 @@ def test_nullVelocity_vec_m4(): advec.apply(Simulation(0., 0.1, 1)) advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0], - rtol=1e-05, atol=5e-05) and \ - np.allclose(scal_ref_d.data[1], scal_d.data[1], - rtol=1e-05, atol=5e-05) and \ - np.allclose(scal_ref_d.data[2], scal_d.data[2], - rtol=1e-05, atol=5e-05) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) + assert np.allclose(scal_ref_d.data[1], scal_d.data[1]) + assert np.allclose(scal_ref_d.data[2], scal_d.data[2]) def test_nullVelocity_m6(): @@ -107,7 +104,7 @@ def test_nullVelocity_m6(): scal_d = scal.discreteFields.values()[0] scal_d.data[0][...] = np.asarray(np.random.random(scal_d.data[0].shape), - dtype=PARMES_REAL, order=ORDER) + dtype=PARMES_REAL, order=ORDER) scal_init = scal_d.data[0].copy() advec.apply(Simulation(0., 0.1, 1)) @@ -144,9 +141,9 @@ def test_nullVelocity_vec_m6(): advec.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_init0, scal_d.data[0]) and \ - np.allclose(scal_init1, scal_d.data[1]) and \ - np.allclose(scal_init2, scal_d.data[2]) + assert np.allclose(scal_init0, scal_d.data[0]) + assert np.allclose(scal_init1, scal_d.data[1]) + assert np.allclose(scal_init2, scal_d.data[2]) def test_nullVelocity_m8(): @@ -181,7 +178,7 @@ def test_nullVelocity_m8(): advec.apply(Simulation(0., 0.1, 1)) advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0], rtol=1e-05, atol=5e-05) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) def test_nullVelocity_vec_m8(): @@ -224,24 +221,17 @@ def test_nullVelocity_vec_m8(): advec.apply(Simulation(0., 0.075, 1)) advec_py.apply(Simulation(0., 0.075, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) and \ - np.allclose(scal_ref_d.data[1], scal_d.data[1]) and \ - np.allclose(scal_ref_d.data[2], scal_d.data[2]) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) + assert np.allclose(scal_ref_d.data[1], scal_d.data[1]) + assert np.allclose(scal_ref_d.data[2], scal_d.data[2]) -def test_randomVelocity_m4(): +def _randomVelocity_m4(): """Basic test with random velocity. Using M4prime""" box = Box(3, length=[1., 1., 1.], origin=[0., 0., 0.]) scal = Field(domain=box, name='Scalar') scal_ref = Field(domain=box, name='Scalar_ref') - dx = 1. / 17. - - def v(x, y, z): - vx = dx * int(x / (1. * dx)) - vy = 0.5 * dx * int(y / (1. * dx)) - vz = 0.25 * dx * int(z / (1. * dx)) - return vx, vy, vz - velo = Field(domain=box, name='Velocity', isVector=True, formula=v) + velo = Field(domain=box, name='Velocity', isVector=True) advec = Advection(velo, scal, resolutions={velo: [17, 17, 17], scal: [17, 17, 17]}, @@ -255,24 +245,32 @@ def test_randomVelocity_m4(): ) advec.setUp() advec_py.setUp() - velo.initialize() scal_d = scal.discreteFields.values()[0] scal_ref_d = scal_ref.discreteFields.values()[0] + velo_d = velo.discreteFields.values()[0] scal_d.data[0][...] = np.asarray( np.random.random(scal_d.data[0].shape), dtype=PARMES_REAL, order=ORDER) scal_ref_d.data[0][...] = scal_d.data[0][...] + velo_d.data[0][...] = np.asarray( + np.random.random(scal_d.data[0].shape), + dtype=PARMES_REAL, order=ORDER) / (2. * scal_d.resolution[0]) + velo_d.data[1][...] = np.asarray( + np.random.random(scal_d.data[0].shape), + dtype=PARMES_REAL, order=ORDER) / (2. * scal_d.resolution[1]) + velo_d.data[2][...] = np.asarray( + np.random.random(scal_d.data[0].shape), + dtype=PARMES_REAL, order=ORDER) / (2. * scal_d.resolution[1]) - advec.apply(Simulation(0., 1., 1)) - advec_py.apply(Simulation(0., 1., 1)) + advec.apply(Simulation(0., 0.1, 1)) + advec_py.apply(Simulation(0., 0.1, 1)) assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) - assert False -def test_randomVelocity_vec_m4(): +def _randomVelocity_vec_m4(): """Basic test with random velocity vector Field. Using M4prime""" box = Box(3, length=[1., 1., 1.], origin=[0., 0., 0.]) scal = Field(domain=box, name='Scalar', isVector=True) @@ -318,15 +316,12 @@ def test_randomVelocity_vec_m4(): np.random.random(scal_d.data[2].shape), dtype=PARMES_REAL, order=ORDER) / (2. * scal_d.resolution[1]) - advec.apply(Simulation(0., 0.01, 1)) - advec_py.apply(Simulation(0., 0.01, 1)) + advec.apply(Simulation(0., 0.1, 1)) + advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0], - rtol=1e-05, atol=5e-05) and \ - np.allclose(scal_ref_d.data[1], scal_d.data[1], - rtol=1e-05, atol=5e-05) and \ - np.allclose(scal_ref_d.data[2], scal_d.data[2], - rtol=1e-05, atol=5e-05) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) + assert np.allclose(scal_ref_d.data[1], scal_d.data[1]) + assert np.allclose(scal_ref_d.data[2], scal_d.data[2]) def test_randomVelocity_m6(): @@ -422,9 +417,9 @@ def test_randomVelocity_vec_m6(): advec.apply(Simulation(0., 0.1, 1)) advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) and \ - np.allclose(scal_ref_d.data[1], scal_d.data[1]) and \ - np.allclose(scal_ref_d.data[2], scal_d.data[2]) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) + assert np.allclose(scal_ref_d.data[1], scal_d.data[1]) + assert np.allclose(scal_ref_d.data[2], scal_d.data[2]) def test_randomVelocity_m8(): @@ -468,7 +463,7 @@ def test_randomVelocity_m8(): advec.apply(Simulation(0., 0.1, 1)) advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) + assert np.allclose(scal_ref_d.data[0], scal_d.data[0], atol=1e-07) def test_randomVelocity_vec_m8(): @@ -520,22 +515,22 @@ def test_randomVelocity_vec_m8(): advec.apply(Simulation(0., 0.1, 1)) advec_py.apply(Simulation(0., 0.1, 1)) - assert np.allclose(scal_ref_d.data[0], scal_d.data[0]) and \ - np.allclose(scal_ref_d.data[1], scal_d.data[1]) and \ - np.allclose(scal_ref_d.data[2], scal_d.data[2]) - - -if __name__ == '__main__': - test_nullVelocity_m4() - test_nullVelocity_m4() - test_nullVelocity_m6() - test_nullVelocity_m8() - test_nullVelocity_vec_m4() - test_nullVelocity_vec_m6() - test_nullVelocity_vec_m8() - test_randomVelocity_m4() - test_randomVelocity_m6() - test_randomVelocity_m8() - test_randomVelocity_vec_m4() - test_randomVelocity_vec_m6() - test_randomVelocity_vec_m8() + assert np.allclose(scal_ref_d.data[0], scal_d.data[0], atol=1e-07) + assert np.allclose(scal_ref_d.data[1], scal_d.data[1], atol=1e-07) + assert np.allclose(scal_ref_d.data[2], scal_d.data[2], atol=1e-07) + + +# if __name__ == '__main__': +# test_nullVelocity_m4() +# test_nullVelocity_m4() +# test_nullVelocity_m6() +# test_nullVelocity_m8() +# test_nullVelocity_vec_m4() +# test_nullVelocity_vec_m6() +# test_nullVelocity_vec_m8() +# test_randomVelocity_m4() +# test_randomVelocity_m6() +# test_randomVelocity_m8() +# test_randomVelocity_vec_m4() +# test_randomVelocity_vec_m6() +# test_randomVelocity_vec_m8() diff --git a/HySoP/hysop/problem/simulation.py b/HySoP/hysop/problem/simulation.py index b223b9d93b74b23148721f95afd54f2fdb93cb19..35cdb9f42c81492855caa28eff30a5a1a01e6cbd 100644 --- a/HySoP/hysop/problem/simulation.py +++ b/HySoP/hysop/problem/simulation.py @@ -34,21 +34,32 @@ class Simulation(object): self.timeStep = timeStep ## Maximum number of iterations self.iterMax = iterMax + self._lastStep = False def advance(self): """ - Proceed to next time + Proceed to next time. + Compute last timeStep of the simulation to reach end + (with a tolerance of 1e-8). """ - if self.currentIteration < self.iterMax: - self.currentIteration += 1 - self.time += self.timeStep - # adjust last timestep to reach self.end - if self.end - self.time < self.timeStep and self.end > self.time: - self.timeStep = self.end - self.time - else: - self.isOver = True - if self.time >= self.end: + tol = 1e-10 + if self._lastStep: + # The timestep was adjusted to reach end in the previous call + # So now the simulation is over self.isOver = True + else: + if self.currentIteration < self.iterMax: + self.currentIteration += 1 + self.time += self.timeStep + # adjust last timestep to reach self.end + if self.end - self.time < self.timeStep and\ + abs(self.time - self.end) < tol: + self.timeStep = self.end - self.time + self._lastStep = True + else: + self.isOver = True + if abs(self.time - self.end) < tol: + self.isOver = True def printState(self): """ diff --git a/HySoP/hysop/tools/remeshing_formula_parsing.py b/HySoP/hysop/tools/remeshing_formula_parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..19a3215e279df0d0b392298f868badc46f4e0895 --- /dev/null +++ b/HySoP/hysop/tools/remeshing_formula_parsing.py @@ -0,0 +1,218 @@ +""" +@file remeshing_formula_parsing.py + +Functions to parse some remeshing formula code (given as strings from Maple +or Sympy for instance). Result is a formula usable in the +parmepy.numerics.remeshing module or in OpenCL code in +parmepy/gpu/cl_src/remeshing/weights* +""" +import re +import sympy as sp + +# Weights names +weights_names = ['alpha', + 'beta', + 'gamma', + 'delta', + 'eta', + 'zeta', + 'theta', + 'iota', + 'kappa', + 'mu'] + + +def parse(f, fac=1, vec=False, toOpenCL=True, CLBuiltins=False, keep=False): + """ + Parsing function. + @param f : functions to parse as string + @param fac : numeric factor for all formulas + @param vec : is OpenCL output is generated to use vector builtin types + @param toOpenCL : is OpenCL output + @param CLBuiltins : is OpenCL output uses fma builtin function + @param keep : low parsing + """ + assert not (vec and not toOpenCL), "Vector works only in OpenCL parsing" + assert not (CLBuiltins and not toOpenCL),\ + "CLBuiltins only in OpenCL parsing" + t = "float__N__" if vec else "float" + cteEnd = ".0" if toOpenCL else "." + res = "" + # Split each line + fl = f.split('\n') + # sympy formulas + y = sp.symbols('y') + sw = [None] * f.count(';') + i = 0 + for wl in fl: + if len(wl) > 2: + # replace pow + power = re.search('pow\(y, ([0-9]+)\)', wl) + if power is not None: + np = "y" + "*y" * (int(power.group(1)) - 1) + wl = wl.replace(power.group(0), np) + sw[i] = '(' + sw[i] += str(sp.horner(eval(wl.split(';')[0].split('=')[1]) * fac)) + sw[i] += ')/' + str(fac) + i += 1 + for i, s in enumerate(sw): + if not keep: + if toOpenCL: + res += "inline " + t + " " + res += weights_names[i] + "(" + t + " y){\n" + res += ' return ' + else: + res += 'lambda y, s: s * ' + res += '(' + # replace y**n + power = re.findall('y\*\*[0-9]+', s) + if power is not None: + for pw in power: + n = int(pw.split('**')[1]) + npower = 'y' + "*y" * (n - 1) + s = s.replace(pw, npower) + s = s.replace(' ', '') + if CLBuiltins: + s = createFMA(s) + # From integers to floats + s = re.sub(r"(?P<id>[0-9]+)", r"\g<id>" + cteEnd, s) + s = s.replace('*', ' * ') + s = s.replace('/', ' / ') + s = s.replace('+', ' + ') + s = s.replace('-', ' - ') + s = s.replace('( - ', '(-') + s = s.replace(' ', ' ') + s = s.replace(", - ", ", -") + res += s + ')' + if toOpenCL: + res += ";}" + if i < len(sw) - 1: + res += "\n" + else: + res += "w[{0}] = ".format(i) + # replace y**n + power = re.findall('y\*\*[0-9]+', s) + if power is not None: + for pw in power: + n = int(pw.split('**')[1]) + npower = 'y' + "*y" * (n - 1) + s = s.replace(pw, npower) + # From integers to floats + s = re.sub(r"(?P<id>[0-9]+)", r"\g<id>.", s) + s = s.replace('*', ' * ') + s = s.replace('/', ' / ') + s = s.replace('+', ' + ') + s = s.replace('-', ' - ') + s = s.replace('( - ', '(-') + s = s.replace(' ', ' ') + s = s.replace(", - ", ", -") + res += s + "\n" + return res + + +def createFMA(s): + """ + Function to handle fma replacements in formula. + @param s : formula to parse + + \code + >>> createFMA("(y)") + 'fma(y, 1, 1)' + >>> createFMA("(2*y)") + 'fma(y, 2, 1)' + >>> createFMA("(y+11)") + 'fma(y, 1, 11)' + >>> createFMA("(y+11)") + 'fma(y, 1, 11)' + >>> createFMA("(y-11)") + 'fma(y, 1, -11)' + >>> createFMA("(-y+11)") + 'fma(y, -1, 11)' + >>> createFMA("(-y-11)") + 'fma(y, -1, -11)' + >>> createFMA("(-22*y+11)") + 'fma(y, -22, 11)' + >>> createFMA("(22*y-11)") + 'fma(y, 22, -11)' + >>> createFMA("fma(y, 22, -11)") + 'fma(y, 22, -11)' + >>> createFMA("(y*fma(y, 22, -11)+4)") + 'fma(y, fma(y, 22, -11), 4)' + >>> createFMA("(y*fma(y, 22, -11)-4)") + 'fma(y, fma(y, 22, -11), -4)' + >>> createFMA("(y*y*y*fma(y, 22, -11)+4)") + 'fma(y*y*y, fma(y, 22, -11), 4)' + + \endcode + """ + def fma_replace(m): + """ + Regexp callback function to replace a * y + c by fma(y, a, c). + Matching regexp is (groups are given below): + \code + "\(((-)?(([0-9]+)\*)?)y(([+-])([0-9]+))?\)" + |2| |--4---| |--6-||--7---| + |-----3----| |-------5------| + |--------1--------| + \endcode + """ + s = "fma(y, " + if not m.group(1) is None and not m.group(1) == '': + # There is a '(-)?(a*)?' + if not m.group(2) is None: + # There is a '(-)' else '+' is not present and not needed + s += m.group(2) + if not m.group(3) is None: + # There is a '(a*)' else 'a' is 1 + s += m.group(4) + else: + s += '1' + else: + s += '1' + s += ', ' + if not m.group(5) is None: + # There is a '(+-)(c)' else 'c' is 1 + if m.group(6) == '-': + # There is a '-' else '+' is obmited + s += m.group(6) + s += m.group(7) + else: + s += '1' + s += ')' + return s + + def fma_recurse(m): + """ + Regexp callback function to replace (y*fma(...)+c) by + fma(y, fma(...), c) where '(' and ')' are well balanced. + Matching regexp is (groups are given below): + \code + "\(([y\*]*y)\*(fma\(.*\))([+-])([0-9]+)\)" + |---1---| |----2----||-3--||--4---| + \endcode + """ + assert m.group(0).count('(') <= m.group(0).count(')'), \ + "Matching is too short to get ([fma(,,)]+): " + m.group(0) + tmp = "" + # get the part of the mathing that have the same number of '(' and ')' + for t in m.group(0).split(')')[:m.group(0).count('(')]: + tmp += t + ')' + #performs the same regexp + tmpg = re.search(r"\(([y\*]*y)\*(fma\(.*\))([+-])([0-9]+)\)", tmp) + s = "fma(" + tmpg.group(1) + ", " + tmpg.group(2) + ', ' + if tmpg.group(3) == '-': + s += tmpg.group(3) + s += tmpg.group(4) + s += ')' + return m.group(0).replace(tmp, s) + + s = re.sub(r"\(((-)?(([0-9]+)\*)?)y(([+-])([0-9]+))?\)", fma_replace, s) + l = len(re.findall(r'fma\(', s)) + recurse = True + while recurse: + s = re.sub(r"\(([y\*]*y)\*(fma\(.*\))([+-])([0-9]+)\)", fma_recurse, s) + ll = len(re.findall(r'fma\(', s)) + if l == ll: + recurse = False + l = ll + return s diff --git a/HySoP/hysop/tools/tests/test_formula_parsing.py b/HySoP/hysop/tools/tests/test_formula_parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..2ae71327f309f326ed15ec1a7a3dda45f70f1ec6 --- /dev/null +++ b/HySoP/hysop/tools/tests/test_formula_parsing.py @@ -0,0 +1,80 @@ +from parmepy.tools.remeshing_formula_parsing import parse, createFMA + +m4p = """ +w[0] = (-1 + (2 - y) * y) * y / 2; +w[1] = 1 + (-5 + 3 * y) * y * y / 2; +w[2] = (1 + (4 - 3 * y) * y) * y / 2; +w[3] = (y - 1) * y * y / 2; +""" + +m4p_python = """lambda y, s: s * ((y * (y * (-y + 2.) - 1.)) / 2.) +lambda y, s: s * ((y * y * (3. * y - 5.) + 2.) / 2.) +lambda y, s: s * ((y * (y * (-3. * y + 4.) + 1.)) / 2.) +lambda y, s: s * ((y * y * (y - 1.)) / 2.)""" + +m4p_cl_novec = """inline float alpha(float y){ + return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} +inline float beta(float y){ + return ((y * y * (3.0 * y - 5.0) + 2.0) / 2.0);} +inline float gamma(float y){ + return ((y * (y * (-3.0 * y + 4.0) + 1.0)) / 2.0);} +inline float delta(float y){ + return ((y * y * (y - 1.0)) / 2.0);}""" + +m4p_cl = """inline float__N__ alpha(float__N__ y){ + return ((y * (y * (-y + 2.0) - 1.0)) / 2.0);} +inline float__N__ beta(float__N__ y){ + return ((y * y * (3.0 * y - 5.0) + 2.0) / 2.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * (y * (-3.0 * y + 4.0) + 1.0)) / 2.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * (y - 1.0)) / 2.0);}""" + +m4p_cl_builtin = """inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, -1.0, 2.0), -1.0)) / 2.0);} +inline float__N__ beta(float__N__ y){ + return (fma(y * y, fma(y, 3.0, -5.0), 2.0) / 2.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y, -3.0, 4.0), 1.0)) / 2.0);} +inline float__N__ delta(float__N__ y){ + return ((y * y * fma(y, 1.0, -1.0)) / 2.0);}""" + +l6_cl_builtin = """inline float__N__ alpha(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -89.0, 312.0), -370.0), 140.0), 15.0), 4.0), -12.0)) / 720.0);} +inline float__N__ beta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 623.0, -2183.0), 2581.0), -955.0), -120.0), -54.0), 108.0)) / 720.0);} +inline float__N__ gamma(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -1869.0, 6546.0), -7722.0), 2850.0), 195.0), 540.0), -540.0)) / 720.0);} +inline float__N__ delta(float__N__ y){ + return (fma(y * y, fma(y * y, fma(y, fma(y, fma(y, 3115.0, -10905.0), 12845.0), -4795.0), -980.0), 720.0) / 720.0);} +inline float__N__ eta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -3115.0, 10900.0), -12830.0), 4880.0), -195.0), 540.0), 540.0)) / 720.0);} +inline float__N__ zeta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, 1869.0, -6537.0), 7695.0), -2985.0), 120.0), -54.0), -108.0)) / 720.0);} +inline float__N__ theta(float__N__ y){ + return ((y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, -623.0, 2178.0), -2566.0), 1010.0), -15.0), 4.0), 12.0)) / 720.0);} +inline float__N__ iota(float__N__ y){ + return ((y * y * y * y * fma(y, fma(y, fma(y, 89.0, -311.0), 367.0), -145.0)) / 720.0);}""" + +l63="""w[0] = (-12 + (4 + (15 + (140 + (-370 + (312 - 89 * y) * y) * y) * y) * y) * y) * y / 720; +w[1] = (108 + (-54 + (-120 + (-955 + (2581 + (-2183 + 623 * y) * y) * y) * y) * y) * y) * y / 720; +w[2] = (-540 + (540 + (195 + (2850 + (-7722 + (6546 - 1869 * y) * y) * y) * y) * y) * y) * y / 720; +w[3] = 1 + (-980 + (-4795 + (12845 + (-10905 + 3115 * y) * y) * y) * y * y) * y * y / 720; +w[4] = (540 + (540 + (-195 + (4880 + (-12830 + (10900 - 3115 * y) * y) * y) * y) * y) * y) * y / 720; +w[5] = (-108 + (-54 + (120 + (-2985 + (7695 + (-6537 + 1869 * y) * y) * y) * y) * y) * y) * y / 720; +w[6] = (12 + (4 + (-15 + (1010 + (-2566 + (2178 - 623 * y) * y) * y) * y) * y) * y) * y / 720; +w[7] = (-145 + (367 + (-311 + 89 * y) * y) * y) * pow(y, 4) / 720; +""" + +def test_parsing_toPython(): + assert m4p_python == parse(m4p, 2, toOpenCL=False) + + +def test_parsing_toOpenCL(): + assert m4p_cl_novec == parse(m4p, 2, vec=False, toOpenCL=True) + assert m4p_cl == parse(m4p, 2, vec=True, toOpenCL=True) + assert m4p_cl_builtin == \ + parse(m4p, 2, vec=True, toOpenCL=True, CLBuiltins=True) + assert l6_cl_builtin == parse(l63, 720, + vec=True, toOpenCL=True, CLBuiltins=True) + diff --git a/HySoP/hysop/tools/tests/test_timers.py b/HySoP/hysop/tools/tests/test_timers.py index 460ccbfeae0b2fd2f4ccfe728d9b5204351f06f3..3be2279ddf5f33308599ff4fd3c7f19c5ef5206a 100644 --- a/HySoP/hysop/tools/tests/test_timers.py +++ b/HySoP/hysop/tools/tests/test_timers.py @@ -26,18 +26,18 @@ def test_timer_from_decorator(): assert len(a.timer.f_timers.keys()) == 1 fun1 = a.timer.f_timers.keys()[0] assert a.n == 1 # the call function have been called - assert len(a.timer.f_timers[fun1].times) == 1 - assert a.timer.f_timers[fun1].t == a.timer.f_timers[fun1].times[0] + assert a.timer.f_timers[fun1].ncalls == 1 + #assert a.timer.f_timers[fun1].t == a.timer.f_timers[fun1].times[0] a.call() a.call_other() assert len(a.timer.f_timers.keys()) == 2 fun2 = [f for f in a.timer.f_timers.keys() if f != fun1][0] assert a.n == 12 # the call and call_other functions have been called - assert len(a.timer.f_timers[fun1].times) == 2 - assert a.timer.f_timers[fun1].t == \ - a.timer.f_timers[fun1].times[0] + a.timer.f_timers[fun1].times[1] - assert len(a.timer.f_timers[fun2].times) == 1 - assert a.timer.f_timers[fun2].t == a.timer.f_timers[fun2].times[0] + assert a.timer.f_timers[fun1].ncalls == 2 + #assert a.timer.f_timers[fun1].t == \ + # a.timer.f_timers[fun1].times[0] + a.timer.f_timers[fun1].times[1] + assert a.timer.f_timers[fun2].ncalls == 1 + #assert a.timer.f_timers[fun2].t == a.timer.f_timers[fun2].times[0] if __name__ == '__main__': diff --git a/HySoP/hysop/tools/timers.py b/HySoP/hysop/tools/timers.py index f92992c4e168ab15c91538718eeef373f253dd66..a3861eb69b066542f618aa181b63a7e2cd746ea2 100644 --- a/HySoP/hysop/tools/timers.py +++ b/HySoP/hysop/tools/timers.py @@ -20,6 +20,8 @@ def timed_function(f): f_timer = args[0].timer.getFunctionTimer(f) res = f_timer(f)(*args, **kwargs) return res + # def wrapper(*args, **kwargs): + # return f(*args, **kwargs) return wrapper @@ -41,8 +43,10 @@ class FunctionTimer(object): self.func_name = func.func_name ## Total time spent in the function (in seconds). self.t = 0. - ## Times per calls - self.times = [] + # ## Times per calls + # self.times = [] + ## Calls number + self.ncalls = 0 def __call__(self, func): """ @@ -57,7 +61,8 @@ class FunctionTimer(object): res = func(*args, **kwargs) t = MPI.Wtime() - t0 self.t += t - self.times.append(t) + self.ncalls += 1 + #self.times.append(t) if __VERBOSE__: print args[0].name, ' -- ', self.func_name, " : ", t, 's' return res @@ -65,7 +70,7 @@ class FunctionTimer(object): def __str__(self): s = "[" + str(main_rank) + "] |-- " + self.func_name - s += ' = ' + str(self.t) + " s ({0} calls)".format(len(self.times)) + s += ' = ' + str(self.t) + " s ({0} calls)".format(self.ncalls) return s @@ -84,7 +89,8 @@ class ManualFunctionTimer(FunctionTimer): def append_time(self, t): """Manual computational time append""" self.t += t - self.times.append(t) + self.ncalls += 1 + #self.times.append(t) if __VERBOSE__: print self.func_name, " : ", t, 's' @@ -146,6 +152,7 @@ class Timer(object): """ Copute a summary of the different functions referenced herein. """ + self.t = 0. for f in self.f_timers.keys(): self.t += self.f_timers[f].t diff --git a/HySoP/setup.py.in b/HySoP/setup.py.in index 34ab925a5b6e0d8401609e7c7471126ea6616598..436340c16751d68a4dcc180e4fd3ac850c6a9ee9 100644 --- a/HySoP/setup.py.in +++ b/HySoP/setup.py.in @@ -23,7 +23,6 @@ packages = ['parmepy', 'parmepy.tools', 'parmepy.numerics', 'parmepy.numerics.integrators', - 'parmepy.numerics.remeshing', ] packages_for_tests = ['parmepy.domain.tests',