diff --git a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl index 3c51049ec0607176860e0310fe6c6b552f77145d..9f0436f7b562335cd172c089551b8d65d07cb5cb 100644 --- a/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl +++ b/HySoP/hysop/gpu/cl_src/kernels/fine_to_coarse_filter.cl @@ -13,9 +13,9 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in, // Data in the fine grid are read only once for the whole computation. // Because of the stencil, these data are spread over multiple coarse grid cells -> we need a global memory synchronization. // The global synchronization is obtained by several kernel launch with an offset - unsigned int lid = get_local_id(0); // [0;31] - unsigned int gid_y = get_global_id(1); // [0:127] - unsigned int gid_z = get_global_id(2); // [0:127] + unsigned int lid = get_local_id(0); + unsigned int gid_y = get_global_id(1); + unsigned int gid_z = get_global_id(2); unsigned int iy_c = gid_y*L_STENCIL+offset_y; unsigned int iz_c = gid_z*L_STENCIL+offset_z; unsigned int iy_f = iy_c*PTS_PER_CELL_Y; @@ -152,17 +152,16 @@ __kernel void coarse_to_fine_filter(__global const float* scal_in, p_res[3][3][3] += scale_factor * wx.w * wy.w * wz.w * line[lid*PTS_PER_CELL_X+pt_x]; #endif - - barrier(CLK_LOCAL_MEM_FENCE); } } } // Store the registers results in local memory for (pt_z=0;pt_z<L_STENCIL;pt_z++) for (pt_y=0;pt_y<L_STENCIL;pt_y++) - for (pt_x=0;pt_x<L_STENCIL;pt_x++) + for (pt_x=0;pt_x<L_STENCIL;pt_x++) { result[GHOSTS_OUT_X+b_id*WG+lid-SHIFT_STENCIL+pt_x][pt_y][pt_z] += p_res[pt_x][pt_y][pt_z]; - + barrier(CLK_LOCAL_MEM_FENCE); + } } // Write result in output array diff --git a/HySoP/hysop/gpu/config_k20m.py b/HySoP/hysop/gpu/config_k20m.py index 633bd60bbd644f76d6b8d8d39357d61104be1750..e9a23eb69b19b39d0e9dcbd36fc232427f2b562b 100644 --- a/HySoP/hysop/gpu/config_k20m.py +++ b/HySoP/hysop/gpu/config_k20m.py @@ -206,3 +206,19 @@ kernels_config[3][DOUBLE_GPU]['advec_MS_and_remesh_comm'] = \ (['common.cl', 'remeshing/weights_noVec.cl', 'kernels/comm_advection_MS_and_remeshing_noVec.cl'], False, 1, advection_and_remeshing_index_space) + + +def fine_to_coarse_filter_index_space(size, stencil_width): + wg = size[0] / (2 * stencil_width) + return ((wg, size[1] / stencil_width, size[2] / stencil_width), + (wg, 1, 1)) + + +kernels_config[3][FLOAT_GPU]['fine_to_coarse_filter'] = \ + (["common.cl", 'remeshing/weights_noVec.cl', + "kernels/fine_to_coarse_filter.cl"], + 1, fine_to_coarse_filter_index_space) +kernels_config[3][DOUBLE_GPU]['fine_to_coarse_filter'] = \ + (["common.cl", 'remeshing/weights_noVec.cl', + "kernels/fine_to_coarse_filter.cl"], + 1, fine_to_coarse_filter_index_space) diff --git a/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py b/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py index d6552d3397b04eb7e84398869732871b6134b88c..60527f458f2150728a9a2c8b6a93a118bdb82110 100644 --- a/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py +++ b/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py @@ -5,8 +5,9 @@ from hysop.fields.continuous import Field from hysop.operator.multiresolution_filter import MultiresolutionFilter import hysop.tools.numpywrappers as npw import numpy as np -from hysop.methods_keys import Remesh, Support +from hysop.methods_keys import Remesh, Support, ExtraArgs from hysop.methods import Rmsh_Linear, L2_1 +from hysop.mpi.main_var import main_rank L = [1., 1., 1.] @@ -30,7 +31,8 @@ def test_filter_linear(): op = MultiresolutionFilter(d_in=d_fine, d_out=d_coarse, variables={f: d_coarse}, method={Remesh: Rmsh_Linear, - Support: 'gpu', }) + Support: 'gpu', + ExtraArgs:{'device_id': main_rank}}) op.discretize() op.setup() topo_coarse = op.discreteFields[f].topology @@ -39,6 +41,7 @@ def test_filter_linear(): f.initialize(topo=topo_fine) f_out = f.discreteFields[topo_coarse] f_out.toDevice() + f_out.wait() op.apply(simu) f_out.toHost() f_out.wait() @@ -77,7 +80,8 @@ def test_filter_L2_1(): op = MultiresolutionFilter(d_in=d_fine, d_out=d_coarse, variables={f: d_coarse}, method={Remesh: L2_1, - Support: 'gpu', }) + Support: 'gpu', + ExtraArgs:{'device_id': main_rank} }) op.discretize() op.setup() topo_coarse = op.discreteFields[f].topology @@ -110,7 +114,3 @@ def test_filter_L2_1(): np.max(np.abs(valid[0][topo_coarse.mesh.iCompute] - f_out[0][topo_coarse.mesh.iCompute])) - -if __name__ == '__main__': - test_filter_linear() - test_filter_L2_1() diff --git a/HySoP/hysop/gpu/tools.py b/HySoP/hysop/gpu/tools.py index f4d2df159e5a4b95db9d15c97e55fcf597e360dd..ee88a35095064058165b7f26b64c4f3808272792 100644 --- a/HySoP/hysop/gpu/tools.py +++ b/HySoP/hysop/gpu/tools.py @@ -11,7 +11,6 @@ import re import mpi4py.MPI as mpi FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64 - ## Global variable handling an OpenCL Environment instance __cl_env = None @@ -189,6 +188,7 @@ class OpenCLEnvironment(object): assert device_type.upper() == cl.device_type.to_string(device.type) if display or __VERBOSE__: print (" Device") + print (" - id :", device_id) print (" - Name :",) print (device.name) print (" - Type :",) diff --git a/HySoP/hysop/operator/discrete/baroclinic.py b/HySoP/hysop/operator/discrete/baroclinic.py index 8810ea18ec2055fef2b9bfb0eaad58cd66dfdaee..f30ed13a578638461cdf2077d3d2382310a21f03 100644 --- a/HySoP/hysop/operator/discrete/baroclinic.py +++ b/HySoP/hysop/operator/discrete/baroclinic.py @@ -1,11 +1,14 @@ # -*- coding: utf-8 -*- -"""discrete MultiPhase Rot Grad P +""" +@file operator/discrete/baroclinic.py +Discrete MultiPhase Rot Grad P """ from hysop.operator.discrete.discrete import DiscreteOperator import hysop.numerics.differential_operations as diff_op from hysop.constants import debug, XDIR, YDIR, ZDIR, np from hysop.methods_keys import SpaceDiscretisation from hysop.numerics.update_ghosts import UpdateGhosts +from hysop.tools.profiler import ftime import hysop.tools.numpywrappers as npw @@ -159,6 +162,7 @@ class Baroclinic(DiscreteOperator): self._baroclinicTerm[d][iCompute] *= dt self.vorticity[d][iCompute] += self._baroclinicTerm[d][iCompute] + # reinitialise for next iteration # velo(n-1) update for d in xrange(self.velocity.dimension):