From b4000c5b398bdc7bd0aca3a8bb7eec88a10ceefb Mon Sep 17 00:00:00 2001 From: Jean-Matthieu Etancelin <jean-matthieu.etancelin@univ-reims.fr> Date: Thu, 7 Jan 2016 14:10:49 +0100 Subject: [PATCH] Profile GPU-diffusion initilialisation. Fix multiresolution filter, GPU advection, redistribute op. Fix some tests. --- HySoP/hysop/gpu/gpu_diffusion.py | 6 ++- HySoP/hysop/gpu/gpu_multiresolution_filter.py | 23 ++++++++---- HySoP/hysop/gpu/gpu_particle_advection.py | 8 +++- .../hysop/gpu/multi_gpu_particle_advection.py | 21 ++++++++--- .../tests/test_gpu_multiresolution_filter.py | 17 ++++++--- HySoP/hysop/gpu/tests/test_transposition.py | 37 ++++++++++--------- HySoP/hysop/operator/continuous.py | 2 + .../hysop/operator/multiresolution_filter.py | 11 +++++- HySoP/hysop/operator/redistribute_inter.py | 10 ++++- HySoP/hysop/operator/redistribute_intra.py | 5 +++ 10 files changed, 97 insertions(+), 43 deletions(-) diff --git a/HySoP/hysop/gpu/gpu_diffusion.py b/HySoP/hysop/gpu/gpu_diffusion.py index c1ba26ec4..207fc2fd3 100644 --- a/HySoP/hysop/gpu/gpu_diffusion.py +++ b/HySoP/hysop/gpu/gpu_diffusion.py @@ -278,8 +278,10 @@ class GPUDiffusion(DiscreteOperator, GPUOperator): self.gpu_precision(self.viscosity * simulation.timeStep), self._mesh_size, wait_for=wait_evt) - c_evt = cl.enqueue_copy(self.cl_env.queue, self.field.gpu_data[0], - self.field_tmp, wait_for=[d_evt]) + #c_evt = cl.enqueue_copy(self.cl_env.queue, self.field.gpu_data[0], + # self.field_tmp, wait_for=[d_evt]) + c_evt = self.copy.launch_sizes_in_args( + self.field.gpu_data[0], self.field_tmp, wait_for=[d_evt]) self.field.events.append(c_evt) def apply(self, simulation): diff --git a/HySoP/hysop/gpu/gpu_multiresolution_filter.py b/HySoP/hysop/gpu/gpu_multiresolution_filter.py index f388f61c8..c2f77984f 100644 --- a/HySoP/hysop/gpu/gpu_multiresolution_filter.py +++ b/HySoP/hysop/gpu/gpu_multiresolution_filter.py @@ -40,7 +40,7 @@ class GPUFilterFineToCoarse(FilterFineToCoarse, GPUOperator): #GPU allocations alloc = not isinstance(self.field_in[0], GPUDiscreteField) GPUDiscreteField.fromField(self.cl_env, self.field_in[0], - self.gpu_precision, simple_layout=False) + self.gpu_precision, layout=False) if not self.field_in[0].gpu_allocated: self.field_in[0].allocate() if alloc: @@ -48,7 +48,7 @@ class GPUFilterFineToCoarse(FilterFineToCoarse, GPUOperator): alloc = not isinstance(self.field_out[0], GPUDiscreteField) GPUDiscreteField.fromField(self.cl_env, self.field_out[0], - self.gpu_precision, simple_layout=False) + self.gpu_precision, layout=False) if not self.field_out[0].gpu_allocated: self.field_out[0].allocate() if alloc: @@ -127,22 +127,29 @@ class GPUFilterFineToCoarse(FilterFineToCoarse, GPUOperator): self.initialize = KernelLauncher( prg.initialize_output, self.cl_env.queue, self.field_out[0].data[0].shape, None) + self._evts = [None, ] * self.field_in[0].dimension def apply(self, simulation=None): #evts = [] - self.field_out[0].events.append( - self.initialize(self.field_out[0].gpu_data[0])) + self.field_in[0].toHost() + self.field_in[0].wait() + for d in xrange(self.field_in[0].nb_components): + self._evts[d] = [] + self._evts[d].append( + self.initialize(self.field_out[0].gpu_data[d], + wait_for=self.field_out[0].events)) for iy in xrange(len(self._rmsh.weights)): for iz in xrange(len(self._rmsh.weights)): - evt = self.fine_to_coarse(self.field_in[0].gpu_data[0], - self.field_out[0].gpu_data[0], + for d in xrange(self.field_in[0].nb_components): + evt = self.fine_to_coarse(self.field_in[0].gpu_data[d], + self.field_out[0].gpu_data[d], self.scale_factor, self._mesh_size_in, self._mesh_size_out, self._domain_origin, np.int32(iy), np.int32(iz), - wait_for=self.field_out[0].events) - self.field_out[0].events.append(evt) + wait_for=self._evts[d]) + self._evts[d].append(evt) # Ghosts values must be exchanged either on process or through mpi # communications. Values must be moved to host. # We developp 2 versions: diff --git a/HySoP/hysop/gpu/gpu_particle_advection.py b/HySoP/hysop/gpu/gpu_particle_advection.py index 0504675eb..a0f2be00d 100644 --- a/HySoP/hysop/gpu/gpu_particle_advection.py +++ b/HySoP/hysop/gpu/gpu_particle_advection.py @@ -18,6 +18,7 @@ import hysop.tools.numpywrappers as npw from hysop.gpu.gpu_discrete import GPUDiscreteField from hysop.gpu.gpu_operator import GPUOperator from hysop.tools.profiler import profile +from hysop.numerics.update_ghosts import UpdateGhostsFull class GPUParticleAdvection(ParticleAdvection, GPUOperator): @@ -88,8 +89,10 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator): if self.method[MultiScale] is not None: self._isMultiScale = True + self._synchronize = None if self._isMultiScale: - self._synchronize = True + self._synchronize = UpdateGhostsFull( + self.velocity.topology, self.velocity.nb_components) # Compute resolutions for kernels for each direction. ## Resolution of the local mesh but reoganized redarding @@ -693,6 +696,9 @@ class GPUParticleAdvection(ParticleAdvection, GPUOperator): self._init_events[self.fields_on_grid[0]] = [] def _compute_1k_multiechelle(self, simulation, dtCoeff, split_id, old_dir): + if split_id==0 and self._synchronize is not None: + self._synchronize(self.velocity.data) + self.velocity.toDevice() dt = simulation.timeStep * dtCoeff wait_evts = self.velocity.events + \ self._init_events[self.fields_on_grid[0]] + \ diff --git a/HySoP/hysop/gpu/multi_gpu_particle_advection.py b/HySoP/hysop/gpu/multi_gpu_particle_advection.py index 623f577a2..53f9803ab 100644 --- a/HySoP/hysop/gpu/multi_gpu_particle_advection.py +++ b/HySoP/hysop/gpu/multi_gpu_particle_advection.py @@ -51,8 +51,9 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): super(MultiGPUParticleAdvection, self).__init__(**kwds) max_velocity = get_extra_args_from_method(self, 'max_velocity', None) max_dt = get_extra_args_from_method(self, 'max_dt', None) + max_cfl = get_extra_args_from_method(self, 'max_cfl', None) self._velocity_only_on_device = get_extra_args_from_method( - self, 'velocity_only_on_device', False), + self, 'velocity_only_on_device', False) if self._velocity_only_on_device: self._get_velocity_buffers = self._get_velocity_buffers_from_device else: @@ -63,9 +64,9 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): assert self._comm_size > 1, 'Parallel only' assert self.dim == 3, 'A 2D multi-GPU version is not yet available' - msg = "max_dt and _max_velocity must be given to advection " + msg = "Either max_dt and _max_velocity or max_cfl must be given to advection " msg += "for computing communication buffer sizes." - assert max_dt is not None and max_velocity is not None + assert (max_dt is not None and max_velocity is not None) or max_cfl is not None assert self.fields_topo.cutdir[self.direction] assert self.fields_topo.shape[self.direction] > 1 @@ -116,6 +117,15 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._v_space_step = v_msh.space_step # Maximum cfl for velocity and scalar + if max_cfl is not None: + scale_factor = self._v_space_step[self.direction]/self._space_step[self.direction] + try: + self.max_cfl_s = int(max_cfl[self.direction] * scale_factor) + 1 + self.max_cfl_v = int(max_cfl[self.direction]) + 1 + except TypeError: + self.max_cfl_s = int(max_cfl * scale_factor) + 1 + self.max_cfl_v = int(max_cfl) + 1 + else: try: self.max_cfl_s = int(max_velocity[self.direction] * max_dt / self._space_step[self.direction]) + 1 @@ -656,9 +666,9 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): self._queue_comm_p, self._cl_v_r_buff, self._v_r_buff, host_origin=(b * self._v_block_size, 0, 0), - host_pitches=(self._v_l_buff.nbytes, 0), + host_pitches=(self._v_r_buff.nbytes, 0), buffer_origin=(b * self._v_block_size, 0, 0), - buffer_pitches=(self._v_l_buff.nbytes, 0), + buffer_pitches=(self._v_r_buff.nbytes, 0), region=(self._v_block_size, 1, 1), is_blocking=False) @@ -696,7 +706,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection): def _compute_advec_comm(self, simulation, dtCoeff, split_id, old_dir): dt = simulation.timeStep * dtCoeff - self._todevice_velocity_buffers() wait_evts = self.velocity.events + self._evt_l_v + self._evt_r_v + \ self._init_events[self.fields_on_grid[0]] diff --git a/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py b/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py index 9890dbb03..240550376 100644 --- a/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py +++ b/HySoP/hysop/gpu/tests/test_gpu_multiresolution_filter.py @@ -16,10 +16,11 @@ PROC_TASKS = [0, ] * main_size if main_rank < n_gpu: PROC_TASKS[main_rank] = 1 + L = [1., 1., 1.] O = [0., 0., 0.] simu = Simulation(tinit=0., tend=0.1, nbIter=1) -PY_COMPARE = False +PY_COMPARE = True def func(res, x, y, z, t=0): @@ -32,13 +33,14 @@ def test_filter_linear(): """This test compares the GPU linear filter with python implementation""" box = Box(length=L, origin=O, proc_tasks=PROC_TASKS) mpi_p = MPIParams(comm=box.comm_task, task_id=1) - f = Field(box, formula=func) + f = Field(box, formula=func, is_vector=False) d_fine = Discretization([513, 513, 513]) d_coarse = Discretization([257, 257, 257], ghosts=[1, 1, 1]) op = MultiresolutionFilter(d_in=d_fine, d_out=d_coarse, variables={f: d_coarse}, method={Remesh: Rmsh_Linear, - Support: 'gpu', }, + Support: 'gpu', + ExtraArgs: {'device_id': main_rank, }}, mpi_params=mpi_p) if box.is_on_task(1): op.discretize() @@ -50,7 +52,6 @@ def test_filter_linear(): f_out = f.discreteFields[topo_coarse] f_out.toDevice() op.apply(simu) - print "ICI" f_out.toHost() f_out.wait() valid = [npw.zeros(f_out[0].shape), ] @@ -91,7 +92,9 @@ def test_filter_L2_1(): op = MultiresolutionFilter(d_in=d_fine, d_out=d_coarse, variables={f: d_coarse}, method={Remesh: L2_1, - Support: 'gpu', }) + Support: 'gpu', + ExtraArgs: {'device_id': main_rank, }}, + mpi_params=mpi_p) op.discretize() op.setup() topo_coarse = op.discreteFields[f].topology @@ -123,3 +126,7 @@ def test_filter_L2_1(): f_out[0][topo_coarse.mesh.iCompute]), \ np.max(np.abs(valid[0][topo_coarse.mesh.iCompute] - f_out[0][topo_coarse.mesh.iCompute])) + +if __name__ == '__main__': + test_filter_linear() + test_filter_L2_1() diff --git a/HySoP/hysop/gpu/tests/test_transposition.py b/HySoP/hysop/gpu/tests/test_transposition.py index 39862afed..65128e713 100644 --- a/HySoP/hysop/gpu/tests/test_transposition.py +++ b/HySoP/hysop/gpu/tests/test_transposition.py @@ -8,11 +8,10 @@ from hysop.gpu.tools import get_opencl_environment from hysop.gpu.gpu_kernel import KernelLauncher import hysop.tools.numpywrappers as npw -cl_env = get_opencl_environment() def _comparison(resolution, resolutionT, transpose_f, transpose_b, - gwi, lwi, axe=1): + gwi, lwi, cl_env, axe=1): data_in = npw.asrealarray(np.random.random(resolution)) data_out = npw.realempty(resolutionT) @@ -70,7 +69,7 @@ def test_transposition_xy2D(): prg.transpose_xy, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xy, init_transpose_xy, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy2D_noVec(): @@ -92,12 +91,13 @@ def test_transposition_xy2D_noVec(): prg.transpose_xy, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xy, init_transpose_xy, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy2D_rect(): resolution = (512, 256) resolutionT = (256, 512) + cl_env = get_opencl_environment() vec = 4 src_transpose_xy = 'kernels/transpose_xy.cl' build_options = "" @@ -129,7 +129,7 @@ def test_transposition_xy2D_rect(): gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xy_x, init_transpose_xy_y, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy2D_noVec_rect(): @@ -167,7 +167,7 @@ def test_transposition_xy2D_noVec_rect(): gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xy_x, init_transpose_xy_y, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy3D(): @@ -189,7 +189,7 @@ def test_transposition_xy3D(): prg.transpose_xy, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xy, init_transpose_xy, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy3D_noVec(): @@ -211,7 +211,7 @@ def test_transposition_xy3D_noVec(): prg.transpose_xy, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xy, init_transpose_xy, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy3D_rect(): @@ -249,7 +249,7 @@ def test_transposition_xy3D_rect(): prg.transpose_xy, cl_env.queue, gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xy_x, init_transpose_xy_y, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xy3D_noVec_rect(): @@ -287,7 +287,7 @@ def test_transposition_xy3D_noVec_rect(): prg.transpose_xy, cl_env.queue, gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xy_x, init_transpose_xy_y, - gwi, lwi) + gwi, lwi, cl_env) def test_transposition_xz3D(): @@ -311,7 +311,7 @@ def test_transposition_xz3D(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xz, init_transpose_xz, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3D_noVec(): @@ -335,7 +335,7 @@ def test_transposition_xz3D_noVec(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xz, init_transpose_xz, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3D_rect(): @@ -377,7 +377,7 @@ def test_transposition_xz3D_rect(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xz_x, init_transpose_xz_z, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3D_noVec_rect(): @@ -419,7 +419,7 @@ def test_transposition_xz3D_noVec_rect(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xz_x, init_transpose_xz_z, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3Dslice(): @@ -443,7 +443,7 @@ def test_transposition_xz3Dslice(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xz, init_transpose_xz, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3Dslice_noVec(): resolution = (32, 32, 32) @@ -466,7 +466,7 @@ def test_transposition_xz3Dslice_noVec(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolution, init_transpose_xz, init_transpose_xz, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3Dslice_rect(): @@ -506,7 +506,7 @@ def test_transposition_xz3Dslice_rect(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xz_x, init_transpose_xz_z, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) def test_transposition_xz3Dslice_noVec_rect(): resolution = (32, 32, 64) @@ -545,4 +545,5 @@ def test_transposition_xz3Dslice_noVec_rect(): prg.transpose_xz, cl_env.queue, gwi, lwi) _comparison(resolution, resolutionT, init_transpose_xz_x, init_transpose_xz_z, - gwi, lwi, axe=2) + gwi, lwi, cl_env, axe=2) + diff --git a/HySoP/hysop/operator/continuous.py b/HySoP/hysop/operator/continuous.py index 2e09122b0..994ae33e0 100644 --- a/HySoP/hysop/operator/continuous.py +++ b/HySoP/hysop/operator/continuous.py @@ -309,6 +309,8 @@ def opapply(f): name = inspect.getmro(args[0].apply.im_class) name[-3].setup(args[0]) #super(args[0].__class__, args[0]).apply() + for op in args[0].wait_list(): + op.wait() t0 = ftime() res = f(*args, **kwargs) args[0].profiler[f.func_name] += ftime() - t0 diff --git a/HySoP/hysop/operator/multiresolution_filter.py b/HySoP/hysop/operator/multiresolution_filter.py index 6bf564cf6..09127899c 100644 --- a/HySoP/hysop/operator/multiresolution_filter.py +++ b/HySoP/hysop/operator/multiresolution_filter.py @@ -7,6 +7,7 @@ from hysop.operator.continuous import opsetup from hysop.operator.computational import Computational import hysop.default_methods as default from hysop.methods_keys import Support +from hysop.tools.parameters import Discretization class MultiresolutionFilter(Computational): @@ -27,8 +28,14 @@ class MultiresolutionFilter(Computational): def discretize(self): super(MultiresolutionFilter, self)._standard_discretize() - topo_in = self._build_topo(self.d_in, 0) - topo_out = self._build_topo(self.d_out, 0) + if isinstance(self.d_in, Discretization): + topo_in = self._build_topo(self.d_in, 0) + else: + topo_in = self.d_in + if isinstance(self.d_out, Discretization): + topo_out = self._build_topo(self.d_out, 0) + else: + topo_out = self.d_out self._df_in = [] self._df_out = [] for v in self.variables: diff --git a/HySoP/hysop/operator/redistribute_inter.py b/HySoP/hysop/operator/redistribute_inter.py index f72bdb7c7..36cc97eca 100644 --- a/HySoP/hysop/operator/redistribute_inter.py +++ b/HySoP/hysop/operator/redistribute_inter.py @@ -122,6 +122,12 @@ class RedistributeInter(Redistribute): for op in self._run_till: op.wait_for(self) + def add_run_till_op(self, op): + """Add an operator to the wait list""" + if self._is_target: + self._run_till.append(op) + op.wait_for(self) + @debug @opapply def apply(self, simulation=None): @@ -131,7 +137,6 @@ class RedistributeInter(Redistribute): parameters (time, time step, iteration number ...), see hysop.problem.simulation.Simulation for details. """ - pass # --- Standard send/recv --- self._requests = {} @@ -165,6 +170,9 @@ class RedistributeInter(Redistribute): if self._has_requests: for rk in self._requests: self._requests[rk].Wait() + for v in self.variables: + for d in self._range_components(v): + vtab = v.discreteFields[self._topology].data[d] self._has_requests = False def test_requests(self): diff --git a/HySoP/hysop/operator/redistribute_intra.py b/HySoP/hysop/operator/redistribute_intra.py index d18525c41..c9062f9cf 100644 --- a/HySoP/hysop/operator/redistribute_intra.py +++ b/HySoP/hysop/operator/redistribute_intra.py @@ -112,6 +112,11 @@ class RedistributeIntra(Redistribute): self._is_uptodate = True + def add_run_till_op(self, op): + """Add an operator to the wait list""" + self._run_till.append(op) + op.wait_for(self) + @opapply def apply(self, simulation=None): # Try different way to send vars? -- GitLab