diff --git a/examples/advection_gpu.py b/examples/advection_gpu.py deleted file mode 100644 index 1dac067c5450fd9d676aa78ef1404368858158c5..0000000000000000000000000000000000000000 --- a/examples/advection_gpu.py +++ /dev/null @@ -1,138 +0,0 @@ - -from hysop.constants import np -from hysop.methods_keys import TimeIntegrator, Remesh, ExtraArgs, \ - Splitting, MultiScale, Interpolation, Precision, \ - StretchingFormulation, BoundaryCondition, Backend - -from hysop.gpu.kernel_autotuner import AutotunerConfig, AutotunerFlags - -from hysop.domain.box import Box -from hysop.fields.continuous import Field -from hysop.problem.simulation import Simulation -from hysop.mpi.topology import Cartesian -from hysop.tools.parameters import Discretization -from hysop.operator.advection import Advection - -from hysop.numerics.odesolvers import Euler, RK2, RK3, RK4, ExplicitRungeKutta -from hysop.numerics.interpolation import Linear - -from hysop.numerics.remeshing import L2_1, L2_2, L2_3, L2_4 -from hysop.numerics.remeshing import L4_2, L4_3, L4_4 -from hysop.numerics.remeshing import L6_3, L6_4, L6_5, L6_6 -from hysop.numerics.remeshing import L8_4 - -from hysop.constants import callback_profiler - -if __name__=='__main__': - - GHOSTS = 0 - NSCALARS = 1 - - # DIM = 3 - # f_resolution = (33,33,33)[:DIM] - # f_resolution = (65,65,65)[:DIM] - f_resolution = (129,129,129)[:DIM] - # f_resolution = (257,257,257)[:DIM] - # f_resolution = (513,513,257)[:DIM] - - #DIM = 2 - #f_resolution = (8193,8193,8193)[:DIM] - #f_resolution = (2049,2049,2049)[:DIM] - #f_resolution = (4097,4097,4097)[:DIM] - - v_resolution = f_resolution - ghosts = (GHOSTS,)*DIM - - def initVel(res, *args): - res[0][...] = 0. - if DIM>1: - res[1][...] = 0. - if DIM>2: - res[2][...] = 0. - return res - def initVort(res, *args): - res[0][...] = args[0] - if DIM>1: - res[1][...] = args[0]*args[0] - if DIM>2: - res[2][...] = args[0]*args[0]*args[0] - return res - - def initScalar(i): - def init(res,*args): - res[0][...] = i+1 - return res - return init - - - d3d = Discretization(resolution=f_resolution, ghosts=ghosts) - d3dv = Discretization(resolution=v_resolution, ghosts=ghosts) - - box = Box(length=[1.0]*DIM, origin=[0.0]*DIM) - - velo = Field(domain=box, formula=initVel, - name='Velocity', is_vector=True) - vorti = Field(domain=box, formula=initVort, - name='Vorticity', is_vector=True) - scalars = [ Field(domain=box, formula=initScalar(i), - name='Scalar_{}'.format(i), - is_vector=False) - for i in xrange(NSCALARS) ] - - f_topo = Cartesian(box,d3d,DIM) - v_topo = Cartesian(box,d3dv,DIM) - - dvelo = velo.discretize(v_topo) - dvorti = vorti.discretize(f_topo) - dscalars = [] - for s in scalars: - dscalars.append(s.discretize(f_topo)) - - autotuner_config = AutotunerConfig(autotuner_flag=AutotunerFlags.PATIENT, prune_threshold=1.20, override_cache=False) - rk_scheme = ExplicitRungeKutta('Euler') - - method = { - Backend: Backend.OPENCL, - TimeIntegrator: rk_scheme, - Remesh: L2_1, - ExtraArgs: { - 'autotuner_config':autotuner_config, - 'use_builtin_copy':True, - } - } - - if DIM==3: - method[ExtraArgs]['stretching'] = \ - { - 'rk_scheme': rk_scheme, - 'formulation': StretchingFormulation.CONSERVATIVE, - 'boundary': BoundaryCondition.PERIODIC, - 'order':4 - } - - advected_fields = [vorti]+scalars - variables = dict(zip(advected_fields, [f_topo]*len(advected_fields))) - - A = Advection(velo, v_topo, variables=variables, method=method) - A.discretize() - A.setup() - - velo.initialize() - vorti.initialize() - for s in scalars: - s.initialize() - - simu = Simulation(start=0.0, end=1.0, time_step=0.01) - simu.initialize() - - i=0 - while not simu.is_over: - print - simu.print_state() - - A.apply(simu) - simu.advance() - i+=1 - - print callback_profiler.report() - diff --git a/examples/scalar_advection/scalar_advection.py b/examples/scalar_advection/scalar_advection.py new file mode 100644 index 0000000000000000000000000000000000000000..636af63aff62c18ee398a8f2eed069495b878b4d --- /dev/null +++ b/examples/scalar_advection/scalar_advection.py @@ -0,0 +1,88 @@ + +from hysop import IO, IOParams, Field, Box, \ + Simulation, Problem +from hysop.deps import np + +from hysop.constants import AutotunerFlags, Implementation +from hysop.operators import DirectionalAdvection + +from hysop.methods import StrangOrder, Remesh, TimeIntegrator, \ + OpenClKernelConfig, OpenClKernelAutotunerConfig + +from hysop.numerics.splitting.strang import StrangSplitting +from hysop.numerics.odesolvers.runge_kutta import Euler, RK2, RK3, RK4 + + +IO.set_default_path('/tmp/scalar_advection') + +pi = np.pi +cos = np.cos +sin = np.sin + +## Function to compute velocity +def init_velocity(data, coords): + for d in data: + d[...] = 1.0 + +## Function to compute initial scalar +def init_scalar(data, coords): + data[0][...] = 1.0 + for x in coords: + data[0][...] *= cos(x) + +def run(npts=64+1, cfl=0.5): + dim = 3 + d3d=(npts,)*dim + box = Box(length=(2*pi,)*dim) + + impl = Implementation.OPENCL_CODEGEN + + if impl is Implementation.OPENCL_CODEGEN: + autotuner_config = OpenClKernelAutotunerConfig( + autotuner_flag=AutotunerFlags.ESTIMATE, + prune_threshold=1.2, override_cache=False, verbose=0) + kernel_config = OpenClKernelConfig(autotuner_config=autotuner_config) + method = { OpenClKernelConfig : kernel_config } + else: + method = None + + velo = Field(domain=box, name='V', is_vector=True, dtype=np.float32) + scalar = Field(domain=box, name='S0', nb_components=1, dtype=np.float32) + + advec = DirectionalAdvection(implementation=impl, + name='advec', + velocity = velo, + advected_fields = (scalar,), + velocity_cfl = cfl, + variables = {velo: d3d, scalar: d3d}, + method = {TimeIntegrator: Euler, Remesh: Remesh.L2_1}, + ) + + splitting = StrangSplitting(splitting_dim=dim, + order=StrangOrder.STRANG_SECOND_ORDER, + method=method) + splitting.push_operators(advec) + + problem = Problem() + problem.insert(splitting) + + io_params = IOParams(filename='S0', frequency=10) + problem.dump_inputs(fields=scalar, io_params=io_params) + + problem.build() + #problem.display() + + dfields = problem.input_discrete_fields + dfields[velo].initialize(formula=init_velocity) + dfields[scalar].initialize(formula=init_scalar) + + dx = dfields[scalar].space_step.min() + dt = cfl*dx + simu = Simulation(start=0.0, end=pi, dt0=dt) + + problem.solve(simu) + problem.finalize() + +if __name__=='__main__': + run() + diff --git a/examples/taylor_green/taylor_green_monoscale.py b/examples/taylor_green/taylor_green_monoscale.py index 179869737e6b574bb1f4be4e0f0b003a268ed759..7462f623d37fb952771e45f6fba39fdb43ec2a34 100644 --- a/examples/taylor_green/taylor_green_monoscale.py +++ b/examples/taylor_green/taylor_green_monoscale.py @@ -62,7 +62,7 @@ def init_vorticity(data, coords, dim): else: raise NotImplementedError('dimension {}'.format(dim)) -def run(npts=8+1, viscosity=1./1600., lcfl=0.125, cfl=0.5): +def run(npts=512+1, viscosity=1./1600., lcfl=0.125, cfl=0.5): dim = 2 d3d=(npts,)*dim impl = Implementation.OPENCL_CODEGEN @@ -112,14 +112,14 @@ def run(npts=8+1, viscosity=1./1600., lcfl=0.125, cfl=0.5): method=method) splitting.push_operators(advec)#, stretch) - min_dt = 0.1 + min_dt = 0.01 simu = Simulation(start=0.0, end=pi, dt0=min_dt) adapt_dt = AdaptiveTimeStep(simu.dt, dt_coeff=0.95) - adapt_dt.push_cfl_criteria(cfl, velo, min_dt=min_dt) - adapt_dt.push_lcfl_criteria(lcfl, vorti, min_dt=min_dt) + adapt_dt.push_cfl_criteria(cfl, velo) + adapt_dt.push_lcfl_criteria(lcfl, vorti) problem = Problem(method={ComputeGranularity:0}) - problem.insert(splitting, adapt_dt) + problem.insert(splitting)#, adapt_dt) problem.dump_inputs(fields=vorti) problem.build() #problem.display() @@ -127,16 +127,11 @@ def run(npts=8+1, viscosity=1./1600., lcfl=0.125, cfl=0.5): dfields = problem.input_discrete_fields dfields[velo].initialize(formula=init_velocity, dim=dim) dfields[vorti].initialize(formula=init_vorticity, dim=dim) - - print 'VELOCITY' - dfields[velo].print_with_ghosts() - print 'VORTICITY' - dfields[vorti].print_with_ghosts() with printoptions(threshold=10000, linewidth=240, nanstr='nan', infstr='inf', formatter={'float': lambda x: '{:>6.2f}'.format(x)}): - problem.solve(simu, max_iter=0) + problem.solve(simu) problem.finalize() if __name__=='__main__': diff --git a/hysop/__init__.py b/hysop/__init__.py index 17a4a76b1cb38dedb3a10e3a9ce44cccaa7f9c08..1f58761cea30dcdc03641fc5a300632db6a4f78c 100644 --- a/hysop/__init__.py +++ b/hysop/__init__.py @@ -16,7 +16,7 @@ __FFTW_ENABLED__ = "ON" is "ON" __SCALES_ENABLED__ = "ON" is "ON" __OPTIMIZE__ = not __debug__ -__VERBOSE__ = False +__VERBOSE__ = True __DEBUG__ = False __TRACE__ = False __KERNEL_DEBUG__ = True diff --git a/hysop/backend/device/opencl/opencl_array.py b/hysop/backend/device/opencl/opencl_array.py index 1e64109abcf27d2205771bf9ba8629b110f08b8e..41e63ec0a0dd8006fb754d3d6730e71e5f8a9e5d 100644 --- a/hysop/backend/device/opencl/opencl_array.py +++ b/hysop/backend/device/opencl/opencl_array.py @@ -51,19 +51,16 @@ class OpenClArray(Array): try: return self._handle.data except clArray.ArrayHasOffsetError: - raise - # offset = self.offset - # bdata = self.base_data - # alignment = self.backend.device.mem_base_addr_align - # if ((bdata.int_ptr + offset) % alignment) == 0: - # int_ptr_value = bdata.int_ptr + offset - # buf = cl.Buffer.from_int_ptr(int_ptr_value=int_ptr_value, - # retain=False) - # assert buf.int_ptr == int_ptr_value - # buf.__hysop_array_ref = self - # return buf - # else: - # raise + offset = self.offset + bdata = self.base_data + alignment = self.backend.device.mem_base_addr_align + if ((bdata.int_ptr + offset) % alignment) == 0: + # try to return a subbuffer + buf = self.base_data.buf[offset:] + buf.__parent = self.base_data + return buf + else: + raise def get_base(self): return self._handle.base_data @@ -235,12 +232,13 @@ class OpenClArray(Array): self.__min_launcher = self.backend.amin(a=self, axis=axis, out=out, build_kernel_launcher=True, queue=queue, async=True, **kwds) - (out, evt) = self.__min_launcher(queue=queue, async=True) + evt = self.__min_launcher(queue=queue, async=True) + out = self.__min_launcher.out if async: - return (out, evt) + return evt else: evt.wait() - return out + return out.copy() else: super(OpenClArray, self).min(self, axis=axis, out=out, queue=queue, async=async, **kwds) @@ -256,32 +254,13 @@ class OpenClArray(Array): self.__max_launcher = self.backend.amax(a=self, axis=axis, out=out, build_kernel_launcher=True, queue=queue, async=True, **kwds) - (out, evt) = self.__max_launcher(queue=queue, async=True) + evt = self.__max_launcher(queue=queue, async=True) + out = self.__max_launcher.out if async: - return (out, evt) + return evt else: evt.wait() - return out + return out.copy() else: super(OpenClArray, self).max(self, axis=axis, out=out, queue=queue, async=async, **kwds) - - # def __getitem__(self, index): - # """Custom getitem.""" - # if (self.ndim == 1) and isinstance(index, slice): - # (start,stop,step) = index.indices(self.size) - # dbytes = start*self.strides[0] - # ptr = self.base_data.int_ptr + dbytes - # alignment = self.backend.device.mem_base_addr_align - # print 'int_ptr={}, idx={}, dbytes={}, new_ptr={}, alignment={}, new_ptr%alignment={}'.format( - # self.int_ptr, indices[0], dbytes, ptr, alignment, ptr%alignment) - # if (start==0) or (step!=1) or (ptr % alignment != 0): - # return super(OpenClArray, self).__getitem__(index) - # else: - # origin = self.offset + dbytes - # size = (stop-start)*self.strides[0] - # data = self.handle.base_data - # data = data.get_sub_region(origin=origin, size=size, flags=0) - # raise NotImplementedError() - # else: - # return super(OpenClArray, self).__getitem__(index) diff --git a/hysop/backend/device/opencl/opencl_array_backend.py b/hysop/backend/device/opencl/opencl_array_backend.py index 97aa1ad28abc3b2b2a5d96321523d3dd4eab7c5f..c59b07bc2fbc63a66cf4c881d3012092e4307361 100644 --- a/hysop/backend/device/opencl/opencl_array_backend.py +++ b/hysop/backend/device/opencl/opencl_array_backend.py @@ -1,5 +1,6 @@ import warnings +from hysop import __KERNEL_DEBUG__ from hysop.deps import re, np, os from hysop.tools.types import check_instance, to_tuple from hysop.tools.hash import hash_id @@ -47,6 +48,37 @@ class _ElementwiseKernel(object): preamble=preamble, options=options) def __call__(self, wait_for, args, **kargs): return self.kernel.__call__(wait_for=wait_for, *args) + + + def to_kernel_launcher(self, name, wait_for, queue, args, **kwds): + """Build an _OpenClElementWiseKernelLauncher from self.""" + + class OpenClElementwiseKernelLauncher(OpenClKernelLauncherI): + """Utility class to build opencl reduction kernel launchers.""" + __slots__ = ('_name', 'kernel', 'kernel_args', 'kernel_kwds', + 'default_queue') + def __init__(self, name, kernel, args, default_queue, **extra_kernel_kwds): + super(OpenClElementwiseKernelLauncher, self).__init__(name=name) + kernel_args = args + kernel_kwds = {} + self.kernel = kernel + self.kernel_args = kernel_args + self.kernel_kwds = kernel_kwds + self.default_queue = default_queue + def global_size_configured(self): + return True + def __call__(self, queue=None, wait_for=None, **kwds): + if __KERNEL_DEBUG__: + print ' '+self._name+'<<<{}>>>()'.format(self.kernel_args[0].shape) + queue = first_not_None(queue, self.default_queue) + self.kernel_kwds['queue'] = queue + self.kernel_kwds['wait_for'] = wait_for + evt = self.kernel(*self.kernel_args, **self.kernel_kwds) + self._register_event(queue, evt) + return evt + + return OpenClElementwiseKernelLauncher(name=name, kernel=self.kernel, + args=args, default_queue=queue, **kwds) class _ReductionKernel(object): """ @@ -106,25 +138,24 @@ class _ReductionKernel(object): iargs, oargs, pargs, default_queue, **extra_kernel_kwds): super(OpenClReductionKernelLauncher, self).__init__(name=name) kernel_args = (iargs+pargs) - kernel_kwds = dict(out=oargs[0], return_event=return_event, + kernel_kwds = dict(out=oargs[0], return_event=True, allocator=None, range=None, slice=None) self.kernel = kernel self.kernel_args = kernel_args self.kernel_kwds = kernel_kwds self.default_queue = default_queue - self.return_event = return_event def global_size_configured(self): return True def __call__(self, queue=None, wait_for=None, **kwds): + if __KERNEL_DEBUG__: + print ' '+self._name+'<<<{}>>>()'.format(self.kernel_args[0].shape) queue = first_not_None(queue, self.default_queue) self.kernel_kwds['queue'] = queue self.kernel_kwds['wait_for'] = wait_for (out, evt) = self.kernel(*self.kernel_args, **self.kernel_kwds) self._register_event(queue, evt) - if self.return_event: - return (out, evt) - else: - return out + self.out = out + return evt return OpenClReductionKernelLauncher(name=name, kernel=self.kernel, default_queue=queue, **kwds) @@ -2403,17 +2434,17 @@ class OpenClArrayBackend(ArrayBackend): # Arithmetic operations def add(self, x1, x2, out=None, - queue=None, dtype=None): + queue=None, dtype=None, name='add',**kwds): """ Add arguments element-wise. """ if is_complex(x1) or is_complex(x2): return self._cplx_binary_op(x1,x2,'add', - out=out,queue=queue,dtype=dtype,) + out=out,queue=queue,dtype=dtype,name=name,**kwds) else: expr = 'y0[i] = (x0[i] + x1[i])' return self.binary_op(x0=x1, x1=x2, expr=expr, out=out, - queue=queue, dtype=dtype) + queue=queue, dtype=dtype,name=name,**kwds) def reciprocal(self, x, out=None, queue=None, dtype=None): """ @@ -2895,7 +2926,7 @@ class OpenClArrayBackend(ArrayBackend): return self.binary_op(x0=x1, x1=x2, expr=expr, out=out, queue=queue, dtype=dtype) def fmin(self, x1, x2, out=None, - queue=None, dtype=None): + queue=None, dtype=None, name='fmin', **kwds): """ Element-wise minimum of array elements, ignoring NaNs. """ @@ -2929,7 +2960,7 @@ class OpenClArrayBackend(ArrayBackend): expr = 'min(x0[i], x1[i])' expr = 'y0[i] = ' + expr return self.binary_op(x0=x1, x1=x2, expr=expr, out=out, - queue=queue, dtype=dtype) + queue=queue, dtype=dtype, name=name, **kwds) def fabs(self, x, out=None, queue=None, dtype=None): """ @@ -3025,19 +3056,19 @@ class OpenClArrayBackend(ArrayBackend): ## See https://docs.scipy.org/doc/numpy/reference/routines.sort.html #Order statistics - def amin(self, a, axis=None, out=None, queue=None, **kwds): + def amin(self, a, axis=None, out=None, queue=None, name='amin', **kwds): """ Return the minimum of an array. """ return self.reduce(kargs=(a,), neutral='x0[0]', reduce_expr='min(a,b)', - axis=axis, out=out, queue=queue, **kwds) + axis=axis, out=out, queue=queue, name=name, **kwds) - def amax(self, a, axis=None, out=None, slice=None, queue=None, **kwds): + def amax(self, a, axis=None, out=None, slice=None, queue=None, name='amax', **kwds): """ Return the maximum of an array. """ return self.reduce(kargs=(a,), neutral='x0[0]', reduce_expr='max(a,b)', - axis=axis, out=out, queue=queue, **kwds) + axis=axis, out=out, queue=queue, name=name, **kwds) def average(self, a, axis=None, weights=None, returned=False, queue=None): """ diff --git a/hysop/backend/device/opencl/opencl_buffer.py b/hysop/backend/device/opencl/opencl_buffer.py index b044fb440534e9e1f184e1a5a7c55ced3836085c..a830a5211f05258b859d230e0bf8326b771532b9 100644 --- a/hysop/backend/device/opencl/opencl_buffer.py +++ b/hysop/backend/device/opencl/opencl_buffer.py @@ -44,7 +44,11 @@ class OpenClBuffer(cl.Buffer, DeviceBuffer): size = self.size-offset else: assert self.size >= (offset+size) - return self[offset:offset+size] + if (offset == 0): + # Do NOT create a sub buffer if we do not have to, subbuffers are precious + return self + else: + return self[offset:offset+size] class OpenClPooledBuffer(PooledBuffer, cl.MemoryObjectHolder): def get_ptr(self): diff --git a/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py b/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py index b6315d660225cb9b1f4bc22fb7d2ab316a1d9952..36ec1203de57646c866bfab43bb57e7d2c3e8780 100644 --- a/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py +++ b/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py @@ -1,5 +1,5 @@ -from hysop import vprint, dprint +from hysop import vprint, dprint, __KERNEL_DEBUG__ from hysop.deps import np from hysop.tools.decorators import debug from hysop.tools.types import check_instance, first_not_None @@ -47,7 +47,9 @@ class OpenClCopyKernelLauncher(OpenClKernelLauncher): """ return dict(self._enqueue_copy_kwds.items()) - def __call__(self, queue=None, wait_for=None): + def __call__(self, queue=None, wait_for=None, **kwds): + if __KERNEL_DEBUG__: + print ' '+self._apply_msg queue = first_not_None(queue, self._default_queue) check_instance(queue, cl.CommandQueue) evt = cl.enqueue_copy(queue=queue, **self._enqueue_copy_kwds) @@ -282,7 +284,12 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): name = 'enqueue_copy_rect_{}__{}_to_{}'.format(varname, 'host' if isinstance(src, np.ndarray) else 'device', 'host' if isinstance(dst, np.ndarray) else 'device') - apply_msg='{}<<<{}>>>'.format(name, region) + #apply_msg='{}<<<{}>>>(so={}, sp={}, do={}, dp={})' + #apply_msg=apply_msg.format(name, region, + #src_origin, src_pitches, + #dst_origin, dst_pitches) + apply_msg='{}<<<{}>>>()' + apply_msg=apply_msg.format(name, region) super(OpenClCopyBufferRectLauncher, self).__init__(dst=dst, src=src, enqueue_copy_kwds=enqueue_copy_kwds, @@ -319,7 +326,6 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): indices += (slc.indices(si),) else: indices += ((slc, slc+1, 1),) - print indices nelems = tuple( (idx[1]-idx[0]+idx[2]-1)//idx[2] for idx in indices ) nbytes = prod(nelems) * dtype.itemsize @@ -372,6 +378,7 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): assert pitches[-1] == dtype.itemsize pitches = pitches[:-1] region[-1] *= dtype.itemsize + origin[-1] *= dtype.itemsize # ptr+=offset return data, region, origin, pitches @@ -419,6 +426,12 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): raise ValueError(msg0+msg) region = src_region + if (region<=0).any(): + msg =' >Error: region is ill-formed or zero-sized:' + msg+='\n region: {}' + msg =msg.format(region) + raise ValueError(msg0+msg) + copy_dims = src_region.size assert copy_dims > 0 if (copy_dims > 3): @@ -427,11 +440,10 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): msg+='\n dst_region: {}' msg =msg.format(src_region, dst_region) - zero, one = np.int32(0), np.int32(1) - _region, _src_origin, _dst_origin = ([zero]*3,)*3 - _src_pitches, _dst_pitches = ([one]*2,)*2 - + _region, _src_origin, _dst_origin = [one]*3, [zero]*3, [zero]*3 + _src_pitches, _dst_pitches = [zero]*2, [zero]*2 + _region[:copy_dims] = region[::-1] _src_origin[:copy_dims] = src_origin[::-1] _dst_origin[:copy_dims] = dst_origin[::-1] diff --git a/hysop/backend/device/opencl/opencl_kernel_launcher.py b/hysop/backend/device/opencl/opencl_kernel_launcher.py index 09fb12e50d5437bb5cf87acd27daed6b5e274d0d..a686cb47f4eca2bcbd0f2991387db3580ff36b72 100644 --- a/hysop/backend/device/opencl/opencl_kernel_launcher.py +++ b/hysop/backend/device/opencl/opencl_kernel_launcher.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod -from hysop import vprint, dprint +from hysop import vprint, dprint, __KERNEL_DEBUG__ from hysop.tools.decorators import debug from hysop.tools.types import check_instance, first_not_None from hysop.tools.numpywrappers import npw @@ -113,7 +113,8 @@ class OpenClKernelListLauncher(object): If this OpenClKernelListLauncher is empty, cl.wait_for_events will be called instead. """ - dprint(self._apply_msg) + if __KERNEL_DEBUG__: + print self._apply_msg if __debug__: parameters = self._parameters @@ -127,7 +128,13 @@ class OpenClKernelListLauncher(object): if kernels: evt = kernels[0].__call__(queue=queue, wait_for=wait_for, **kwds) for kernel in kernels[1:]: - evt = kernel.__call__(queue=queue, **kwds) + try: + evt = kernel.__call__(queue=queue, **kwds) + except: + msg='\nFailed to call kernel {} of type {}.\n' + msg=msg.format(kernel.name,type(kernel).__name__) + print msg + raise else: evt = cl.wait_for_events(wait_for) return evt @@ -135,7 +142,7 @@ class OpenClKernelListLauncher(object): def _update_parameters_from_parametrized_kernel(self, kernel, parameters): """Update parameters of this kernel list launcher from a OpenClParametrizedKernelLauncher.""" check_instance(kernel, OpenClParametrizedKernelLauncher) - check_instance(parameters, keys=str, values=type) + check_instance(parameters, dict, keys=str, values=type) sparameters = self._parameters for (pname, ptype) in parameters.iteritems(): if pname in sparameters: @@ -403,8 +410,9 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): assert isinstance(queue, cl.CommandQueue) assert isinstance(global_work_size, tuple) assert isinstance(local_work_size, (tuple, type(None))) - - dprint(self._apply_msg.format(global_work_size, local_work_size)) + + if __KERNEL_DEBUG__: + print self._apply_msg.format(global_work_size, local_work_size) kernel = self._set_kernel_args(**kwds) diff --git a/hysop/backend/device/opencl/operator/directional/advection_dir.py b/hysop/backend/device/opencl/operator/directional/advection_dir.py index 93386d2f989fadd4c6c76cada571e0f77c0dbfff..854c1c784ca70a95027e8d5e34c3566b21781845 100644 --- a/hysop/backend/device/opencl/operator/directional/advection_dir.py +++ b/hysop/backend/device/opencl/operator/directional/advection_dir.py @@ -7,6 +7,7 @@ from hysop.operator.base.advection_dir import DirectionalAdvectionBase, MemoryRe from hysop.backend.device.opencl.autotunable_kernels.advection_dir import OpenClAutotunableDirectionalAdvectionKernel from hysop.backend.device.opencl.autotunable_kernels.remesh_dir import OpenClAutotunableDirectionalRemeshKernel from hysop.backend.device.opencl.opencl_copy_kernel_launchers import OpenClCopyBufferRectLauncher +from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelListLauncher class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOperator): @@ -83,9 +84,11 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper self._collect_kernels(work) def _collect_kernels(self, work): - self._collect_advection_kernel() - self._collect_remesh_kernel() - self._collect_redistribute_kernels(work) + kl = OpenClKernelListLauncher(name='advec_remesh') + kl += self._collect_advection_kernel() + kl += self._collect_remesh_kernel() + kl += self._collect_redistribute_kernels(work) + self.all_kernels = kl def _collect_advection_kernel(self): cl_env = self.cl_env @@ -111,6 +114,7 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper assert ('dt' in advec_launcher.parameters_map.keys()) self.advection_kernel_launcher = advec_launcher + return advec_launcher def _collect_remesh_kernel(self): cl_env = self.cl_env @@ -143,54 +147,82 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper (remesh_kernel, args_dict) = kernel.autotune(**kwds) - self.remesh_kernel_launcher = remesh_kernel.build_launcher(**args_dict) + kl = remesh_kernel.build_launcher(**args_dict) + self.remesh_kernel_launcher = kl + return kl def _collect_redistribute_kernels(self, work): remesh_ghosts = self.remesh_ghosts dsinputs = self.dadvected_fields_in dsoutputs = self.dadvected_fields_out for sout in dsoutputs.values(): - outer_ghosts = sout.get_outer_ghost_slices(remesh_ghosts) - inner_ghosts = sout.get_inner_ghost_slices(remesh_ghosts) - nprocs = sout.mesh.proc_shape + all_outer_ghosts = sout.get_outer_ghost_slices() + all_inner_ghosts = sout.get_inner_ghost_slices() + remesh_outer_ghosts = sout.get_outer_ghost_slices(remesh_ghosts) + remesh_inner_ghosts = sout.get_inner_ghost_slices(remesh_ghosts) + nprocs = sout.mesh.proc_shape (lhs, rhs) = work.get_buffer(self, sout.name+'_ghost_layers') align = sout.backend.device.mem_base_addr_align + kl = OpenClKernelListLauncher(name='accumulate_and_exchange_ghosts') for i in xrange(sout.nb_components): - log, rog, oshape = outer_ghosts[-1] - lig, rig, ishape = inner_ghosts[-1] + log, rog, oshape = remesh_outer_ghosts[-1] + lig, rig, ishape = remesh_inner_ghosts[-1] si = sout[i] varname = sout.name+'_{}'.format(i) assert (ishape==oshape).all() if (nprocs[-1] == 1): - copy_left_inner_ghosts = OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_left_inner_ghosts', - src=si, dst=lhs, src_slices=lig) - copy_right_outter_ghosts = OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_right_outer_ghosts', - src=si, dst=rhs, src_slices=rog) - left_accumulator= sout.backend.add(x1=lhs, x2=lhs, out=lhs) - copy_accumulator = OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_from_right_ghosts', + # accumulate right outer ghosts to left inner ghosts + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_left_inner_ghosts', + src=si, src_slices=lig, dst=lhs) + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_right_outer_ghosts', + src=si, src_slices=rog, dst=rhs) + kl += sout.backend.add(x1=lhs, x2=rhs, out=lhs, build_kernel_launcher=True) + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_accumulate_rg', src=lhs, dst=si, dst_slices=lig) - # buf[ldst] += buf[rsrc] - # buf[rdst] += buf[lsrc] + + # accumulate left outer ghosts to right inner ghosts + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_left_outer_ghosts', + src=si, src_slices=log, dst=lhs) + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_right_inner_ghosts', + src=si, src_slices=rig, dst=rhs) + kl += sout.backend.add(x1=lhs, x2=rhs, out=lhs, build_kernel_launcher=True) + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_accumulate_lg', + src=lhs, dst=si, dst_slices=rig) + else: + msg='MPI exchange not implemented yet.' + raise RuntimeError(msg) + + for i in xrange(sout.nb_components): + lsrc, rsrc, ishape = all_inner_ghosts[-1] + ldst, rdst, oshape = all_outer_ghosts[-1] + si = sout[i] + varname = sout.name+'_{}'.format(i) + assert (ishape==oshape).all() + if (nprocs[-1] == 1): + # accumulate right outer ghosts to left inner ghosts + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_to_right_ghosts', + src=si, src_slices=lsrc, dst=si, dst_slices=rdst) + kl += OpenClCopyBufferRectLauncher.from_slices(varname=varname+'_to_left_ghosts', + src=si, src_slices=rsrc, dst=si, dst_slices=ldst) + pass else: msg='MPI exchange not implemented yet.' raise RuntimeError(msg) + + self.accumulate_and_exchange = kl + return kl + @op_apply def apply(self, simulation, **kargs): queue = self.cl_env.default_queue dt = self.precision(simulation.time_step * self.dt_coeff) - print 'DT' - print float(dt) - evt = self.advection_kernel_launcher(queue=queue, dt=dt) - evt.wait() - print 'POSITION' - print self.dposition(0).get() - - evt = self.remesh_kernel_launcher(queue=queue) + #evt = self.advection_kernel_launcher(queue=queue, dt=dt) + #evt = self.remesh_kernel_launcher(queue=queue) + #evt = self.accumulate_and_exchange(queue=queue) + evt = self.all_kernels(queue=queue, dt=dt) evt.wait() - print 'VORTICITY OUT' - print self.dadvected_fields_out.values()[0][0].get() diff --git a/hysop/backend/host/python/operator/directional/advection_dir.py b/hysop/backend/host/python/operator/directional/advection_dir.py index b6446508a14d820a13c499659004215a79bb3edc..3358116d21ce1bce64917db9d5542ce9aecc1df7 100644 --- a/hysop/backend/host/python/operator/directional/advection_dir.py +++ b/hysop/backend/host/python/operator/directional/advection_dir.py @@ -138,7 +138,7 @@ class PythonDirectionalAdvection(DirectionalAdvectionBase, HostDirectionalOperat self._compute_remesh(dt) print 'VORTICITY OUT' - print self.dadvected_fields_out.values()[0][0] + self.dadvected_fields_out.values()[0].print_with_ghosts() def _interp_velocity(self, Vin, Vout, dX, I, Ig, lidx, ridx, inv_dx, is_periodic): """ @@ -375,7 +375,7 @@ class PythonDirectionalAdvection(DirectionalAdvectionBase, HostDirectionalOperat ghost_exchangers = self._out_ghost_exchangers for sout in dsoutputs.values(): print 'VORTICITY PRE-REDISTRIBUTE' - print sout(0) + print sout[0] buffers = sout.buffers outer_ghosts = self._outer_ghosts[sout] inner_ghosts = self._inner_ghosts[sout] diff --git a/hysop/core/memory/memory_request.py b/hysop/core/memory/memory_request.py index 979fab660a48ae3f3a8116c49d7a4458c3d5e81c..6b0e8ab779ed03a4ca365d4356dd6af2ae0b2e70 100644 --- a/hysop/core/memory/memory_request.py +++ b/hysop/core/memory/memory_request.py @@ -290,7 +290,6 @@ class MultipleOperatorMemoryRequests(object): start_idx += align_offset end_idx = start_idx + size - print 'VIEW', start_idx, req.alignment view = data[start_idx:end_idx].view(dtype=req.dtype).reshape(req.shape) req_views.append(view) diff --git a/hysop/fields/cartesian_discrete_field.py b/hysop/fields/cartesian_discrete_field.py index 3560a64456483443b3a5fdebd31fc9af662f1149..22dc78ead3ca5c096e1e32040d3ea94d9e48cc7e 100644 --- a/hysop/fields/cartesian_discrete_field.py +++ b/hysop/fields/cartesian_discrete_field.py @@ -242,6 +242,9 @@ class CartesianDiscreteFieldView(DiscreteFieldView): def _get_grid_npoints(self): """Return the effective number of global computational points.""" return self.mesh.grid_npoints + def _get_space_step(self): + """Get the space step of the discretized mesh grid.""" + return self.mesh.space_step def _get_boundaries(self): """ @@ -599,6 +602,7 @@ CartesianDiscreteFieldView (id={}, tag={}) resolution = property(_get_resolution) ghosts = property(_get_ghosts) boundaries = property(_get_boundaries) + space_step = property(_get_space_step) compute_slices = property(_get_compute_slices) inner_ghost_slices = property(get_inner_ghost_slices) diff --git a/hysop/mesh/cartesian_mesh.py b/hysop/mesh/cartesian_mesh.py index 196f5a1a00288a3930c5bdf2ee96cc1a4435c185..d0a4a0e919ed6f69f710d64f12da00a1badc8721 100644 --- a/hysop/mesh/cartesian_mesh.py +++ b/hysop/mesh/cartesian_mesh.py @@ -307,9 +307,7 @@ class CartesianMeshView(MeshView): """ return self.__get_transposed_mesh_attr('_ghosts') def _get_space_step(self): - """ - Get the space step of the discretized mesh grid. - """ + """Get the space step of the discretized mesh grid.""" return self.__get_transposed_mesh_attr('_space_step') def _get_is_at_left_boundary(self):