From 4d609aed7ed6edfe35d8ee3684c6d6074dfcf6db Mon Sep 17 00:00:00 2001 From: JM Etancelin <jean-matthieu.etancelin@univ-pau.fr> Date: Mon, 30 Nov 2020 15:20:32 +0100 Subject: [PATCH] Improve profiling : add OpenCL timings to HySoP Profiler objects. --- .../opencl/opencl_copy_kernel_launchers.py | 429 +++++++++--------- .../device/opencl/opencl_kernel_launcher.py | 305 +++++++------ .../device/opencl/operator/custom_symbolic.py | 22 +- .../operator/directional/advection_dir.py | 65 +-- .../operator/directional/stretching_dir.py | 156 +++---- .../opencl/operator/solenoidal_projection.py | 39 +- .../opencl/operator/spatial_filtering.py | 103 ++--- .../device/opencl/operator/transpose.py | 46 +- hysop/iterative_method.py | 6 +- 9 files changed, 598 insertions(+), 573 deletions(-) diff --git a/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py b/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py index 353a6755b..507bc1f49 100644 --- a/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py +++ b/hysop/backend/device/opencl/opencl_copy_kernel_launchers.py @@ -1,7 +1,7 @@ - from hysop import vprint, dprint, __KERNEL_DEBUG__, __TRACE_KERNELS__ from hysop.deps import np from hysop.constants import Backend +from hysop.tools.profiler import FProfiler from hysop.tools.decorators import debug from hysop.tools.types import check_instance, first_not_None, to_list from hysop.tools.misc import prod @@ -15,10 +15,10 @@ from hysop.backend.device.opencl.opencl_kernel_statistics import OpenClKernelSta class OpenClCopyKernelLauncher(OpenClKernelLauncher): """Interface to non-blocking OpenCL copy kernels.""" - + @debug def __init__(self, name, dst, src, - enqueue_copy_kwds, apply_msg, **kwds): + enqueue_copy_kwds, apply_msg, **kwds): """ Initialize an OpenClCopyKernelLauncher. @@ -28,27 +28,27 @@ class OpenClCopyKernelLauncher(OpenClKernelLauncher): Arguments to to passed to pyopencl.enqueue_copy. """ assert 'default_global_work_size' not in kwds - assert 'default_local_work_size' not in kwds + assert 'default_local_work_size' not in kwds assert 'is_blocking' not in kwds enqueue_copy_kwds['dest'] = dst - enqueue_copy_kwds['src'] = src + enqueue_copy_kwds['src'] = src if isinstance(src, np.ndarray) or isinstance(dst, np.ndarray): enqueue_copy_kwds['is_blocking'] = False super(OpenClCopyKernelLauncher, self).__init__(name=name, - kernel=None, args_list=(), **kwds) - + kernel=None, args_list=(), **kwds) + self._enqueue_copy_kwds = enqueue_copy_kwds self._apply_msg = apply_msg def _get_enqueue_copy_kwds(self): """ - Return a copy of the keywords arguments that will be passed + Return a copy of the keywords arguments that will be passed to pyopencl.enqueue_copy. """ return dict(self._enqueue_copy_kwds.items()) - + def __call__(self, queue=None, wait_for=None, **kwds): trace_kernel(' '+self._apply_msg) queue = first_not_None(queue, self._default_queue) @@ -56,22 +56,24 @@ class OpenClCopyKernelLauncher(OpenClKernelLauncher): wait_for = to_list(wait_for) check_instance(queue, cl.CommandQueue) evt = cl.enqueue_copy(queue=queue, wait_for=wait_for, - **self._enqueue_copy_kwds) - profile_kernel(None, evt, self._apply_msg) + **self._enqueue_copy_kwds) + profile_kernel(None, evt, self._apply_msg, fprofiler=self._profiler) return evt def global_size_configured(self): return True - + enqueue_copy_kwds = property(_get_enqueue_copy_kwds) + class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher): """Non-blocking OpenCL copy kernel between host buffers and/or opencl device buffers.""" + def __init__(self, varname, src, dst, - src_device_offset=None, - dst_device_offset=None, - byte_count=None, - **kwds): + src_device_offset=None, + dst_device_offset=None, + byte_count=None, + **kwds): """ Initialize a (HOST <-> DEVICE) or a (DEVICE <-> DEVICE) copy kernel. @@ -84,22 +86,22 @@ class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher): dst: cl.MemoryObjectHolder or np.ndarray The destination buffer. src_device_offset: int, optional - Offset in the source buffer, only valid if + Offset in the source buffer, only valid if source buffer is a cl.MemoryObjectHolder. dst_device_offset: int, optional - Offset in the source buffer, only valid if + Offset in the source buffer, only valid if source buffer is a cl.MemoryObjectHolder. byte_count: int Byte count to copy if and only if source and destination buffers are cl.MemoryObjectHolders. - + Notes ----- - The size of the transfer is controlled by the size of the of the host-side buffer. - If the host-side buffer is a numpy.ndarray, you can control the transfer size + The size of the transfer is controlled by the size of the of the host-side buffer. + If the host-side buffer is a numpy.ndarray, you can control the transfer size by transfering into a smaller view of the target array by using indexing, - If neither src nor dst are host buffers, the size is controlled by the + If neither src nor dst are host buffers, the size is controlled by the parameter byte_count. Device buffers cannot have views like np.ndarrays, an offset in bytes can @@ -111,8 +113,8 @@ class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher): check_instance(dst_device_offset, (int, np.integer), allow_none=True) check_instance(byte_count, (int, np.integer), allow_none=True) - msg='Host to host copy is not supported.' - assert not (isinstance(src, np.ndarray) and isinstance(dst, np.ndarray)), msg + msg = 'Host to host copy is not supported.' + assert not (isinstance(src, np.ndarray) and isinstance(dst, np.ndarray)), msg enqueue_copy_kwds = {} if (src_device_offset is not None): @@ -126,35 +128,35 @@ class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher): assert isinstance(dst, cl.MemoryObjectHolder) enqueue_copy_kwds['byte_count'] = byte_count - shape = first_not_None((byte_count,), - getattr(src, 'shape', None), - getattr(dst, 'shape', None), - '...') - + shape = first_not_None((byte_count,), + getattr(src, 'shape', None), + getattr(dst, 'shape', None), + '...') + assert 'name' not in kwds name = 'enqueue_copy_{}__{}_to_{}'.format(varname, - 'host' if isinstance(src, np.ndarray) else 'device', - 'host' if isinstance(dst, np.ndarray) else 'device') - apply_msg='{}<<<{}>>>'.format(name, shape) - - super(OpenClCopyBufferLauncher, self).__init__(dst=dst, src=src, - enqueue_copy_kwds=enqueue_copy_kwds, - name=name, apply_msg=apply_msg, **kwds) - + 'host' if isinstance(src, np.ndarray) else 'device', + 'host' if isinstance(dst, np.ndarray) else 'device') + apply_msg = '{}<<<{}>>>'.format(name, shape) + + super(OpenClCopyBufferLauncher, self).__init__(dst=dst, src=src, + enqueue_copy_kwds=enqueue_copy_kwds, + name=name, apply_msg=apply_msg, **kwds) + def _format_host_arg(self, arg): if isinstance(arg, HostArray): - arg = arg.data + arg = arg.data nbytes = arg.nbytes elif isinstance(arg, np.ndarray): nbytes = arg.size * arg.dtype.itemsize else: - msg='Unknown type {} to format device buffer arguments.' - msg=msg.format(type(arg)) + msg = 'Unknown type {} to format device buffer arguments.' + msg = msg.format(type(arg)) raise TypeError(msg) return arg, nbytes def _format_device_arg(self, arg, arg_offset): - nbytes=None + nbytes = None if isinstance(arg, (OpenClArray, clArray.Array)): arg_offset = first_not_None(arg_offset, 0) nbytes = arg.nbytes - arg_offset @@ -163,13 +165,15 @@ class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher): elif isinstance(arg, cl.MemoryObjectHolder): pass else: - msg='Unknown type {} to format device buffer arguments.' - msg=msg.format(type(arg)) + msg = 'Unknown type {} to format device buffer arguments.' + msg = msg.format(type(arg)) raise TypeError(msg) return (arg, arg_offset, nbytes) + class OpenClCopyHost2DeviceLauncher(OpenClCopyBufferLauncher): """Reduced interface for host to device copy kernels.""" + def __init__(self, varname, src, dst, dst_device_offset=None): src, src_nbytes = self._format_host_arg(src) dst, dst_device_offset, dst_nbytes = self._format_device_arg(dst, dst_device_offset) @@ -178,10 +182,12 @@ class OpenClCopyHost2DeviceLauncher(OpenClCopyBufferLauncher): check_instance(dst_device_offset, (int, np.integer), allow_none=True) assert (src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes) super(OpenClCopyHost2DeviceLauncher, self).__init__(varname=varname, src=src, - dst=dst, dst_device_offset=dst_device_offset) + dst=dst, dst_device_offset=dst_device_offset) + class OpenClCopyDevice2HostLauncher(OpenClCopyBufferLauncher): """Reduced interface for device to host copy kernels.""" + def __init__(self, varname, src, dst, src_device_offset=None): src, src_device_offset, src_nbytes = self._format_device_arg(src, src_device_offset) dst, dst_nbytes = self._format_host_arg(dst) @@ -190,12 +196,14 @@ class OpenClCopyDevice2HostLauncher(OpenClCopyBufferLauncher): check_instance(src_device_offset, (int, np.integer), allow_none=True) assert (src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes) super(OpenClCopyDevice2HostLauncher, self).__init__(varname=varname, src=src, - dst=dst, src_device_offset=src_device_offset) + dst=dst, src_device_offset=src_device_offset) + class OpenClCopyDevice2DeviceLauncher(OpenClCopyBufferLauncher): """Reduced interface for device to device copy kernels.""" - def __init__(self, varname, src, dst, - src_device_offset=None, dst_device_offset=None, byte_count=None): + + def __init__(self, varname, src, dst, + src_device_offset=None, dst_device_offset=None, byte_count=None): src, src_device_offset, src_nbytes = self._format_device_arg(src, src_device_offset) dst, dst_device_offset, dst_nbytes = self._format_device_arg(dst, dst_device_offset) byte_count = first_not_None(byte_count, min(src_nbytes, dst_nbytes)) @@ -205,25 +213,25 @@ class OpenClCopyDevice2DeviceLauncher(OpenClCopyBufferLauncher): check_instance(dst_device_offset, (int, np.integer), allow_none=True) check_instance(byte_count, (int, np.integer), allow_none=True) assert (src_nbytes is None) or (dst_nbytes is None) or (src_nbytes == dst_nbytes) - super(OpenClCopyDevice2DeviceLauncher, self).__init__(varname=varname, - src=src, dst=dst, - src_device_offset=src_device_offset, dst_device_offset=dst_device_offset, - byte_count=byte_count) - + super(OpenClCopyDevice2DeviceLauncher, self).__init__(varname=varname, + src=src, dst=dst, + src_device_offset=src_device_offset, dst_device_offset=dst_device_offset, + byte_count=byte_count) class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): """ - Non-blocking OpenCL copy kernel between host buffers and/or opencl device + Non-blocking OpenCL copy kernel between host buffers and/or opencl device rectangle subregions of buffers (OpenCL 1.1 and newer). - Supports n-dimensional strided arrays with dimension greater than 3 + Supports n-dimensional strided arrays with dimension greater than 3 via iterating over 3D subregions. """ - def __init__(self, varname, src, dst, - copy_region, copy_src_origin, copy_dst_origin, copy_src_pitches, copy_dst_pitches, - iter_region=None, iter_src_origin=None, iter_dst_origin=None, iter_src_pitches=None, iter_dst_pitches=None, - **kwds): + + def __init__(self, varname, src, dst, + copy_region, copy_src_origin, copy_dst_origin, copy_src_pitches, copy_dst_pitches, + iter_region=None, iter_src_origin=None, iter_dst_origin=None, iter_src_pitches=None, iter_dst_pitches=None, + **kwds): """ Initialize a (HOST <-> DEVICE) or a (DEVICE <-> DEVICE) rectangle subregions copy kernel. @@ -237,19 +245,19 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): dst: cl.MemoryObjectHolder or np.ndarray The destination buffer. copy_region: tuple of ints - The 3D region to copy in terms of bytes for the + The 3D region to copy in terms of bytes for the first dimension and of elements for the two last dimensions. copy_src_origin: tuple of ints - The 3D offset in number of elements of the region associated with src buffer. + The 3D offset in number of elements of the region associated with src buffer. The final src offset in bytes is computed from src_origin and src_pitch. copy_dst_origin: tuple of ints - The 3D offset in number of elements of the region associated with dst buffer. + The 3D offset in number of elements of the region associated with dst buffer. The final dst offset in bytes is computed from dst_origin and dst_pitch. copy_src_pitches: tuple of ints - The 2D pitches used to compute src offsets in bytes for + The 2D pitches used to compute src offsets in bytes for the second and the third dimension. copy_dst_pitches: tuple of ints - The 2D pitches used to compute dst offsets in bytes for + The 2D pitches used to compute dst offsets in bytes for the second and the third dimension. iter_region: tuple of ints The n-dimensional region to iterate if the copied region dimension is greater than 3. @@ -264,9 +272,9 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): kwds: dict Base class arguments """ - iter_region = first_not_None(iter_region, ()) - iter_src_origin = first_not_None(iter_src_origin, ()) - iter_dst_origin = first_not_None(iter_dst_origin, ()) + iter_region = first_not_None(iter_region, ()) + iter_src_origin = first_not_None(iter_src_origin, ()) + iter_dst_origin = first_not_None(iter_dst_origin, ()) iter_src_pitches = first_not_None(iter_src_pitches, ()) iter_dst_pitches = first_not_None(iter_dst_pitches, ()) @@ -278,77 +286,77 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): check_instance(copy_dst_origin, tuple, values=(int, np.integer), size=3) check_instance(copy_src_pitches, tuple, values=(int, np.integer), size=2) check_instance(copy_dst_pitches, tuple, values=(int, np.integer), size=2) - + n = len(iter_region) check_instance(iter_region, tuple, values=(int, np.integer), size=n) check_instance(iter_src_origin, tuple, values=(int, np.integer), size=n) check_instance(iter_dst_origin, tuple, values=(int, np.integer), size=n) check_instance(iter_src_pitches, tuple, values=(int, np.integer), size=n) check_instance(iter_dst_pitches, tuple, values=(int, np.integer), size=n) - + enqueue_copy_kwds = {} enqueue_copy_kwds['region'] = copy_region if isinstance(src, np.ndarray) and \ isinstance(dst, np.ndarray): - msg='Host to host copy is not supported.' + msg = 'Host to host copy is not supported.' raise RuntimeError(msg) elif isinstance(src, cl.MemoryObjectHolder) and \ - isinstance(dst, cl.MemoryObjectHolder): - enqueue_copy_kwds['src_origin'] = copy_src_origin + isinstance(dst, cl.MemoryObjectHolder): + enqueue_copy_kwds['src_origin'] = copy_src_origin enqueue_copy_kwds['src_pitches'] = copy_src_pitches - enqueue_copy_kwds['dst_origin'] = copy_dst_origin + enqueue_copy_kwds['dst_origin'] = copy_dst_origin enqueue_copy_kwds['dst_pitches'] = copy_dst_pitches src_origin_kwd = 'src_origin' dst_origin_kwd = 'dst_origin' elif isinstance(src, cl.MemoryObjectHolder) and \ - isinstance(dst, np.ndarray): - enqueue_copy_kwds['host_origin'] = copy_dst_origin - enqueue_copy_kwds['host_pitches'] = copy_dst_pitches - enqueue_copy_kwds['buffer_origin'] = copy_src_origin + isinstance(dst, np.ndarray): + enqueue_copy_kwds['host_origin'] = copy_dst_origin + enqueue_copy_kwds['host_pitches'] = copy_dst_pitches + enqueue_copy_kwds['buffer_origin'] = copy_src_origin enqueue_copy_kwds['buffer_pitches'] = copy_src_pitches src_origin_kwd = 'buffer_origin' dst_origin_kwd = 'host_origin' elif isinstance(src, np.ndarray) and \ - isinstance(dst, cl.MemoryObjectHolder): - enqueue_copy_kwds['host_origin'] = copy_src_origin - enqueue_copy_kwds['host_pitches'] = copy_src_pitches - enqueue_copy_kwds['buffer_origin'] = copy_dst_origin + isinstance(dst, cl.MemoryObjectHolder): + enqueue_copy_kwds['host_origin'] = copy_src_origin + enqueue_copy_kwds['host_pitches'] = copy_src_pitches + enqueue_copy_kwds['buffer_origin'] = copy_dst_origin enqueue_copy_kwds['buffer_pitches'] = copy_dst_pitches src_origin_kwd = 'host_origin' dst_origin_kwd = 'buffer_origin' else: - msg='The impossible happened.\n *src={}\n *dst={}' - msg=msg.format(type(src), type(dst)) + msg = 'The impossible happened.\n *src={}\n *dst={}' + msg = msg.format(type(src), type(dst)) raise ValueError(msg) assert 'name' not in kwds name = 'enqueue_copy_rect_{}__{}_to_{}'.format(varname, - 'host' if isinstance(src, np.ndarray) else 'device', - 'host' if isinstance(dst, np.ndarray) else 'device') - apply_msg='{}<<<{}>>>()' - apply_msg=apply_msg.format(name, copy_region) - + 'host' if isinstance(src, np.ndarray) else 'device', + 'host' if isinstance(dst, np.ndarray) else 'device') + apply_msg = '{}<<<{}>>>()' + apply_msg = apply_msg.format(name, copy_region) + # if iteration is required, we redefine __call__ - if (n>0): + if (n > 0): apply_msg += ' iterated over ndindex {}'.format(iter_region) assert src_origin_kwd in enqueue_copy_kwds assert dst_origin_kwd in enqueue_copy_kwds src_origin = enqueue_copy_kwds.pop(src_origin_kwd) dst_origin = enqueue_copy_kwds.pop(dst_origin_kwd) - super(OpenClCopyBufferRectLauncher, self).__init__(dst=dst, src=src, - enqueue_copy_kwds=enqueue_copy_kwds, - name=name, apply_msg=apply_msg, **kwds) - - if (n>0): - def call(queue=None, wait_for=None, - iter_region=iter_region, - iter_src_origin=iter_src_origin, - iter_dst_origin=iter_dst_origin, - iter_src_pitches=iter_src_pitches, - iter_dst_pitches=iter_dst_pitches, - **kwds): + super(OpenClCopyBufferRectLauncher, self).__init__(dst=dst, src=src, + enqueue_copy_kwds=enqueue_copy_kwds, + name=name, apply_msg=apply_msg, **kwds) + + if (n > 0): + def call(queue=None, wait_for=None, + iter_region=iter_region, + iter_src_origin=iter_src_origin, + iter_dst_origin=iter_dst_origin, + iter_src_pitches=iter_src_pitches, + iter_dst_pitches=iter_dst_pitches, + **kwds): if __KERNEL_DEBUG__ or __TRACE_KERNELS__: print ' '+self._apply_msg queue = first_not_None(queue, self._default_queue) @@ -362,7 +370,8 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): enqueue_copy_kwds[src_origin_kwd] = _src_origin enqueue_copy_kwds[dst_origin_kwd] = _dst_origin evt = cl.enqueue_copy(queue=queue, wait_for=wait_for, - **enqueue_copy_kwds) + **enqueue_copy_kwds) + profile_kernel(None, evt, self._apply_msg, fprofiler=self._profiler) wait_for = None return evt self.call = call @@ -374,15 +383,14 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): return super(OpenClCopyBufferRectLauncher, self).__call__(*args, **kwds) else: return self.call(*args, **kwds) - @classmethod def _format_slices(cls, a, slices): check_instance(a, (np.ndarray, clArray.Array, Array)) - shape = a.shape + shape = a.shape dtype = a.dtype - ndim = a.ndim + ndim = a.ndim if (not slices) or (slices is Ellipsis): slices = (Ellipsis,) @@ -391,31 +399,31 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): # expand ellipsis if (Ellipsis in slices): nellipsis = slices.count(Ellipsis) - msg='Only one Ellipsis can be passed.' - assert nellipsis==1, msg + msg = 'Only one Ellipsis can be passed.' + assert nellipsis == 1, msg eid = slices.index(Ellipsis) missing_count = ndim-len(slices) missing_slices = tuple(slice(shape[i]) for i in xrange(eid, eid+missing_count+1)) full_slices = slices[:eid]+missing_slices+slices[eid+1:] slices = full_slices - check_instance(slices, tuple, values=(int,slice), size=ndim) + check_instance(slices, tuple, values=(int, slice), size=ndim) # compute indices indices = () for slc, si in zip(slices, shape): if (slc.stop is not None) and (slc.stop > si): - msg='Error in slice specification: slc={} but size is only {}.' - msg=msg.format(slc, si) + msg = 'Error in slice specification: slc={} but size is only {}.' + msg = msg.format(slc, si) raise ValueError(msg) if isinstance(slc, slice): indices += (slc.indices(si),) - else: + else: indices += ((slc, slc+1, 1),) - - nelems = tuple( (idx[1]-idx[0]+idx[2]-1)//idx[2] for idx in indices ) + + nelems = tuple((idx[1]-idx[0]+idx[2]-1)//idx[2] for idx in indices) nbytes = prod(nelems) * dtype.itemsize return slices, dtype, nelems, nbytes, indices - + @classmethod def _compute_region(cls, a, indices): # compute nelems and parameters @@ -433,112 +441,111 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): start_offset = a.offset if isinstance(a, Array): a = a.handle - - shape = a.shape + + shape = a.shape strides = a.strides - dtype = a.dtype - estart = tuple( idx[0] for idx in indices ) - estop = tuple( idx[1] for idx in indices ) - estep = tuple( idx[2] for idx in indices ) + dtype = a.dtype + estart = tuple(idx[0] for idx in indices) + estop = tuple(idx[1] for idx in indices) + estep = tuple(idx[2] for idx in indices) assert len(shape) == len(strides) == len(estep) == len(estart) == len(estop) _estart, _estop, _estep = (npw.asintegerarray(_) for _ in (estart, estop, estep)) if ((_estart % _estep) != 0).any(): - msg='Start is not aligned on step, cannot compute origin.' + msg = 'Start is not aligned on step, cannot compute origin.' raise ValueError(msg) if ((_estop % _estep) != 0).any(): - msg='Stop is not aligned on step, cannot compute region.' + msg = 'Stop is not aligned on step, cannot compute region.' raise ValueError(msg) if (estep[-1] != 1): - msg='Array is not contiguous (last slice step should be 1).' + msg = 'Array is not contiguous (last slice step should be 1).' raise ValueError(msg) if (strides[-1] != dtype.itemsize): - msg='Array is not contiguous (last strides should be item size).' + msg = 'Array is not contiguous (last strides should be item size).' raise ValueError(msg) - region, origin, pitches = (),(),() + region, origin, pitches = (), (), () for (Si, Sr, start, stop, step) in zip(shape, strides, estart, estop, estep): Ni = (stop-start+step-1)//step - if (Ni<=0) or (Ni>Si): - msg='Ni={}, Si={}'.format(Ni, Si) + if (Ni <= 0) or (Ni > Si): + msg = 'Ni={}, Si={}'.format(Ni, Si) raise ValueError(msg) - elif (not region) or (Ni <= Si): - region += (Ni,) - origin += (start//step,) + elif (not region) or (Ni <= Si): + region += (Ni,) + origin += (start//step,) pitches += (step*Sr,) - region = np.asarray(region, dtype=np.int32) - origin = np.asarray(origin, dtype=np.int32) + region = np.asarray(region, dtype=np.int32) + origin = np.asarray(origin, dtype=np.int32) pitches = np.asarray(pitches, dtype=np.int32) - + assert pitches[-1] == dtype.itemsize pitches = pitches[:-1] region[-1] *= dtype.itemsize - origin[-1] *= dtype.itemsize + origin[-1] *= dtype.itemsize origin[-1] += start_offset return data, region, origin, pitches - @classmethod def from_slices(cls, varname, src, dst, src_slices=None, dst_slices=None): """ - Build an OpenClCopyBufferRectLauncher from source, destinations + Build an OpenClCopyBufferRectLauncher from source, destinations and some slices. Device arrays must be aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN. """ - assert hasattr(src, 'shape') - assert hasattr(src, 'dtype') - assert hasattr(src, 'strides') - assert hasattr(dst, 'shape') - assert hasattr(dst, 'dtype') - assert hasattr(dst, 'strides') - + assert hasattr(src, 'shape') + assert hasattr(src, 'dtype') + assert hasattr(src, 'strides') + assert hasattr(dst, 'shape') + assert hasattr(dst, 'dtype') + assert hasattr(dst, 'strides') + msg0 = 'OpenClCopyBufferRectLauncher.from_slices()' - msg0+='\n *Inputs were:' - msg0+='\n src: shape={}, dtype={}, slices={}' - msg0+='\n dst: shape={}, dtype={}, slices={}' - msg0+='\n *Slices conversions were:' - msg0+='\n src_slices: {}' - msg0+='\n dst_slices: {}' - msg0 = msg0.format(src.shape, src.dtype, src_slices, + msg0 += '\n *Inputs were:' + msg0 += '\n src: shape={}, dtype={}, slices={}' + msg0 += '\n dst: shape={}, dtype={}, slices={}' + msg0 += '\n *Slices conversions were:' + msg0 += '\n src_slices: {}' + msg0 += '\n dst_slices: {}' + msg0 = msg0.format(src.shape, src.dtype, src_slices, dst.shape, dst.dtype, dst_slices, '{}', '{}') src_slices, src_dtype, src_nelems, src_bytes, src_indices = \ - cls._format_slices(src, src_slices) + cls._format_slices(src, src_slices) dst_slices, dst_dtype, dst_nelems, dst_bytes, dst_indices = \ - cls._format_slices(dst, dst_slices) + cls._format_slices(dst, dst_slices) msg0 = msg0.format(src_slices, dst_slices) if (src_bytes != dst_bytes): - msg0+='\n >Error: byte size mismatch between source and destination slices:' + msg0 += '\n >Error: byte size mismatch between source and destination slices:' else: - msg0+='\n *Data types and byte count:' - msg0+='\n src: nelems={}, dtype={}, bytes={} ({}B)' - msg0+='\n dst: nelems={}, dtype={}, bytes={} ({}B)' - msg0=msg0.format(src_nelems, src_dtype, bytes2str(src_bytes), src_bytes, - dst_nelems, dst_dtype, bytes2str(dst_bytes), dst_bytes) + msg0 += '\n *Data types and byte count:' + msg0 += '\n src: nelems={}, dtype={}, bytes={} ({}B)' + msg0 += '\n dst: nelems={}, dtype={}, bytes={} ({}B)' + msg0 = msg0.format(src_nelems, src_dtype, bytes2str(src_bytes), src_bytes, + dst_nelems, dst_dtype, bytes2str(dst_bytes), dst_bytes) if (src_bytes != dst_bytes): raise ValueError(msg0) - + src_data, src_region, src_origin, src_pitches = cls._compute_region(src, src_indices) dst_data, dst_region, dst_origin, dst_pitches = cls._compute_region(dst, dst_indices) if (src_region != dst_region).any(): - msg0 +='\n >Error: mismatch between source and destination regions:' + msg0 += '\n >Error: mismatch between source and destination regions:' else: msg0 += '\n *Determined regions:' - msg0+='\n src: region={}, origin={}, pitches={}' - msg0+='\n dst: region={}, origin={}, pitches={}' - msg0=msg0.format(src_region, src_origin, src_pitches, - dst_region, dst_origin, dst_pitches) + msg0 += '\n src: region={}, origin={}, pitches={}' + msg0 += '\n dst: region={}, origin={}, pitches={}' + msg0 = msg0.format(src_region, src_origin, src_pitches, + dst_region, dst_origin, dst_pitches) if (src_region != dst_region).any(): raise ValueError(msg0) region = src_region - if (region<=0).any(): - msg ='\n >Error: region is ill-formed or zero-sized:' - msg+='\n region: {}' + if (region <= 0).any(): + msg = '\n >Error: region is ill-formed or zero-sized:' + msg += '\n region: {}' msg = msg.format(region) raise ValueError(msg0+msg) @@ -547,62 +554,61 @@ class OpenClCopyBufferRectLauncher(OpenClCopyKernelLauncher): iter_dims = total_dims - copy_dims assert copy_dims > 0 assert iter_dims >= 0 - + zero, one = np.int32(0), np.int32(1) copy_region = [one]*3 copy_src_origin, copy_dst_origin = [zero]*3, [zero]*3 copy_src_pitches, copy_dst_pitches = [zero]*2, [zero]*2 - copy_region[:copy_dims] = region[::-1][:copy_dims] - copy_src_origin[:copy_dims] = src_origin[::-1][:copy_dims] - copy_dst_origin[:copy_dims] = dst_origin[::-1][:copy_dims] + copy_region[:copy_dims] = region[::-1][:copy_dims] + copy_src_origin[:copy_dims] = src_origin[::-1][:copy_dims] + copy_dst_origin[:copy_dims] = dst_origin[::-1][:copy_dims] copy_src_pitches[:copy_dims-1] = src_pitches[::-1][:copy_dims-1] copy_dst_pitches[:copy_dims-1] = dst_pitches[::-1][:copy_dims-1] - copy_region = tuple(copy_region) - copy_src_origin = tuple(copy_src_origin) - copy_dst_origin = tuple(copy_dst_origin) + copy_region = tuple(copy_region) + copy_src_origin = tuple(copy_src_origin) + copy_dst_origin = tuple(copy_dst_origin) copy_src_pitches = tuple(copy_src_pitches) copy_dst_pitches = tuple(copy_dst_pitches) - - iter_region = tuple(region[:iter_dims]) - iter_src_origin = tuple(src_origin[:iter_dims]) - iter_dst_origin = tuple(dst_origin[:iter_dims]) + + iter_region = tuple(region[:iter_dims]) + iter_src_origin = tuple(src_origin[:iter_dims]) + iter_dst_origin = tuple(dst_origin[:iter_dims]) iter_src_pitches = tuple(src_pitches[:iter_dims]) iter_dst_pitches = tuple(dst_pitches[:iter_dims]) - msg0+='\n *Dimensions:' - msg0+='\n total: {}' - msg0+='\n copy: {}' - msg0+='\n iter: {}' - msg0=msg0.format(total_dims, copy_dims, iter_dims) - - msg0+='\n *enqueue_copy kernel arguments:' - msg0+='\n region: {}' - msg0+='\n src: origin={}, pitches={}' - msg0+='\n dst: origin={}, pitches={}' - msg0=msg0.format(copy_region, - copy_src_origin, copy_src_pitches, - copy_dst_origin, copy_dst_pitches) - - msg0+='\n *iter arguments:' - msg0+='\n region: {}' - msg0+='\n src: origin={}, pitches={}' - msg0+='\n dst: origin={}, pitches={}' - msg0=msg0.format(iter_region, - iter_src_origin, iter_src_pitches, - iter_dst_origin, iter_dst_pitches) + msg0 += '\n *Dimensions:' + msg0 += '\n total: {}' + msg0 += '\n copy: {}' + msg0 += '\n iter: {}' + msg0 = msg0.format(total_dims, copy_dims, iter_dims) + + msg0 += '\n *enqueue_copy kernel arguments:' + msg0 += '\n region: {}' + msg0 += '\n src: origin={}, pitches={}' + msg0 += '\n dst: origin={}, pitches={}' + msg0 = msg0.format(copy_region, + copy_src_origin, copy_src_pitches, + copy_dst_origin, copy_dst_pitches) + + msg0 += '\n *iter arguments:' + msg0 += '\n region: {}' + msg0 += '\n src: origin={}, pitches={}' + msg0 += '\n dst: origin={}, pitches={}' + msg0 = msg0.format(iter_region, + iter_src_origin, iter_src_pitches, + iter_dst_origin, iter_dst_pitches) #print msg0 - - return cls(varname=varname, - src=src_data, dst=dst_data, - copy_region=copy_region, - copy_src_origin=copy_src_origin, copy_dst_origin=copy_dst_origin, - copy_src_pitches=copy_src_pitches, copy_dst_pitches=copy_dst_pitches, - iter_region=iter_region, - iter_src_origin=iter_src_origin, iter_dst_origin=iter_dst_origin, - iter_src_pitches=iter_src_pitches, iter_dst_pitches=iter_dst_pitches) + return cls(varname=varname, + src=src_data, dst=dst_data, + copy_region=copy_region, + copy_src_origin=copy_src_origin, copy_dst_origin=copy_dst_origin, + copy_src_pitches=copy_src_pitches, copy_dst_pitches=copy_dst_pitches, + iter_region=iter_region, + iter_src_origin=iter_src_origin, iter_dst_origin=iter_dst_origin, + iter_src_pitches=iter_src_pitches, iter_dst_pitches=iter_dst_pitches) class OpenClFillKernelLauncher(OpenClCopyBufferRectLauncher): @@ -621,9 +627,9 @@ class OpenClFillKernelLauncher(OpenClCopyBufferRectLauncher): fill_value = dst.dtype.type(fill_value) src = cls.create_fill_buffer(backend, dtype, shape, fill_value) - + obj = super(OpenClFillKernelLauncher, cls).from_slices(varname=varname, - src=src, dst=dst) + src=src, dst=dst) return obj @classmethod @@ -635,8 +641,7 @@ class OpenClFillKernelLauncher(OpenClCopyBufferRectLauncher): if (key in cls.__fill_buffers): buf = cls.__fill_buffers[key] else: - buf = backend.full(dtype=dtype, shape=shape, - fill_value=fill_value) + buf = backend.full(dtype=dtype, shape=shape, + fill_value=fill_value) cls.__fill_buffers[key] = buf return buf.reshape(shape) - diff --git a/hysop/backend/device/opencl/opencl_kernel_launcher.py b/hysop/backend/device/opencl/opencl_kernel_launcher.py index ceebbb47b..54dac149d 100644 --- a/hysop/backend/device/opencl/opencl_kernel_launcher.py +++ b/hysop/backend/device/opencl/opencl_kernel_launcher.py @@ -1,5 +1,5 @@ from abc import ABCMeta, abstractmethod -from hysop import __KERNEL_DEBUG__, __TRACE_KERNELS__, __TRACE_NOCOPY__, __TRACE_NOACCUMULATE__ +from hysop import __KERNEL_DEBUG__, __TRACE_KERNELS__, __TRACE_NOCOPY__, __TRACE_NOACCUMULATE__ from hysop.deps import it, warnings from hysop.tools.decorators import debug from hysop.tools.types import check_instance, first_not_None @@ -7,29 +7,32 @@ from hysop.tools.numpywrappers import npw from hysop.backend.device.opencl import cl, __OPENCL_PROFILE__ from hysop.backend.device.opencl.opencl_kernel_statistics import OpenClKernelStatistics from hysop.tools.warning import HysopWarning - +from hysop.tools.profiler import FProfiler + + def should_trace_kernel(kernel_msg): assert isinstance(kernel_msg, str) kernel_msg = kernel_msg.strip() - if __TRACE_NOCOPY__ and kernel_msg.startswith('enqueue_copy'): + if __TRACE_NOCOPY__ and kernel_msg.startswith('enqueue_copy'): return False - elif __TRACE_NOACCUMULATE__ and kernel_msg.startswith('add<<<'): + elif __TRACE_NOACCUMULATE__ and kernel_msg.startswith('add<<<'): return False else: return True + should_profile_kernel = should_trace_kernel - + if (__KERNEL_DEBUG__ or __TRACE_KERNELS__): def trace_kernel(kernel_msg): - if should_trace_kernel(kernel_msg): + if should_trace_kernel(kernel_msg): print kernel_msg else: def trace_kernel(kernel_msg): pass if __OPENCL_PROFILE__: - def profile_kernel(kernel, evt, kernel_msg=None): + def profile_kernel(kernel, evt, kernel_msg=None, fprofiler=None): evt.wait() if (kernel is None): assert (kernel_msg is not None) @@ -39,14 +42,16 @@ if __OPENCL_PROFILE__: if show_profiling_info: if not hasattr(kernel, '_apply_msg'): msg = 'Kernel of type {} has no \'_apply_msg\' attribute, this is required for profiling.' - msg=kernel_msg.format(type(kernel).__name__) + msg = kernel_msg.format(type(kernel).__name__) raise AttributeError(kernel_msg) kernel_msg = kernel._apply_msg - - if (kernel_msg is not None) and should_profile_kernel(kernel_msg): + if __KERNEL_DEBUG__ and (kernel_msg is not None) and should_profile_kernel(kernel_msg): print '{} | {}'.format(evt.profile.end - evt.profile.start, kernel_msg.strip()) + if not fprofiler is None: + fprofiler[kernel_msg] += (evt.profile.end - evt.profile.start)*1e-9 + else: - def profile_kernel(kernel, evt, kernel_msg=None): + def profile_kernel(kernel, evt, kernel_msg=None, fprofiler=None): pass @@ -57,9 +62,9 @@ class OpenClKernelListLauncher(object): (ie. OpenClKernelLauncher default_queue is not taken into account and all kernels should at least have a pre-configured global_work_size). """ - + @debug - def __init__(self, name): + def __init__(self, name, profiler=None): """ Create a OpenClKernelListLauncher. @@ -75,45 +80,46 @@ class OpenClKernelListLauncher(object): self._kernels = () self._parameters = {} self._apply_msg = '>OpenClKernelListLauncher {}'.format(name) + self._profiler = profiler def push_copy_host_device(self, varname, src, dst, - src_device_offset=None, dst_device_offset=None, byte_count=None): + src_device_offset=None, dst_device_offset=None, byte_count=None): """Shortcut for OpenClCopyBuffer kernels creation.""" from hysop.backend.device.opencl.opencl_copy_kernel_launchers import \ - OpenClCopyBufferLauncher - kernel = OpenClCopyBufferLauncher(varname=varname, - src=src, dst=dst, byte_count=byte_count, - src_device_offset=src_device_offset, dst_device_offset=dst_device_offset) + OpenClCopyBufferLauncher + kernel = OpenClCopyBufferLauncher(varname=varname, + src=src, dst=dst, byte_count=byte_count, + src_device_offset=src_device_offset, dst_device_offset=dst_device_offset) self.push_kernels(kernel) return self def push_copy_host_to_device(self, varname, src, dst, dst_device_offset=None): """Shortcut for OpenClCopyHost2Device kernels creation.""" from hysop.backend.device.opencl.opencl_copy_kernel_launchers import \ - OpenClCopyHost2DeviceLauncher - kernel = OpenClCopyHost2DeviceLauncher(varname=varname, src=src, dst=dst, - dst_device_offset=dst_device_offset) + OpenClCopyHost2DeviceLauncher + kernel = OpenClCopyHost2DeviceLauncher(varname=varname, src=src, dst=dst, + dst_device_offset=dst_device_offset) self.push_kernels(kernel) return self def push_copy_device_to_host(self, varname, src, dst, src_device_offset=None): """Shortcut for OpenClCopyDevice2Host kernels creation.""" from hysop.backend.device.opencl.opencl_copy_kernel_launchers import \ - OpenClCopyDevice2HostLauncher - kernel = OpenClCopyDevice2HostLauncher(varname=varname, - src=src, dst=dst, - src_device_offset=src_device_offset) + OpenClCopyDevice2HostLauncher + kernel = OpenClCopyDevice2HostLauncher(varname=varname, + src=src, dst=dst, + src_device_offset=src_device_offset) self.push_kernels(kernel) return self - def push_copy_device_to_device(self, varname, src, dst, - src_device_offset=None, dst_device_offset=None, byte_count=None): + def push_copy_device_to_device(self, varname, src, dst, + src_device_offset=None, dst_device_offset=None, byte_count=None): """Shortcut for OpenClCopyDevice2Device kernels creation.""" from hysop.backend.device.opencl.opencl_copy_kernel_launchers import\ - OpenClCopyDevice2DeviceLauncher - kernel = OpenClCopyDevice2DeviceLauncher(varname=varname, - src=src, dst=dst, byte_count=byte_count, - src_device_offset=src_device_offset, dst_device_offset=dst_device_offset) + OpenClCopyDevice2DeviceLauncher + kernel = OpenClCopyDevice2DeviceLauncher(varname=varname, + src=src, dst=dst, byte_count=byte_count, + src_device_offset=src_device_offset, dst_device_offset=dst_device_offset) self.push_kernels(kernel) return self @@ -126,12 +132,13 @@ class OpenClKernelListLauncher(object): if (launcher is None): pass elif isinstance(launcher, LauncherI): + launcher._profiler = self._profiler if not launcher.global_size_configured(): - msg='OpenClKernelLauncher {} global_work_size has not been configured.' - msg=msg.format(launcher.name) + msg = 'OpenClKernelLauncher {} global_work_size has not been configured.' + msg = msg.format(launcher.name) raise RuntimeError(msg) if isinstance(launcher, OpenClParametrizedKernelLauncher): - parameters = {k: v[1] for (k,v) in launcher.parameters_map.iteritems()} + parameters = {k: v[1] for (k, v) in launcher.parameters_map.iteritems()} self._update_parameters_from_parametrized_kernel(launcher, parameters) elif isinstance(launcher, HostLauncherI): parameters = launcher.parameters() @@ -139,11 +146,13 @@ class OpenClKernelListLauncher(object): self._kernels += (launcher,) elif isinstance(launcher, OpenClKernelListLauncher): self._update_parameters_from_kernel_list_launcher(launcher) + for kk in launcher._kernels: + kk._profiler = self._profiler self._kernels += launcher._kernels else: - msg='Expected an OpenClKernelLauncher or a OpenClKernelListLauncher ' - msg+='but got a {}.' - msg=msg.format(type(launcher)) + msg = 'Expected an OpenClKernelLauncher or a OpenClKernelListLauncher ' + msg += 'but got a {}.' + msg = msg.format(type(launcher)) raise TypeError(msg) return self @@ -156,7 +165,7 @@ class OpenClKernelListLauncher(object): """ Enqueue all kernels on the given queue in order. The first enqueued kernel will wait on the wait_for events. - If this OpenClKernelListLauncher is empty, cl.wait_for_events + If this OpenClKernelListLauncher is empty, cl.wait_for_events will be called instead. """ trace_kernel(self._apply_msg) @@ -164,8 +173,8 @@ class OpenClKernelListLauncher(object): if __debug__: parameters = self._parameters msg = 'Expected the following kernel parameters {} but got {}.' - msg=msg.format(', '.join('\'{}\''.format(k) for k in parameters), - ', '.join('\'{}\''.format(k) for k in kwds)) + msg = msg.format(', '.join('\'{}\''.format(k) for k in parameters), + ', '.join('\'{}\''.format(k) for k in kwds)) assert not (set(parameters.keys()) - set(kwds.keys())), msg kernels = self._kernels @@ -176,33 +185,33 @@ class OpenClKernelListLauncher(object): try: evt = kernel(queue=queue, **kwds) except: - msg='\nFailed to call kernel {} of type {}.\n' - msg=msg.format(kernel.name,type(kernel).__name__) + msg = '\nFailed to call kernel {} of type {}.\n' + msg = msg.format(kernel.name, type(kernel).__name__) print msg raise else: if (__KERNEL_DEBUG__ or __TRACE_KERNELS__): - msg='No kernels enqueued for KernelListLauncher::{}'.format(self.name) + msg = 'No kernels enqueued for KernelListLauncher::{}'.format(self.name) warnings.warn(msg, HysopWarning) evt = cl.enqueue_marker(queue=queue, wait_for=wait_for) return evt def _update_parameters_from_parametrized_kernel(self, kernel, parameters): """ - Update parameters of this kernel list launcher from a + Update parameters of this kernel list launcher from a OpenClParametrizedKernelLauncher (or HostLauncherI). """ check_instance(kernel, (OpenClParametrizedKernelLauncher, HostLauncherI)) - check_instance(parameters, dict, keys=str, values=(type,npw.dtype)) + check_instance(parameters, dict, keys=str, values=(type, npw.dtype)) sparameters = self._parameters for (pname, ptype) in parameters.iteritems(): if pname in sparameters: (stype, op_names) = sparameters[pname] if (stype != ptype): - msg='Trying to register parameter {} with type {} ' - msg+='but it was already registered with type {} by the ' - msg+= 'following operators:\n {}.' - msg=msg.format(pname, ptype, stype, ', '.join(op_names)) + msg = 'Trying to register parameter {} with type {} ' + msg += 'but it was already registered with type {} by the ' + msg += 'following operators:\n {}.' + msg = msg.format(pname, ptype, stype, ', '.join(op_names)) raise RuntimeError(msg) sparameters[pname][1].add(kernel.name) else: @@ -213,20 +222,19 @@ class OpenClKernelListLauncher(object): check_instance(kernel_list_launcher, OpenClKernelListLauncher) parameters = kernel_list_launcher._parameters sparameters = self._parameters - for (pname, (ptype,knames)) in parameters.iteritems(): + for (pname, (ptype, knames)) in parameters.iteritems(): if pname in sparameters: (stype, op_names) = sparameters[pname] if (stype != ptype): - msg='Trying to register parameter {} with type {} ' - msg+='but it was already registered with type {} by the ' - msg+= 'following operators:\n {}.' - msg=msg.format(pname, ptype, stype, ', '.join(op_names)) + msg = 'Trying to register parameter {} with type {} ' + msg += 'but it was already registered with type {} by the ' + msg += 'following operators:\n {}.' + msg = msg.format(pname, ptype, stype, ', '.join(op_names)) raise RuntimeError(msg) sparameters[pname][1].update(knames) else: sparameters[pname] = (ptype, knames) - def _get_name(self): """Return the OpenClKernelLauncher name.""" return self._name @@ -234,11 +242,11 @@ class OpenClKernelListLauncher(object): def _get_parameters(self): """ Return parameters of OpenClParametrizedKernelLauncher. - This is a mapping between the parameter names and + This is a mapping between the parameter names and parameter types and operator names. """ return self._parameters - + def _get_statistics(self): """Compute statistics of each kernels and clear events of kernels that finished.""" kernel_statistics = {} @@ -256,8 +264,8 @@ class LauncherI(object): Interface for any object that has the ability to be a launcher. """ __metaclass__ = ABCMeta - - def __init__(self, name, **kwds): + + def __init__(self, name, profiler=None, **kwds): """ Create a OpenClKernelLauncher. @@ -273,11 +281,12 @@ class LauncherI(object): self._name = name self._events = () self._kernel_statistics = OpenClKernelStatistics() - + self._profiler = profiler + def _get_name(self): """Get the name of this kernel launcher.""" return self._name - + def _get_events(self): """All events since the last call to update statistics.""" return self._events @@ -285,26 +294,26 @@ class LauncherI(object): def _get_statistics(self): """Compute statistics and clear events of kernels that finished.""" old_events = self._events - finished = tuple(evt for evt in events \ - if (evt.execution_statusin == cl.command_execution_status.COMPLETE)) + finished = tuple(evt for evt in events + if (evt.execution_statusin == cl.command_execution_status.COMPLETE)) running = tuple(evt for evt in old_events if (evt not in finished)) stats = OpenClKernelStatistics(finished) self._kernel_statistics += stats self._events = running return self._kernel_statistics - + def _register_event(self, queue, evt): """ - Register an event in the event list if the queue has + Register an event in the event list if the queue has the PROFILING_ENABLE flag set. """ if (cl.command_queue_properties.PROFILING_ENABLE & queue.properties): self._events += (evt,) - + name = property(_get_name) events = property(_get_events) statistics = property(_get_statistics) - + @abstractmethod def __call__(self, queue=None, wait_for=None, **kwds): """ @@ -329,43 +338,43 @@ class OpenClKernelLauncherI(LauncherI): Interface for any object that has the ability to enqueue a OpenCL kernel without extra arguments. """ - + @abstractmethod - def __call__(self, queue=None, wait_for=None, - global_work_size=None, local_work_size=None, **kwds): + def __call__(self, queue=None, wait_for=None, + global_work_size=None, local_work_size=None, **kwds): """ Launch kernel with a specific queue. Wait for wait_for events before computing. If queue has profiling enabled, events are pushed into a local list of events to compute kernel statistics when self.statistics is fetched. """ - + def check_kernel_arg(self, arg, arg_id, arg_name, arg_type): """Check kernel argument type prior to setargs.""" if not (__KERNEL_DEBUG__ or __TRACE_KERNELS__): return if isinstance(arg_type, npw.dtype) or \ - (isinstance(arg_type, tuple) and len(arg_type)==1 - and isinstance(arg_type[0], npw.dtype)): + (isinstance(arg_type, tuple) and len(arg_type) == 1 + and isinstance(arg_type[0], npw.dtype)): dtype = arg_type if isinstance(arg_type, npw.dtype) else arg_type[0] good = isinstance(arg, npw.ndarray) if not good: - msg='Argument {}::{} at id {} at does not match required type np.ndarray, ' - msg+='got {} instead.' - msg=msg.format(self.name, arg_name, arg_id, dtype, type(arg)) + msg = 'Argument {}::{} at id {} at does not match required type np.ndarray, ' + msg += 'got {} instead.' + msg = msg.format(self.name, arg_name, arg_id, dtype, type(arg)) raise RuntimeError(msg) - good = (arg.dtype==dtype) + good = (arg.dtype == dtype) if not good: - msg='Argument {}::{} at id {} at does not match required dtype {}, ' - msg+='got {} instead.' - msg=msg.format(self.name, arg_name, arg_id, dtype, arg.dtype) + msg = 'Argument {}::{} at id {} at does not match required dtype {}, ' + msg += 'got {} instead.' + msg = msg.format(self.name, arg_name, arg_id, dtype, arg.dtype) raise RuntimeError(msg) else: good = isinstance(arg, arg_type) if not good: - msg='Argument {}::{} at id {} at does not match required type {}, ' - msg+='got {} instead.' - msg=msg.format(self.name, arg_name, arg_id, arg_type, type(arg)) + msg = 'Argument {}::{} at id {} at does not match required type {}, ' + msg += 'got {} instead.' + msg = msg.format(self.name, arg_name, arg_id, arg_type, type(arg)) raise RuntimeError(msg) @@ -390,11 +399,11 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): Manage launching of one OpenCL kernel with fixed arguments. """ @debug - def __init__(self, name, kernel, args_list, - default_global_work_size=None, - default_local_work_size=None, - default_queue=None, - **kwds): + def __init__(self, name, kernel, args_list, + default_global_work_size=None, + default_local_work_size=None, + default_queue=None, + **kwds): """ Create a OpenClKernelLauncher. @@ -422,26 +431,26 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): check_instance(args_list, tuple) check_instance(kernel, (cl.Program, cl.Kernel), allow_none=True) check_instance(default_queue, cl.CommandQueue, allow_none=True) - check_instance(default_global_work_size, tuple, values=(int,npw.integer), allow_none=True) - check_instance(default_local_work_size, tuple, values=(int,npw.integer), allow_none=True) + check_instance(default_global_work_size, tuple, values=(int, npw.integer), allow_none=True) + check_instance(default_local_work_size, tuple, values=(int, npw.integer), allow_none=True) if isinstance(kernel, cl.Program): kernels = kernel.all_kernels() - assert len(kernels)==1 + assert len(kernels) == 1 kernel = kernels[0] - kernel_is_shared=False + kernel_is_shared = False elif (kernel is None): - kernel_is_shared=False + kernel_is_shared = False else: # set_args will always be called on apply - kernel_is_shared=True + kernel_is_shared = True args_per_index = False if args_list: if isinstance(args_list[0], tuple): aindexes = tuple(x[0] for x in args_list) assert len(aindexes) == len(set(aindexes)), msg - for (index,arg) in args_list: + for (index, arg) in args_list: kernel.set_arg(index, arg) args_per_index = True else: @@ -455,7 +464,7 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): self._default_queue = default_queue self._kernel_is_shared = kernel_is_shared self._apply_msg = ' {}<<<>>>'.format(name) - + def queue_configured(self): """ Return True is this kernel is ready to be enqueued without @@ -480,17 +489,22 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): """Get the precompiled kernel to be launched.""" assert self.kernel_is_shared, 'Kernel cannot be shared.' return self._kernel + def _get_kernel_is_shared(self): """Return True if this kernel may be shared with other callers.""" + def _get_args_list(self): """All arguments of the kernel as a tuple.""" return self._args_list + def _get_default_queue(self): """Default queue to launch the kernel.""" return self._default_queue + def _get_default_global_work_size(self): """Default global work size to launch the kernel.""" return self._default_global_work_size + def _get_default_local_work_size(self): """Default default_local_work_size to launch the kernel.""" return self._default_local_work_size @@ -500,30 +514,30 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): default_queue = property(_get_default_queue) def __call__(self, queue=None, wait_for=None, - global_work_size=None, local_work_size=None, **kwds): + global_work_size=None, local_work_size=None, **kwds): """ Launch kernel with a specific queue, global_work_size and local_work_size. Wait for wait_for events before computing. If queue has profiling enabled, events are pushed into a local list of events to compute kernel statistics when self.statistics is fetched. """ - queue = first_not_None(queue, self._default_queue) + queue = first_not_None(queue, self._default_queue) global_work_size = first_not_None(global_work_size, self._default_global_work_size) - local_work_size = first_not_None(local_work_size, self._default_local_work_size) + local_work_size = first_not_None(local_work_size, self._default_local_work_size) assert isinstance(queue, cl.CommandQueue) assert isinstance(global_work_size, tuple) assert isinstance(local_work_size, (tuple, type(None))) - + apply_msg = self._apply_msg.format(global_work_size, local_work_size) trace_kernel(apply_msg) - + kernel = self._set_kernel_args(**kwds) - - evt = cl.enqueue_nd_range_kernel(queue=queue, kernel=kernel, - global_work_size=global_work_size, - local_work_size=local_work_size, wait_for=wait_for) - profile_kernel(None, evt, apply_msg) - + + evt = cl.enqueue_nd_range_kernel(queue=queue, kernel=kernel, + global_work_size=global_work_size, + local_work_size=local_work_size, wait_for=wait_for) + profile_kernel(None, evt, apply_msg, fprofiler=self._profiler) + self._register_event(queue, evt) return evt @@ -533,7 +547,7 @@ class OpenClKernelLauncher(OpenClKernelLauncherI): if self._kernel_is_shared: args_list = self._args_list if self._args_per_index: - for (index,arg) in args_list: + for (index, arg) in args_list: kernel.set_arg(index, arg) else: kernel.set_args(*self.args_list) @@ -560,20 +574,20 @@ class OpenClParametrizedKernelLauncher(OpenClKernelLauncher): """ check_instance(args_list, tuple, values=tuple) check_instance(parameters_map, dict, keys=str, values=tuple) - + pindexes = tuple(x[0] for x in parameters_map.values()) aindexes = tuple(x[0] for x in args_list) assert len(pindexes) == len(set(pindexes)), \ - 'Arguments at same position: {}'.format(parameters_map) + 'Arguments at same position: {}'.format(parameters_map) assert len(aindexes) == len(set(aindexes)), \ - 'Arguments at same position: {}'.format(parameters_map) + 'Arguments at same position: {}'.format(parameters_map) if set(pindexes).intersection(aindexes): - msg='Overlap between parameters indexes and default argument indexes.' - msg+='\nparameters: {}\ndefault args: {}'.format(parameters_map, args_list) + msg = 'Overlap between parameters indexes and default argument indexes.' + msg += '\nparameters: {}\ndefault args: {}'.format(parameters_map, args_list) raise ValueError(msg) super(OpenClParametrizedKernelLauncher, self).__init__(args_list=args_list, **kwds) - + self._parameters_map = parameters_map def _get_parameters_map(self): @@ -581,7 +595,7 @@ class OpenClParametrizedKernelLauncher(OpenClKernelLauncher): Mapping between parameter names and (parameter_index, parameter_type). """ return self._parameters_map - + def _set_kernel_args(self, **kwds): """Set the arguments of this kernel and return the kernel.""" kernel = super(OpenClParametrizedKernelLauncher, self)._set_kernel_args() @@ -589,9 +603,9 @@ class OpenClParametrizedKernelLauncher(OpenClKernelLauncher): assert pname in kwds, '{} was not given.'.format(pname) pval = kwds[pname] self.check_kernel_arg(pval, pindex, pname, ptypes) - kernel.set_arg(pindex, pval) + kernel.set_arg(pindex, pval) return kernel - + parameters_map = property(_get_parameters_map) @@ -607,6 +621,7 @@ class OpenClKernelParameterGenerator(object): def new_generator(self): pass + class OpenClKernelParameterYielder(OpenClKernelParameterGenerator): """Generate opencl parameters through an external iterator or generator factory.""" @@ -619,8 +634,8 @@ class OpenClKernelParameterYielder(OpenClKernelParameterGenerator): fn: callable Lambda, function or functor that takes no arguments which should return a Generator or an Iterator uppon call. - - Example: + + Example: lambda: xrange(10) """ assert callable(fn) @@ -629,7 +644,6 @@ class OpenClKernelParameterYielder(OpenClKernelParameterGenerator): def new_generator(self): return self._fn() - class OpenClIterativeKernelLauncher(OpenClParametrizedKernelLauncher): """ @@ -661,7 +675,7 @@ class OpenClIterativeKernelLauncher(OpenClParametrizedKernelLauncher): check_instance(args_list, tuple, values=tuple) check_instance(parameters_map, dict, keys=str, values=tuple) check_instance(iterated_parameters, dict, keys=str, - values=OpenClKernelParameterGenerator) + values=OpenClKernelParameterGenerator) iterated_parameter_arg_ids = () iterated_parameter_arg_names = () @@ -675,66 +689,66 @@ class OpenClIterativeKernelLauncher(OpenClParametrizedKernelLauncher): iterated_parameter_arg_types += (arg_type,) iterated_parameter_generators += (pgen,) - super(OpenClIterativeKernelLauncher, self).__init__(args_list=args_list, - parameters_map=parameters_map, **kwds) - + super(OpenClIterativeKernelLauncher, self).__init__(args_list=args_list, + parameters_map=parameters_map, **kwds) + self.iterated_parameters = iterated_parameters self.iterated_parameter_arg_ids = iterated_parameter_arg_ids self.iterated_parameter_arg_names = iterated_parameter_arg_names self.iterated_parameter_arg_types = iterated_parameter_arg_types self.iterated_parameter_generators = iterated_parameter_generators - + self._apply_msg = '{}{}<<<{}, {}>>>({})'.format('{}', self.name, '{}', '{}', '{}') def iter_parameters(self): return it.product(*self.iterated_parameter_generators) - + def __call__(self, queue=None, wait_for=None, enqueue_barrier=True, - global_work_size=None, local_work_size=None, **kwds): + global_work_size=None, local_work_size=None, **kwds): """ Launch kernel with a specific queue, global_work_size and local_work_size. Wait for wait_for events before computing. If queue has profiling enabled, events are pushed into a local list of events to compute kernel statistics when self.statistics is fetched. - If the queue is out of order, a barrie is enqueued unless enqueue_barrier is set + If the queue is out of order, a barrie is enqueued unless enqueue_barrier is set to False. If enqueue_barrier is False, returned event is None. """ - queue = first_not_None(queue, self._default_queue) + queue = first_not_None(queue, self._default_queue) global_work_size = first_not_None(global_work_size, self._default_global_work_size) - local_work_size = first_not_None(local_work_size, self._default_local_work_size) + local_work_size = first_not_None(local_work_size, self._default_local_work_size) assert isinstance(queue, cl.CommandQueue) assert isinstance(global_work_size, tuple) assert isinstance(local_work_size, (tuple, type(None))) - + apply_msg = self._apply_msg.format('{}', global_work_size, local_work_size, '{}') trace_kernel(apply_msg.format(' ', '<yielder>')) - + kernel = self._set_kernel_args(**kwds) out_of_order_queue = \ (queue.properties & cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) - arg_ids = self.iterated_parameter_arg_ids + arg_ids = self.iterated_parameter_arg_ids arg_types = self.iterated_parameter_arg_types arg_names = self.iterated_parameter_arg_names - for i,args in enumerate(self.iter_parameters()): - apply_msg = self._apply_msg.format('{}', global_work_size, local_work_size, - '{}') - apply_msg = apply_msg.format(' | ', ', '.join('{}={}'.format(pname, pval) - for (pname,pval) in zip(arg_names, args))) + for i, args in enumerate(self.iter_parameters()): + apply_msg = self._apply_msg.format('{}', global_work_size, local_work_size, + '{}') + apply_msg = apply_msg.format(' | ', ', '.join('{}={}'.format(pname, pval) + for (pname, pval) in zip(arg_names, args))) trace_kernel(apply_msg) - - for arg_id, arg_name, arg_type, arg_value in zip(arg_ids, arg_names, + + for arg_id, arg_name, arg_type, arg_value in zip(arg_ids, arg_names, arg_types, args): self.check_kernel_arg(arg_value, arg_id, arg_name, arg_types) kernel.set_arg(arg_id, arg_value) - evt = cl.enqueue_nd_range_kernel(queue=queue, kernel=kernel, - global_work_size=global_work_size, - local_work_size=local_work_size, - wait_for=(wait_for if (i==0 or out_of_order_queue) else None)) - profile_kernel(None, evt, apply_msg) + evt = cl.enqueue_nd_range_kernel(queue=queue, kernel=kernel, + global_work_size=global_work_size, + local_work_size=local_work_size, + wait_for=(wait_for if (i == 0 or out_of_order_queue) else None)) + profile_kernel(None, evt, apply_msg, fprofiler=self._profiler) self._register_event(queue, evt) if out_of_order_queue: @@ -744,4 +758,3 @@ class OpenClIterativeKernelLauncher(OpenClParametrizedKernelLauncher): else: evt = None return evt - diff --git a/hysop/backend/device/opencl/operator/custom_symbolic.py b/hysop/backend/device/opencl/operator/custom_symbolic.py index af43ecaaf..3dca49a14 100644 --- a/hysop/backend/device/opencl/operator/custom_symbolic.py +++ b/hysop/backend/device/opencl/operator/custom_symbolic.py @@ -1,4 +1,3 @@ - from hysop.constants import DirectionLabels from hysop.tools.decorators import debug from hysop.operator.base.custom_symbolic_operator import CustomSymbolicOperatorBase @@ -8,47 +7,48 @@ from hysop.backend.device.opencl.autotunable_kernels.custom_symbolic import Open from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelListLauncher from hysop.backend.device.opencl.opencl_copy_kernel_launchers import OpenClCopyBufferRectLauncher + class OpenClCustomSymbolicOperator(CustomSymbolicOperatorBase, OpenClOperator): @debug def __init__(self, **kwds): super(OpenClCustomSymbolicOperator, self).__init__(**kwds) - + @debug def setup(self, work): super(OpenClCustomSymbolicOperator, self).setup(work) self._collect_kernels() def _collect_kernels(self): - kl = OpenClKernelListLauncher(name=self.name) - kl += self._collect_symbolic_kernel() + kl = OpenClKernelListLauncher(name=self.name, profiler=self._profiler) + kl += self._collect_symbolic_kernel() for sout in self.output_discrete_fields.values(): kl += sout.exchange_ghosts(build_launcher=True) self.kl = kl - + def _collect_symbolic_kernel(self): cl_env = self.cl_env typegen = self.typegen autotuner_config = self.autotuner_config build_opts = self.build_options() - - kernel_autotuner = OpenClAutotunableCustomSymbolicKernel(cl_env=cl_env, typegen=typegen, - build_opts=build_opts, autotuner_config=autotuner_config) + + kernel_autotuner = OpenClAutotunableCustomSymbolicKernel(cl_env=cl_env, typegen=typegen, + build_opts=build_opts, autotuner_config=autotuner_config) kernel, args_dict, update_input_parameters = kernel_autotuner.autotune(expr_info=self.expr_info) - + kl = kernel.build_launcher(**args_dict) self._symbolic_kernel_kl = kl self._update_input_params = update_input_parameters return kl - + @op_apply def apply(self, **kwds): queue = self.cl_env.default_queue evt = self.kl(queue=queue, **self._update_input_params()) - + @classmethod def supports_mpi(cls): return True diff --git a/hysop/backend/device/opencl/operator/directional/advection_dir.py b/hysop/backend/device/opencl/operator/directional/advection_dir.py index 819034fe5..acf1ad9fb 100644 --- a/hysop/backend/device/opencl/operator/directional/advection_dir.py +++ b/hysop/backend/device/opencl/operator/directional/advection_dir.py @@ -1,7 +1,7 @@ -from hysop.tools.decorators import debug +from hysop.tools.decorators import debug from hysop.constants import DirectionLabels from hysop.backend.device.opencl.operator.directional.opencl_directional_operator import \ - OpenClDirectionalOperator, op_apply + OpenClDirectionalOperator, op_apply from hysop.operator.base.advection_dir import DirectionalAdvectionBase, MemoryRequest from hysop.backend.device.opencl.autotunable_kernels.advection_dir import OpenClAutotunableDirectionalAdvectionKernel @@ -9,13 +9,14 @@ from hysop.backend.device.opencl.autotunable_kernels.remesh_dir import OpenClAut from hysop.backend.device.opencl.opencl_copy_kernel_launchers import OpenClCopyBufferRectLauncher from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelListLauncher + class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOperator): - DEBUG=False + DEBUG = False @debug def __init__(self, force_atomics=False, relax_min_particles=False, remesh_criteria_eps=None, - **kwds): + **kwds): """ Particular advection of field(s) in a given direction, on opencl backend, with remeshing. @@ -63,13 +64,13 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper List of output continuous fields. """ - super(OpenClDirectionalAdvection,self).__init__(**kwds) - self.force_atomics = force_atomics + super(OpenClDirectionalAdvection, self).__init__(**kwds) + self.force_atomics = force_atomics self.relax_min_particles = relax_min_particles self.remesh_criteria_eps = remesh_criteria_eps self._force_autotuner_verbose = None - self._force_autotuner_debug = None + self._force_autotuner_debug = None @debug def setup(self, work): @@ -77,20 +78,20 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper self._collect_kernels() def _collect_kernels(self): - kl = OpenClKernelListLauncher(name='advec_remesh') + kl = OpenClKernelListLauncher(name='advec_remesh', profiler=self._profiler) kl += self._collect_advection_kernel() kl += self._collect_remesh_kernels() kl += self._collect_redistribute_kernels() self.all_kernels = kl def _collect_advection_kernel(self): - cl_env = self.cl_env - typegen = self.typegen - build_options = self.build_options() + cl_env = self.cl_env + typegen = self.typegen + build_options = self.build_options() autotuner_config = self.autotuner_config kernel = OpenClAutotunableDirectionalAdvectionKernel(cl_env=cl_env, typegen=typegen, - build_opts=build_options, autotuner_config=autotuner_config) + build_opts=build_options, autotuner_config=autotuner_config) kwds = {} kwds['velocity'] = self.dvelocity @@ -102,12 +103,12 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper else: kwds['is_bilevel'] = self.is_bilevel - kwds['direction'] = self.splitting_direction - kwds['velocity_cfl'] = self.velocity_cfl + kwds['direction'] = self.splitting_direction + kwds['velocity_cfl'] = self.velocity_cfl kwds['time_integrator'] = self.time_integrator (advec_kernel, args_dict) = kernel.autotune(force_verbose=self._force_autotuner_verbose, - force_debug=self._force_autotuner_debug, hardcode_arrays=True, **kwds) + force_debug=self._force_autotuner_debug, hardcode_arrays=True, **kwds) args_dict.pop('dt') advec_launcher = advec_kernel.build_launcher(**args_dict) @@ -117,39 +118,39 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper return advec_launcher def _collect_remesh_kernels(self): - cl_env = self.cl_env - typegen = self.typegen - build_options = self.build_options() + cl_env = self.cl_env + typegen = self.typegen + build_options = self.build_options() autotuner_config = self.autotuner_config kernel = OpenClAutotunableDirectionalRemeshKernel(cl_env=cl_env, typegen=typegen, - build_opts=build_options, autotuner_config=autotuner_config) + build_opts=build_options, autotuner_config=autotuner_config) - scalars_in = tuple(self.dadvected_fields_in[ifield] \ - for ifield in self.advected_fields_in) - scalars_out = tuple(self.dadvected_fields_out[ofield] \ - for ofield in self.advected_fields_out) + scalars_in = tuple(self.dadvected_fields_in[ifield] + for ifield in self.advected_fields_in) + scalars_out = tuple(self.dadvected_fields_out[ofield] + for ofield in self.advected_fields_out) kwds = {} - kwds['direction'] = self.splitting_direction + kwds['direction'] = self.splitting_direction kwds['scalar_cfl'] = self.scalar_cfl kwds['is_inplace'] = self.is_inplace - kwds['position'] = self.dposition - kwds['is_inplace'] = self.is_inplace + kwds['position'] = self.dposition + kwds['is_inplace'] = self.is_inplace - kwds['remesh_kernel'] = self.remesh_kernel + kwds['remesh_kernel'] = self.remesh_kernel kwds['remesh_criteria_eps'] = self.remesh_criteria_eps - kwds['force_atomics'] = self.force_atomics + kwds['force_atomics'] = self.force_atomics kwds['relax_min_particles'] = self.relax_min_particles - assert len(scalars_in)==len(scalars_out) + assert len(scalars_in) == len(scalars_out) kl = OpenClKernelListLauncher(name='remesh') for (Sin, Sout) in zip(scalars_in, scalars_out): - kwds['scalars_in'] = (Sin,) + kwds['scalars_in'] = (Sin,) kwds['scalars_out'] = (Sout,) (remesh_kernel, args_dict) = kernel.autotune(force_verbose=self._force_autotuner_verbose, - force_debug=self._force_autotuner_debug, hardcode_arrays=True, **kwds) + force_debug=self._force_autotuner_debug, hardcode_arrays=True, **kwds) kl += remesh_kernel.build_launcher(**args_dict) self.remesh_kernel_launcher = kl return kl @@ -170,7 +171,7 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper @op_apply def apply(self, dbg=None, **kargs): queue = self.cl_env.default_queue - dt = self.precision(self.dt() * self.dt_coeff) + dt = self.precision(self.dt() * self.dt_coeff) if self.DEBUG: queue.flush() diff --git a/hysop/backend/device/opencl/operator/directional/stretching_dir.py b/hysop/backend/device/opencl/operator/directional/stretching_dir.py index cab44515b..e07df8479 100644 --- a/hysop/backend/device/opencl/operator/directional/stretching_dir.py +++ b/hysop/backend/device/opencl/operator/directional/stretching_dir.py @@ -1,7 +1,6 @@ - from hysop import Field, TopologyDescriptor from hysop.deps import np -from hysop.tools.decorators import debug +from hysop.tools.decorators import debug from hysop.tools.types import check_instance from hysop.core.graph.graph import not_initialized, initialized, discretized, ready, op_apply from hysop.topology.cartesian_descriptor import CartesianTopologyDescriptors @@ -17,27 +16,28 @@ from hysop.constants import StretchingFormulation from hysop.numerics.odesolvers.runge_kutta import ExplicitRungeKutta, Euler, RK2, RK3, RK4 from hysop.backend.device.codegen.kernels.directional_stretching import \ - DirectionalStretchingKernel + DirectionalStretchingKernel + class OpenClDirectionalStretching(OpenClDirectionalOperator): - + __default_method = { - #KernelConfig: KernelConfig(), - TimeIntegrator: Euler, - StretchingFormulation: StretchingFormulation.GRAD_UW, - SpaceDiscretization: SpaceDiscretization.FDC4 - } - + # KernelConfig: KernelConfig(), + TimeIntegrator: Euler, + StretchingFormulation: StretchingFormulation.GRAD_UW, + SpaceDiscretization: SpaceDiscretization.FDC4 + } + __available_methods = { - #KernelConfig: InstanceOf(KernelConfig), - TimeIntegrator: InstanceOf(ExplicitRungeKutta), + # KernelConfig: InstanceOf(KernelConfig), + TimeIntegrator: InstanceOf(ExplicitRungeKutta), StretchingFormulation: InstanceOf(StretchingFormulation), SpaceDiscretization: InstanceOf(SpaceDiscretization) } @debug - def __init__(self, velocity, vorticity, vorticity_out, - variables, **kwds): + def __init__(self, velocity, vorticity, vorticity_out, + variables, **kwds): """ Directionnal stretching of vorticity in a given direction on opencl backend. @@ -57,7 +57,7 @@ class OpenClDirectionalStretching(OpenClDirectionalOperator): Dictionary of continuous fields as keys and topologies as values. kwds: Extra parameters passed to generated directional operators. - + Attributes ---------- velocity: Field @@ -73,32 +73,32 @@ class OpenClDirectionalStretching(OpenClDirectionalOperator): check_instance(vorticity_out, Field) check_instance(variables, dict, keys=Field, values=CartesianTopologyDescriptors) - input_fields = { velocity: variables[velocity], vorticity: variables[vorticity] } - output_fields = { vorticity_out: variables[vorticity_out] } - - super(OpenClDirectionalStretching,self).__init__(input_fields=input_fields, - output_fields=output_fields, **kwds) + input_fields = {velocity: variables[velocity], vorticity: variables[vorticity]} + output_fields = {vorticity_out: variables[vorticity_out]} - self.velocity = velocity - self.vorticity_in = vorticity + super(OpenClDirectionalStretching, self).__init__(input_fields=input_fields, + output_fields=output_fields, **kwds) + + self.velocity = velocity + self.vorticity_in = vorticity self.vorticity_out = vorticity_out - self.is_inplace = (vorticity is vorticity_out) - + self.is_inplace = (vorticity is vorticity_out) + @debug - def handle_method(self,method): - super(OpenClDirectionalStretching,self).handle_method(method) - + def handle_method(self, method): + super(OpenClDirectionalStretching, self).handle_method(method) + self.space_discretization = method.pop(SpaceDiscretization) - self.formulation = method.pop(StretchingFormulation) + self.formulation = method.pop(StretchingFormulation) self.time_integrator = method.pop(TimeIntegrator) - + assert str(self.space_discretization)[:3] == 'FDC' self.order = int(str(self.space_discretization)[3:]) - + @debug def get_field_requirements(self): requirements = super(OpenClDirectionalStretching, self).get_field_requirements() - + direction = self.splitting_direction is_inplace = self.is_inplace @@ -106,14 +106,14 @@ class OpenClDirectionalStretching(OpenClDirectionalOperator): formulation = self.formulation order = self.order - velocity = self.velocity - vorticity_in = self.vorticity_in + velocity = self.velocity + vorticity_in = self.vorticity_in vorticity_out = self.vorticity_out - v_topo, v_requirements = requirements.get_input_requirement(velocity) - win_topo, win_requirements = requirements.get_input_requirement(vorticity_in) + v_topo, v_requirements = requirements.get_input_requirement(velocity) + win_topo, win_requirements = requirements.get_input_requirement(vorticity_in) wout_topo, wout_requirements = requirements.get_output_requirement(vorticity_out) - + if v_topo.mpi_params.size == 1: lboundary = v_topo.domain.lboundaries[-1] rboundary = v_topo.domain.rboundaries[-1] @@ -122,64 +122,64 @@ class OpenClDirectionalStretching(OpenClDirectionalOperator): rboundary = BoundaryCondition.NONE boundaries = (lboundary, rboundary) - v_ghosts, w_ghosts = DirectionalStretchingKernel.min_ghosts(boundaries, - formulation, order, time_integrator, direction) + v_ghosts, w_ghosts = DirectionalStretchingKernel.min_ghosts(boundaries, + formulation, order, time_integrator, direction) - v_requirements.min_ghosts = v_ghosts + v_requirements.min_ghosts = v_ghosts win_requirements.min_ghosts = w_ghosts - + if is_inplace: wout_requirements.min_ghosts = w_ghosts return requirements - + @debug def discretize(self): - super(OpenClDirectionalStretching,self).discretize() - dvelocity = self.input_discrete_fields[self.velocity] - dvorticity_in = self.input_discrete_fields[self.vorticity_in] + super(OpenClDirectionalStretching, self).discretize() + dvelocity = self.input_discrete_fields[self.velocity] + dvorticity_in = self.input_discrete_fields[self.vorticity_in] dvorticity_out = self.output_discrete_fields[self.vorticity_out] assert dvorticity_in.topology.topology is dvorticity_out.topology.topology - - vorticity_mesh_info_in = self.input_mesh_info[self.vorticity_in] + + vorticity_mesh_info_in = self.input_mesh_info[self.vorticity_in] vorticity_mesh_info_out = self.output_mesh_info[self.vorticity_out] - - self.dvelocity = dvelocity - self.dvorticity_in = dvorticity_in + + self.dvelocity = dvelocity + self.dvorticity_in = dvorticity_in self.dvorticity_out = dvorticity_out - self.velocity_mesh_info = self.input_mesh_info[self.velocity] + self.velocity_mesh_info = self.input_mesh_info[self.velocity] self.vorticity_mesh_info = vorticity_mesh_info_in - + @debug def setup(self, work): - super(OpenClDirectionalStretching,self).setup(work) + super(OpenClDirectionalStretching, self).setup(work) self._collect_kernels() def _collect_kernels(self): self._collect_stretching_kernel() def _collect_stretching_kernel(self): - - velocity = self.dvelocity - vorticity_in = self.dvorticity_in + + velocity = self.dvelocity + vorticity_in = self.dvorticity_in vorticity_out = self.dvorticity_out - velocity_mesh_info = self.velocity_mesh_info + velocity_mesh_info = self.velocity_mesh_info vorticity_mesh_info = self.vorticity_mesh_info - - direction = self.splitting_direction - formulation = self.formulation - discretization = self.space_discretization + + direction = self.splitting_direction + formulation = self.formulation + discretization = self.space_discretization time_integrator = self.time_integrator - - cl_env = self.cl_env - precision = self.precision - build_options = self.build_options() + + cl_env = self.cl_env + precision = self.precision + build_options = self.build_options() autotuner_config = self.autotuner_config - (kernel_launcher, kernel_args, kernel_args_mapping, - total_work, per_work_statistic, cached_bytes) = \ + (kernel_launcher, kernel_args, kernel_args_mapping, + total_work, per_work_statistic, cached_bytes) = \ DirectionalStretchingKernel.autotune( cl_env=cl_env, precision=precision, @@ -194,13 +194,14 @@ class OpenClDirectionalStretching(OpenClDirectionalOperator): vorticity_out=vorticity_out, velocity_mesh_info=velocity_mesh_info, vorticity_mesh_info=vorticity_mesh_info) + kernel_launcher._profiler = self._profiler @op_apply - def apply(self,**kargs): - super(OpenClDirectionalStretching,self).apply(**kargs) + def apply(self, **kargs): + super(OpenClDirectionalStretching, self).apply(**kargs) raise NotImplementedError() - ## Backend methods + # Backend methods # ComputationalNode @classmethod def default_method(cls): @@ -217,28 +218,29 @@ class OpenClDirectionalStretching(OpenClDirectionalOperator): @classmethod def supports_mpi(cls): return False - + # DirectionalOperatorBase @classmethod def supported_dimensions(cls): return [3] - + # ComputationalGraphNode @classmethod def supports_multiscale(cls): return False - - def _do_compute(self,simulation,dt_coeff,**kargs): + + def _do_compute(self, simulation, dt_coeff, **kargs): dt = simulation.time_step * dt_coeff - self._do_compute_impl(dt=dt,**kargs) - + self._do_compute_impl(dt=dt, **kargs) + def _do_compute_monoscale(self, dt): raise NotImplementedError() + def _do_compute_multiscale(self, dt): raise NotImplementedError() + def _do_compute_monoscale_comm(self, dt): raise NotImplementedError() + def _do_compute_multiscale_comm(self, dt): raise NotImplementedError() - - diff --git a/hysop/backend/device/opencl/operator/solenoidal_projection.py b/hysop/backend/device/opencl/operator/solenoidal_projection.py index db614290e..dd49bde13 100644 --- a/hysop/backend/device/opencl/operator/solenoidal_projection.py +++ b/hysop/backend/device/opencl/operator/solenoidal_projection.py @@ -1,7 +1,8 @@ -import primefac, functools +import primefac +import functools from hysop import vprint from hysop.tools.numpywrappers import npw -from hysop.tools.decorators import debug +from hysop.tools.decorators import debug from hysop.tools.units import bytes2str from hysop.tools.numerics import is_complex, find_common_dtype from hysop.operator.base.solenoidal_projection import SolenoidalProjectionOperatorBase @@ -16,17 +17,18 @@ from hysop.symbolic.misc import Select, Expand from hysop.symbolic.complex import ComplexMul from hysop.symbolic.relational import Assignment, LogicalEQ, LogicalAND + class OpenClSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClSymbolic): """ Solves the poisson rotational equation using clFFT and an OpenCL symbolic kernels. """ - + def initialize(self, **kwds): # request the projection kernel if required - Fin = tuple(Ft.output_symbolic_array('Fin{}_hat'.format(i)) - for (i,Ft) in enumerate(self.forward_transforms)) + Fin = tuple(Ft.output_symbolic_array('Fin{}_hat'.format(i)) + for (i, Ft) in enumerate(self.forward_transforms)) Fout = tuple(Bt.input_symbolic_array('Fout{}_hat'.format(i)) - for (i,Bt) in enumerate(self.backward_transforms)) + for (i, Bt) in enumerate(self.backward_transforms)) K1s, K2s = (), () for kd1 in self.kd1s: Ki = self.tg.indexed_wavenumbers(*kd1)[::-1] @@ -37,16 +39,16 @@ class OpenClSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClSymboli dtype = find_common_dtype(*tuple(Ft.output_dtype for Ft in self.forward_transforms)) Cs = self.symbolic_tmp_scalars('C', dtype=dtype, count=3) - + I = local_indices_symbols[:3] - cond = LogicalAND(*tuple(LogicalEQ(Ik,0) for Ik in I)) - + cond = LogicalAND(*tuple(LogicalEQ(Ik, 0) for Ik in I)) + exprs = () for i in xrange(3): expr = 0 for j in xrange(3): e = Fin[j] - if (i==j): + if (i == j): e = K2s[j][j]*e else: e = (ComplexMul(K1s[j][j], e) if K1s[j][j].Wn.is_complex else K1s[j][j]*e) @@ -64,20 +66,19 @@ class OpenClSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClSymboli if self.compute_divFin: divFin = self.backward_divFin_transform.input_symbolic_array('divFin') - expr = sum(ComplexMul(K1s[j][j],Fin[j]) if K1s[j][j].Wn.is_complex else K1s[j][j]*Fin[j] - for j in xrange(3)) + expr = sum(ComplexMul(K1s[j][j], Fin[j]) if K1s[j][j].Wn.is_complex else K1s[j][j]*Fin[j] + for j in xrange(3)) expr = Assignment(divFin, expr) self.require_symbolic_kernel('compute_divFin', expr) - + if self.compute_divFout: - expr = sum(ComplexMul(K1s[j][j],Fout[j]) if K1s[j][j].Wn.is_complex else K1s[j][j]*Fout[j] - for j in xrange(3)) + expr = sum(ComplexMul(K1s[j][j], Fout[j]) if K1s[j][j].Wn.is_complex else K1s[j][j]*Fout[j] + for j in xrange(3)) divFout = self.backward_divFout_transform.input_symbolic_array('divFout') expr = Assignment(divFout, expr) self.require_symbolic_kernel('compute_divFout', expr) super(OpenClSolenoidalProjection, self).initialize(**kwds) - @debug def setup(self, work): @@ -97,9 +98,9 @@ class OpenClSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClSymboli if self.compute_divFout: knl, _ = self.symbolic_kernels['compute_divFout'] self.compute_divFout_kernel = functools.partial(knl, queue=self.cl_env.default_queue) - + def _build_ghost_exchangers(self): - kl = OpenClKernelListLauncher(name='exchange_ghosts') + kl = OpenClKernelListLauncher(name='exchange_ghosts', profiler=self._profiler) kl += self.dFout.exchange_ghosts(build_launcher=True) if self.compute_divFin: kl += self.ddivFin.exchange_ghosts(build_launcher=True) @@ -107,7 +108,6 @@ class OpenClSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClSymboli kl += self.ddivFout.exchange_ghosts(build_launcher=True) self.exchange_ghost_kernels = functools.partial(kl, queue=self.cl_env.default_queue) - @op_apply def apply(self, simulation=None, **kwds): '''Solve the SolenoidalProjection.''' @@ -125,4 +125,3 @@ class OpenClSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClSymboli for Bt in self.backward_transforms: evt = Bt() evt = self.exchange_ghost_kernels() - diff --git a/hysop/backend/device/opencl/operator/spatial_filtering.py b/hysop/backend/device/opencl/operator/spatial_filtering.py index 1501c1b29..f5df7d162 100644 --- a/hysop/backend/device/opencl/operator/spatial_filtering.py +++ b/hysop/backend/device/opencl/operator/spatial_filtering.py @@ -1,4 +1,3 @@ - import numpy as np import functools @@ -11,7 +10,7 @@ from hysop.fields.continuous_field import Field from hysop.parameters.parameter import Parameter from hysop.topology.cartesian_descriptor import CartesianTopologyDescriptors from hysop.operator.base.spatial_filtering import RemeshRestrictionFilterBase, SpectralRestrictionFilterBase, \ - SubgridRestrictionFilterBase, PolynomialInterpolationFilterBase, PolynomialRestrictionFilterBase + SubgridRestrictionFilterBase, PolynomialInterpolationFilterBase, PolynomialRestrictionFilterBase from hysop.backend.device.opencl.opencl_symbolic import OpenClSymbolic from hysop.backend.device.opencl.opencl_copy_kernel_launchers import OpenClCopyBufferRectLauncher from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelListLauncher @@ -19,6 +18,7 @@ from hysop.backend.device.opencl.opencl_elementwise import OpenClElementwiseKern from hysop.symbolic import local_indices_symbols from hysop.symbolic.relational import Assignment + class OpenClPolynomialInterpolationFilter(PolynomialInterpolationFilterBase, OpenClOperator): @debug @@ -26,22 +26,22 @@ class OpenClPolynomialInterpolationFilter(PolynomialInterpolationFilterBase, Ope if self.discretized: return super(OpenClPolynomialInterpolationFilter, self).discretize() - dFin = self.dFin + dFin = self.dFin dFout = self.dFout - gr = self.grid_ratio + gr = self.grid_ratio dim = dFin.dim assert dFin.is_scalar assert dFout.is_scalar assert self.subgrid_interpolator.gr == gr - + ekg = self.elementwise_kernel_generator - Wr = self.subgrid_interpolator.Wr - n = self.subgrid_interpolator.n + Wr = self.subgrid_interpolator.Wr + n = self.subgrid_interpolator.n ghosts = np.asarray(self.subgrid_interpolator.ghosts) - + I = np.asarray(local_indices_symbols[:dim][::-1]) fin, fout = ekg.dfields_to_ndbuffers(dFin, dFout) - Fin = ekg.symbolic_tmp_scalars('F', shape=n, dtype=dFin.dtype) + Fin = ekg.symbolic_tmp_scalars('F', shape=n, dtype=dFin.dtype) Fout_values = Wr.dot(Fin.ravel()).reshape(gr) exprs = () @@ -51,13 +51,13 @@ class OpenClPolynomialInterpolationFilter(PolynomialInterpolationFilterBase, Ope for idx in np.ndindex(*gr): e = Assignment(fout(gr*I+idx), Fout_values[idx]) exprs += (e,) - kname='interpolate_grid_{}'.format(self.polynomial_interpolation_method).lower() + kname = 'interpolate_grid_{}'.format(self.polynomial_interpolation_method).lower() interpolate_grid_kernel, _ = ekg.elementwise_kernel(kname, - *exprs, compute_resolution=self.iter_shape, debug=False) + *exprs, compute_resolution=self.iter_shape, debug=False) exchange_ghosts = self.dFout.exchange_ghosts(build_launcher=True) - - kl = OpenClKernelListLauncher(name=kname) + + kl = OpenClKernelListLauncher(name=kname, profiler=self._profiler) kl += interpolate_grid_kernel kl += exchange_ghosts @@ -76,36 +76,36 @@ class OpenClPolynomialRestrictionFilter(PolynomialRestrictionFilterBase, OpenClO if self.discretized: return super(OpenClPolynomialRestrictionFilter, self).discretize() - dFin = self.dFin + dFin = self.dFin dFout = self.dFout - gr = self.grid_ratio + gr = self.grid_ratio dim = dFin.dim assert dFin.is_scalar assert dFout.is_scalar assert self.subgrid_restrictor.gr == gr - + ekg = self.elementwise_kernel_generator - Rr = self.subgrid_restrictor.Rr / self.subgrid_restrictor.GR + Rr = self.subgrid_restrictor.Rr / self.subgrid_restrictor.GR ghosts = np.asarray(self.subgrid_restrictor.ghosts) - + I = np.asarray(local_indices_symbols[:dim][::-1]) fin, fout = ekg.dfields_to_ndbuffers(dFin, dFout) def gen_inputs(*idx): return fin(gr*I+idx-ghosts) - input_values = np.asarray(tuple(map(gen_inputs, np.ndindex(*Rr.shape)))).reshape(Rr.shape) + input_values = np.asarray(tuple(map(gen_inputs, np.ndindex(*Rr.shape)))).reshape(Rr.shape) output_value = (Rr*input_values).sum() - + e = Assignment(fout(I), output_value) exprs = (e,) - kname='restrict_grid_{}'.format(self.polynomial_interpolation_method).lower() + kname = 'restrict_grid_{}'.format(self.polynomial_interpolation_method).lower() restriction_grid_kernel, _ = ekg.elementwise_kernel(kname, - *exprs, compute_resolution=self.iter_shape, debug=False) + *exprs, compute_resolution=self.iter_shape, debug=False) exchange_ghosts = self.dFout.exchange_ghosts(build_launcher=True) - - kl = OpenClKernelListLauncher(name=kname) + + kl = OpenClKernelListLauncher(name=kname, profiler=self._profiler) kl += restriction_grid_kernel kl += exchange_ghosts @@ -122,33 +122,34 @@ class OpenClSubgridRestrictionFilter(SubgridRestrictionFilterBase, OpenClSymboli OpenCL implementation for lowpass spatial filtering: small grid -> coarse grid using the subgrid method. """ + def __init__(self, **kwds): super(OpenClSubgridRestrictionFilter, self).__init__(**kwds) - Fin = self.Fin + Fin = self.Fin Fout = self.Fout dim = Fin.dim assert Fin.is_scalar assert Fout.is_scalar - + # We do not know the grid ratio and array strides before discretization. # so we defer the initialization of those integers with symbolic constants. symbolic_input_buffer, = self.symbolic_buffers('fine_grid') symbolic_output_buffer = self.Fout.s() - symbolic_grid_ratio = self.symbolic_constants('gr', count=dim, dtype=npw.int32) + symbolic_grid_ratio = self.symbolic_constants('gr', count=dim, dtype=npw.int32) symbolic_input_strides = self.symbolic_constants('is', count=dim, dtype=npw.int32) - symbolic_input_ghosts = self.symbolic_constants('gs', count=dim, dtype=npw.int32) - + symbolic_input_ghosts = self.symbolic_constants('gs', count=dim, dtype=npw.int32) + I = local_indices_symbols[:dim][::-1] read_idx = npw.dot(symbolic_input_strides, npw.add(npw.multiply(symbolic_grid_ratio, I), symbolic_input_ghosts)) expr = Assignment(symbolic_output_buffer, symbolic_input_buffer[read_idx]) self.require_symbolic_kernel('extract_subgrid', expr) - self.symbolic_input_buffer = symbolic_input_buffer + self.symbolic_input_buffer = symbolic_input_buffer self.symbolic_output_buffer = symbolic_output_buffer - self.symbolic_grid_ratio = symbolic_grid_ratio + self.symbolic_grid_ratio = symbolic_grid_ratio self.symbolic_input_strides = symbolic_input_strides - self.symbolic_input_ghosts = symbolic_input_ghosts + self.symbolic_input_ghosts = symbolic_input_ghosts @debug def setup(self, work): @@ -159,12 +160,12 @@ class OpenClSubgridRestrictionFilter(SubgridRestrictionFilterBase, OpenClSymboli self.symbolic_grid_ratio[i].bind_value(self.grid_ratio[i]) self.symbolic_input_strides[i].bind_value(ibuffer.strides[i] // ibuffer.dtype.itemsize) self.symbolic_input_ghosts[i].bind_value(dFin.ghosts[i]) - + super(OpenClSubgridRestrictionFilter, self).setup(work) - + (extract_subgrid, _) = self.symbolic_kernels['extract_subgrid'] exchange_ghosts = self.dFout.exchange_ghosts(build_launcher=True) - + kl = OpenClKernelListLauncher(name='extract_subgrid') kl += extract_subgrid kl += exchange_ghosts @@ -181,44 +182,45 @@ class OpenClSpectralRestrictionFilter(SpectralRestrictionFilterBase, OpenClOpera OpenCL implementation for lowpass spatial filtering: small grid -> coarse grid using the spectral method. """ + def _compute_scaling_coefficient(self): - kernel_generator = OpenClElementwiseKernelGenerator(cl_env=self.cl_env, - kernel_config=self.kernel_config) - + kernel_generator = OpenClElementwiseKernelGenerator(cl_env=self.cl_env, + kernel_config=self.kernel_config) + # Kernels to copy src_slices to dst_slices (windowing operation on frequencies) kl = OpenClKernelListLauncher(name='lowpass_filter') for (src_slc, dst_slc) in zip(*self.fslices): kl += OpenClCopyBufferRectLauncher.from_slices('copy', - src=self.FIN[src_slc], dst=self.FOUT[dst_slc]) - + src=self.FIN[src_slc], dst=self.FOUT[dst_slc]) + # Now we compute the scaling coefficient of the filter - # self.Ft.input_buffer is just a pypencl.Array so we need to use + # self.Ft.input_buffer is just a pypencl.Array so we need to use # the kernel_generator to fill ones and use explicit copy kernels - - # This seems to be the only solution to fill a non C-contiguous + + # This seems to be the only solution to fill a non C-contiguous # OpenClinput_buffer with ones. buf = self.Ft.input_buffer buf, = kernel_generator.arrays_to_symbols(buf) expr = Assignment(buf, 1) fill_ones, _ = kernel_generator.elementwise_kernel('fill_ones', expr) - + fill_ones(queue=self.cl_env.default_queue) self.Ft(simulation=False) - kl(queue=self.cl_env.default_queue) # here we apply unscaled filter + kl(queue=self.cl_env.default_queue) # here we apply unscaled filter self.Bt(simulation=False) # Here we get the coefficient scaling = 1.0 / self.Bt.output_buffer[(0,)*self.FOUT.ndim].get() - + # Now we can finally build the filter scaling kernel fout, = kernel_generator.arrays_to_symbols(self.FOUT) expr = Assignment(fout, scaling*fout) scale, _ = kernel_generator.elementwise_kernel('scale', expr) kl += scale - + # we store the filtering kernel list for the setup step self.filter = functools.partial(kl, queue=self.cl_env.default_queue) - + # finally build ghost exchanger exchange_ghosts = self.dFout.exchange_ghosts(build_launcher=True) if (exchange_ghosts is not None): @@ -226,14 +228,13 @@ class OpenClSpectralRestrictionFilter(SpectralRestrictionFilterBase, OpenClOpera self.exchange_ghosts = exchange_ghosts return scaling - @op_apply def apply(self, **kwds): """Apply spectral filter (which is just a square window centered on low frequencies).""" super(OpenClSpectralRestrictionFilter, self).apply(**kwds) - evt = self.Ft(**kwds) + evt = self.Ft(**kwds) evt = self.filter() - evt = self.Bt(**kwds) + evt = self.Bt(**kwds) if (self.exchange_ghosts is not None): evt = self.exchange_ghosts() diff --git a/hysop/backend/device/opencl/operator/transpose.py b/hysop/backend/device/opencl/operator/transpose.py index 9e513fe47..2bd6d8307 100644 --- a/hysop/backend/device/opencl/operator/transpose.py +++ b/hysop/backend/device/opencl/operator/transpose.py @@ -1,10 +1,10 @@ - from hysop.tools.decorators import debug from hysop.operator.base.transpose_operator import TransposeOperatorBase from hysop.backend.device.opencl.opencl_operator import OpenClOperator, op_apply from hysop.backend.device.opencl.autotunable_kernels.transpose import OpenClAutotunableTransposeKernel from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelListLauncher + class OpenClTranspose(TransposeOperatorBase, OpenClOperator): @debug @@ -18,7 +18,7 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator): def _collect_kernels(self): self._collect_transpose_kernel() - + def _collect_transpose_kernel(self): cl_env = self.cl_env typegen = self.typegen @@ -26,7 +26,7 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator): autotuner_config = self.autotuner_config build_opts = self.build_options() - + input_field = self.din output_field = self.dout is_inplace = self.is_inplace @@ -34,43 +34,43 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator): kernel = OpenClAutotunableTransposeKernel(cl_env, typegen, build_opts, autotuner_config) if is_inplace: - #Only 2D square matrix inplace transposition is supported - compute_inplace = (input_field.dim == 2) - compute_inplace &= all(input_field.resolution[0]==input_field.resolution) + # Only 2D square matrix inplace transposition is supported + compute_inplace = (input_field.dim == 2) + compute_inplace &= all(input_field.resolution[0] == input_field.resolution) else: compute_inplace = False hardcode_arrays = (compute_inplace or not is_inplace) - transpose, _ = kernel.autotune(axes=axes, - hardcode_arrays=hardcode_arrays, - is_inplace=compute_inplace, - input_buffer=input_field.sbuffer, - output_buffer=output_field.sbuffer) - - launcher = OpenClKernelListLauncher(name=transpose.name) + transpose, _ = kernel.autotune(axes=axes, + hardcode_arrays=hardcode_arrays, + is_inplace=compute_inplace, + input_buffer=input_field.sbuffer, + output_buffer=output_field.sbuffer) + + launcher = OpenClKernelListLauncher(name=transpose.name, profiler=self._profiler) for i in xrange(self.nb_components): if compute_inplace: - assert hardcode_arrays + assert hardcode_arrays launcher += transpose.build_launcher(inout_base=input_field.data[i].base_data) elif is_inplace: - assert not hardcode_arrays - kernel_kargs = kernel.build_array_args(**{'in':input_field.data[i], 'out':self.dtmp}) + assert not hardcode_arrays + kernel_kargs = kernel.build_array_args(**{'in': input_field.data[i], 'out': self.dtmp}) launcher += transpose.build_launcher(**kernel_kargs) - launcher.push_copy_device_to_device(varname='tmp', src=self.dtmp, - dst=input_field.data[i]) + launcher.push_copy_device_to_device(varname='tmp', src=self.dtmp, + dst=input_field.data[i]) else: - assert hardcode_arrays - launcher += transpose.build_launcher(in_base=input_field.data[i].base_data, + assert hardcode_arrays + launcher += transpose.build_launcher(in_base=input_field.data[i].base_data, out_base=output_field.data[i].base_data) self._kernel_launcher = launcher - + def enqueue_copy_kernel(self, _dst, _src, queue): pass - + @op_apply def apply(self, **kwds): queue = self.cl_env.default_queue - + kernel_launcher = self._kernel_launcher evt = kernel_launcher(queue=queue) diff --git a/hysop/iterative_method.py b/hysop/iterative_method.py index 9adfb5340..2fce0bfc1 100644 --- a/hysop/iterative_method.py +++ b/hysop/iterative_method.py @@ -115,11 +115,15 @@ class IterativeMethod(Problem): pass @debug - @profile @ready def apply(self, simulation, report_freq=0, dbg=None, **kwds): if self.to_be_skipped(self, simulation=simulation, **kwds): return + self.run_iterations(simulation=simulation, report_freq=report_freq, dbg=dbg, **kwds) + + @profile + def run_iterations(self, simulation, report_freq=0, dbg=None, **kwds): + """This function si meant to clarify the profiling data""" vprint('=== Entering iterative method...') self.stop_criteria.value = self._stop_criteria_reset -- GitLab