diff --git a/hysop/__init__.py b/hysop/__init__.py index d877538ec4dceb20559c6473c599f7edcb6c6f27..279e86abeb02be2bfb61b0e3665f0412e9c2564c 100644 --- a/hysop/__init__.py +++ b/hysop/__init__.py @@ -20,7 +20,7 @@ __VERBOSE__ = False __DEBUG__ = False __TRACE__ = False __TRACE_WARNINGS__ = False -__KERNEL_DEBUG__ = False +__KERNEL_DEBUG__ = True __PROFILE__ = True __ENABLE_LONG_TESTS__ = "OFF" is "ON" diff --git a/hysop/backend/device/autotunable_kernel.py b/hysop/backend/device/autotunable_kernel.py index b253355bb735da350ff5a10326ad21028b5226c9..f41023994586f0d74eb3c6b68b537599441bd860 100644 --- a/hysop/backend/device/autotunable_kernel.py +++ b/hysop/backend/device/autotunable_kernel.py @@ -643,4 +643,13 @@ class AutotunerWorkConfiguration(object): return False oldval = val return True - + + @abstractmethod + def make_array_offset(self, dim): + pass + @abstractmethod + def make_array_strides(self, dim): + pass + @abstractmethod + def make_array_args(self, **arrays): + pass diff --git a/hysop/backend/device/codegen/kernels/directional_remesh.py b/hysop/backend/device/codegen/kernels/directional_remesh.py index f8f93aa9292cf4524a381f12ed95e94bb39cce47..10a4141e96f17b1925c67e0182e6aa8b1b010903 100644 --- a/hysop/backend/device/codegen/kernels/directional_remesh.py +++ b/hysop/backend/device/codegen/kernels/directional_remesh.py @@ -2,44 +2,37 @@ import contextlib from contextlib import contextmanager -from hysop.deps import math, operator, hashlib - from hysop import __VERBOSE__, __KERNEL_DEBUG__ +from hysop.deps import np, math, operator, hashlib from hysop.tools.misc import Utils, upper_pow2_or_3 from hysop.tools.types import check_instance from hysop.tools.numpywrappers import npw - -from hysop.deps import np from hysop.constants import DirectionLabels, BoundaryCondition, Backend, Precision + from hysop.core.arrays.all import OpenClArray +from hysop.numerics.remesh.remesh import RemeshKernel +from hysop.fields.continuous_field import Field +from hysop.fields.discrete_field import DiscreteFieldView -from hysop.backend.device.opencl import cl +from hysop.backend.device.opencl import cl, clTools, clCharacterize +from hysop.backend.device.opencl.opencl_env import OpenClEnvironment +from hysop.backend.device.opencl.opencl_types import OpenClTypeGen +from hysop.backend.device.opencl.opencl_array_backend import OpenClArrayBackend -from hysop.backend.device.codegen import CodeGeneratorWarning +from hysop.backend.device.codegen import CodeGeneratorWarning +from hysop.backend.device.codegen.base.utils import WriteOnceDict, ArgDict +from hysop.backend.device.codegen.base.statistics import WorkStatistics +from hysop.backend.device.codegen.base.variables import CodegenStruct +from hysop.backend.device.codegen.structs.mesh_info import MeshBaseStruct, MeshInfoStruct from hysop.backend.device.codegen.base.opencl_codegen import OpenClCodeGenerator from hysop.backend.device.codegen.base.kernel_codegen import KernelCodeGenerator from hysop.backend.device.codegen.base.variables import CodegenVariable, \ CodegenVectorClBuiltin, CodegenArray -from hysop.backend.device.opencl import cl, clTools -from hysop.backend.device.opencl.opencl_types import OpenClTypeGen -from hysop.backend.device.codegen.base.utils import WriteOnceDict, ArgDict -from hysop.backend.device.codegen.base.statistics import WorkStatistics - -from hysop.backend.device.codegen.base.variables import CodegenStruct -from hysop.backend.device.codegen.structs.mesh_info import MeshBaseStruct, MeshInfoStruct - -from hysop.backend.device.opencl 
import cl, clCharacterize -from hysop.backend.device.opencl.opencl_env import OpenClEnvironment - -from hysop.fields.continuous_field import Field -from hysop.fields.discrete_field import DiscreteFieldView -from hysop.core.arrays.all import OpenClArrayBackend -from hysop.numerics.remesh.remesh import RemeshKernel -from hysop.constants import DirectionLabels from hysop.backend.device.codegen.functions.directional_remesh import DirectionalRemeshFunction + class DirectionalRemeshKernelGenerator(KernelCodeGenerator): @staticmethod diff --git a/hysop/backend/device/codegen/kernels/transpose.py b/hysop/backend/device/codegen/kernels/transpose.py index 83049193b69f52bf40dbbb9a12a46c83c4749327..e193d89edb51e0a5090c2ec0ee00b3da65493884 100644 --- a/hysop/backend/device/codegen/kernels/transpose.py +++ b/hysop/backend/device/codegen/kernels/transpose.py @@ -1,13 +1,12 @@ -import operator -import numpy as np from contextlib import contextmanager, nested - +from hysop.deps import np, operator from hysop.tools.misc import upper_pow2_or_3, prod from hysop.tools.decorators import static_vars from hysop.tools.numpywrappers import npw from hysop.tools.types import check_instance from hysop.tools.misc import upper_pow2 from hysop.tools.units import bytes2str +from hysop.backend.device.opencl.opencl_array_backend import OpenClArrayBackend from hysop.backend.device.opencl import clCharacterize from hysop.backend.device.opencl.opencl_types import OpenClTypeGen from hysop.backend.device.codegen.base.opencl_codegen import OpenClCodeGenerator @@ -152,7 +151,7 @@ class TransposeKernelGenerator(KernelCodeGenerator): return (sc,dc,tc) def __init__(self, typegen, ctype, vectorization, - axes, tile_size, tile_padding, + axes, tile_size, tile_padding, symbolic_mode, use_diagonal_coordinates = True, is_inplace = False, known_vars = None, @@ -225,7 +224,8 @@ class TransposeKernelGenerator(KernelCodeGenerator): name = TransposeKernelGenerator.codegen_name(is_inplace, axes, ctype, tile_size, tile_padding, vectorization, use_diagonal_coordinates) - kernel_args = self.gen_kernel_arguments(typegen, ctype, Pdim, debug_mode, is_inplace) + kernel_args = self.gen_kernel_arguments(typegen, ctype, Pdim, debug_mode, is_inplace, + known_vars, symbolic_mode) super(self.__class__,self).__init__( name=name, @@ -233,6 +233,7 @@ class TransposeKernelGenerator(KernelCodeGenerator): work_dim=work_dim, known_vars = known_vars, kernel_args = kernel_args, + symbolic_mode=symbolic_mode, **kargs) if debug_mode: @@ -288,22 +289,33 @@ class TransposeKernelGenerator(KernelCodeGenerator): return reqs - def gen_kernel_arguments(self, typegen, ctype, Pdim, debug_mode, is_inplace): + def gen_kernel_arguments(self, typegen, ctype, Pdim, debug_mode, is_inplace, + known_vars, symbolic_mode): _global = OpenClCodeGenerator.default_keywords['global'] tg = typegen + mesh_dim = Pdim kargs = ArgDict() if is_inplace: - kargs['inout'] = CodegenVariable(ctype=ctype, name='inout', - typegen=tg, storage=_global, ptr=True, - ptr_const=True, ptr_restrict=True, nl=True) + data, strides = OpenClArrayBackend.build_codegen_arguments(kargs, name='inout', + known_vars=known_vars, symbolic_mode=symbolic_mode, + storage=self._global, ctype=ctype, typegen=typegen, + mesh_dim=mesh_dim, const=False, ptr_restrict=True) + self.inout_strides = strides + self.inout_data = data else: - kargs['in'] = CodegenVariable(ctype=ctype, name='in', - typegen=tg, storage=_global, const=True, ptr=True, - add_impl_const=True, ptr_restrict=True, nl=True) - kargs['out'] = 
CodegenVariable(ctype=ctype, name='out', - typegen=tg, storage=_global, ptr=True, - ptr_const=True, ptr_restrict=True, nl=True) + in_data, in_strides = OpenClArrayBackend.build_codegen_arguments(kargs, name='in', + known_vars=known_vars, symbolic_mode=symbolic_mode, + storage=self._global, ctype=ctype, typegen=typegen, + mesh_dim=mesh_dim, const=True, ptr_restrict=True) + out_data, out_strides = OpenClArrayBackend.build_codegen_arguments(kargs, name='out', + known_vars=known_vars, symbolic_mode=symbolic_mode, + storage=self._global, ctype=ctype, typegen=typegen, + mesh_dim=mesh_dim, const=False, ptr_restrict=True) + self.in_data = in_data + self.out_data = out_data + self.in_strides = in_strides + self.out_strides = out_strides if debug_mode: n_dbg_arrays = self.n_dbg_arrays @@ -372,12 +384,15 @@ class TransposeKernelGenerator(KernelCodeGenerator): S = s.vars['shape'] if is_inplace: - _inout = s.vars['inout'] - _in = _inout - _out = _inout + _in = self.inout_data + _out = self.inout_data + _in_strides = self.inout_strides + _out_strides = self.inout_strides else: - _in = s.vars['in'] - _out = s.vars['out'] + _in = self.in_data + _out = self.out_data + _in_strides = self.in_strides + _out_strides = self.out_strides if debug_mode: dbg = [ s.vars['dbg{}'.format(i)] for i in xrange(n_dbg_arrays) ] @@ -420,29 +435,25 @@ class TransposeKernelGenerator(KernelCodeGenerator): i = pdim-1-k j = pdim-1-axes[k] if i==pdim-1: - tile_offset_in = '{}'.format(idx[i]) - tile_offset_out = '{}'.format(idx[j]) + tile_offset_in = '{}*{}'.format(_in_strides[i], idx[i]) + tile_offset_out = '{}*{}'.format(_out_strides[i], idx[j]) else: - tile_offset_in = '({}*{}+{})'.format(tile_offset_in, S[i], idx[i]) - tile_offset_out = '({}*{}+{})'.format(tile_offset_out, S[j], idx[j]) + tile_offset_in += ' $+ {}*{}'.format(_in_strides[i], idx[i]) + tile_offset_out += ' $+ {}*{}'.format(_out_strides[i], idx[j]) if i in tile_indexes: if ki==tdim-1: - local_offset_in = '{}'.format(lidx[ki]) + local_offset_in = '{}*{}'.format(_in_strides[i], lidx[ki]) else: - local_offset_in = '({}*{}+{})'.format(local_offset_in, S[i], lidx[ki]) + local_offset_in += ' $+ {}*{}'.format(_in_strides[i], lidx[ki]) ki-=1 - elif local_offset_in != '': - local_offset_in = '{}*{}'.format(local_offset_in, S[i]) if j in tile_indexes: if kj==tdim-1: - local_offset_out = '{}'.format(lidx[kj]) + local_offset_out = '{}*{}'.format(_out_strides[i], lidx[kj]) else: - local_offset_out = '({}*{}+{})'.format(local_offset_out, S[j], lidx[kj]) + local_offset_out += ' $+ {}*{}'.format(_out_strides[i], lidx[kj]) kj -= 1 - elif local_offset_out != '': - local_offset_out = '{}*{}'.format(local_offset_out, S[j]) assert ki==-1 assert kj==-1 @@ -464,13 +475,13 @@ class TransposeKernelGenerator(KernelCodeGenerator): else: loc_id = '({}*{}+{})'.format(loc_id, local_size[i], local_id[i]) - tile_offset_in = CodegenVariable('tile_offset_in', 'int', tg, + tile_offset_in = CodegenVariable('tile_offset_in', 'ulong', tg, init=tile_offset_in, const=True) - tile_offset_out = CodegenVariable('tile_offset_out', 'int', tg, + tile_offset_out = CodegenVariable('tile_offset_out', 'ulong', tg, init=tile_offset_out, const=True) - local_offset_in = CodegenVariable('local_offset_in', 'int', tg, + local_offset_in = CodegenVariable('local_offset_in', 'ulong', tg, init=local_offset_in, const=True) - local_offset_out = CodegenVariable('local_offset_out', 'int', tg, + local_offset_out = CodegenVariable('local_offset_out', 'ulong', tg, init=local_offset_out, const=True) TID = CodegenVariable('TID', 
'int', tg, const=True, @@ -568,6 +579,10 @@ class TransposeKernelGenerator(KernelCodeGenerator): s.decl_aligned_vars(global_id, local_id, group_id, global_size, local_size, group_size, const=True) + ptrs = (_in,) + if not is_inplace: + ptrs+=(_out,) + s.decl_aligned_vars(*ptrs) s.jumpline() nwork.declare(s) ntiles.declare(s) @@ -696,7 +711,7 @@ to prevent memory camping that may occur during global input read or output writ if __name__ == '__main__': from hysop.backend.device.codegen.base.test import _test_typegen tg = _test_typegen('float') - ek = TransposeKernelGenerator(typegen=tg, ctype='char', vectorization=2, + ek = TransposeKernelGenerator(typegen=tg, ctype='short', vectorization=2, axes=(2,1,0,4,3), tile_size=8, tile_padding=1, is_inplace=False, diff --git a/hysop/backend/device/kernel_autotuner.py b/hysop/backend/device/kernel_autotuner.py index 1417617e49e9012b6fcd46a257d3cd94e7806183..a5e0fb0efae54999a66ffff340eda6b979cc6495 100644 --- a/hysop/backend/device/kernel_autotuner.py +++ b/hysop/backend/device/kernel_autotuner.py @@ -346,6 +346,13 @@ class KernelAutotuner(object): raise ValueError(msg) args_list = [None,]*len(args_mapping) + arg_indices = set(ka[0] for ka in args_mapping.values()) + if arg_indices != set(xrange(len(arg_indices))): + msg='Illformed argument position mapping:\n' + msg+='\n'.join(' >argument {}: {}'.format(argpos, argname) for (argname, argpos) + in zip(args_mapping.keys(), arg_indices)) + msg+='\nExpected contiguous integer argument positions.' + raise ValueError(msg) for (arg_name, arg_value) in kernel_args.iteritems(): if (arg_name not in args_mapping): msg='Unknown argument {}, valid ones are {}.' diff --git a/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py b/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py index 5a4404e34265227e1d33efa6c71de4879f9eefa8..8cabda7e2f9359fa92a5dbadb3982cd26cbb0b04 100644 --- a/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py +++ b/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py @@ -122,28 +122,17 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): msg=msg.format(dsin.name, dsin.dtype, dsout.name, dsout.dtype, precision.__name__) raise NotImplementedError(msg) - typegen = self.cl_env.typegen - ndim = upper_pow2_or_3(position.dim) - ivecn = typegen.uintn(ndim) - def make_ivecn(bstrides, dtype): - msg='Invalid strides {} for dtype {} (itemsize={}).'.format(bstrides, - dtype.__class__.__name__, dtype.itemsize) - assert (npw.mod(bstrides, dtype.itemsize) == 0).all(), msg - data = typegen.make_uintn( - vals=tuple(x//dtype.itemsize for x in bstrides[::-1]), - n=ndim, dval=0) - if hardcode_arrays: - return data.tolist()[:ndim] - else: - return data + make_offset, offset_dtype = self.make_array_offset() + make_strides, strides_dtype = self.make_array_strides(position.dim, + hardcode_arrays=hardcode_arrays) kernel_args = {} known_args = {} target_args = known_args if hardcode_arrays else kernel_args kernel_args['position_base'] = position[0].base_data - target_args['position_strides'] = make_ivecn(position[0].strides, position.dtype) - target_args['position_offset'] = npw.uint64(position[0].offset) + target_args['position_strides'] = make_strides(position[0].strides, position.dtype) + target_args['position_offset'] = make_offset(position[0].offset, position.dtype) mesh_info_vars = { 'position_mesh_info': self.mesh_info('position_mesh_info', position.mesh) } @@ -155,8 +144,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): for 
j in xrange(dsinout.nb_components): prefix = 'S{}_{}_inout'.format(i,j) kernel_args[prefix+'_base'] = dsinout[j].base_data - target_args[prefix+'_strides'] = make_ivecn(dsinout[j].strides, dsinout.dtype) - target_args[prefix+'_offset'] = npw.uint64(dsinout[j].offset) + target_args[prefix+'_strides'] = make_strides(dsinout[j].strides, dsinout.dtype) + target_args[prefix+'_offset'] = make_offset(dsinout[j].offset, dsinout.dtype) arg_index += 1 + 2*(1-hardcode_arrays) assert i == nfields-1 else: @@ -166,8 +155,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): for j in xrange(dsin.nb_components): prefix = 'S{}_{}_in'.format(i,j) kernel_args[prefix+'_base'] = dsin[j].base_data - target_args[prefix+'_strides'] = make_ivecn(dsin[j].strides, dsin.dtype) - target_args[prefix+'_offset'] = npw.uint64(dsin[j].offset) + target_args[prefix+'_strides'] = make_strides(dsin[j].strides, dsin.dtype) + target_args[prefix+'_offset'] = make_offset(dsin[j].offset, dsin.dtype) arg_index += 1 + 2*(1-hardcode_arrays) assert i == nfields-1 for (i,dsout) in enumerate(scalars_out): @@ -176,8 +165,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): for j in xrange(dsout.nb_components): prefix = 'S{}_{}_out'.format(i,j) kernel_args[prefix+'_base'] = dsout[j].base_data - target_args[prefix+'_strides'] = make_ivecn(dsout[j].strides, dsout.dtype) - target_args[prefix+'_offset'] = npw.uint64(dsout[j].offset) + target_args[prefix+'_strides'] = make_strides(dsout[j].strides, dsout.dtype) + target_args[prefix+'_offset'] = make_offset(dsout[j].offset, dsout.dtype) arg_index += 1 + 2*(1-hardcode_arrays) assert i == nfields-1 assert len(kernel_args) == arg_index @@ -192,7 +181,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): force_atomics=force_atomics, min_nparticles=min_nparticles, ftype=ftype, scalar_cfl=scalar_cfl, kernel_args=kernel_args, mesh_info_vars=mesh_info_vars, work_dim=work_dim, work_size=work_size, min_wg_size=min_wg_size, - known_args=known_args, hardcode_arrays=hardcode_arrays, ivecn=ivecn, **kwds) + known_args=known_args, hardcode_arrays=hardcode_arrays, + offset_dtype=offset_dtype, strides_dtype=strides_dtype, **kwds) def compute_args_mapping(self, extra_kwds, extra_parameters): @@ -209,7 +199,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): scalars_in = extra_kwds['scalars_in'] scalars_out = extra_kwds['scalars_out'] nscalars = extra_kwds['nscalars'] - ivecn = extra_kwds['ivecn'] + strides_dtype = extra_kwds['strides_dtype'] + offset_dtype = extra_kwds['offset_dtype'] hardcode_arrays = extra_kwds['hardcode_arrays'] args_mapping = {} @@ -218,8 +209,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): args_mapping['position_base'] = (0, cl.MemoryObjectHolder) arg_index += 1 if not hardcode_arrays: - args_mapping['position_strides'] = (1, ivecn) - args_mapping['position_offset'] = (2, npw.uint64) + args_mapping['position_strides'] = (1, strides_dtype) + args_mapping['position_offset'] = (2, offset_dtype) arg_index += 2 if is_inplace: @@ -229,8 +220,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): args_mapping[prefix+'_base'] = (arg_index, cl.MemoryObjectHolder) arg_index+=1 if not hardcode_arrays: - args_mapping[prefix+'_strides'] = (arg_index+0, ivecn) - args_mapping[prefix+'_offset'] = (arg_index+1, npw.uint64) + args_mapping[prefix+'_strides'] = (arg_index+0, strides_dtype) + args_mapping[prefix+'_offset'] = (arg_index+1, offset_dtype) arg_index 
+= 2 else: for (i,dsin) in enumerate(scalars_in): @@ -239,8 +230,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): args_mapping[prefix+'_base'] = (arg_index, cl.MemoryObjectHolder) arg_index += 1 if not hardcode_arrays: - args_mapping[prefix+'_strides'] = (arg_index+0, ivecn) - args_mapping[prefix+'_offset'] = (arg_index+1, npw.uint64) + args_mapping[prefix+'_strides'] = (arg_index+0, strides_dtype) + args_mapping[prefix+'_offset'] = (arg_index+1, offset_dtype) arg_index += 2 for (i,dsout) in enumerate(scalars_out): for j in xrange(dsout.nb_components): @@ -248,8 +239,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel): args_mapping[prefix+'_base'] = (arg_index, cl.MemoryObjectHolder) arg_index += 1 if not hardcode_arrays: - args_mapping[prefix+'_strides'] = (arg_index+0, ivecn) - args_mapping[prefix+'_offset'] = (arg_index+1, npw.uint64) + args_mapping[prefix+'_strides'] = (arg_index+0, strides_dtype) + args_mapping[prefix+'_offset'] = (arg_index+1, offset_dtype) arg_index += 2 assert len(args_mapping)==arg_index assert arg_index == (1+2*(1-hardcode_arrays))*(1+(2-is_inplace)*nscalars) diff --git a/hysop/backend/device/opencl/autotunable_kernels/transpose.py b/hysop/backend/device/opencl/autotunable_kernels/transpose.py index 78d282253f3b14bb3a8dda1b03f6a6e7dd233b07..d03e4232757c6fa79f4d682771ef56382a67638f 100644 --- a/hysop/backend/device/opencl/autotunable_kernels/transpose.py +++ b/hysop/backend/device/opencl/autotunable_kernels/transpose.py @@ -1,7 +1,7 @@ from hysop.tools.numpywrappers import npw from hysop.tools.types import check_instance -from hysop.tools.misc import upper_pow2, previous_pow2 +from hysop.tools.misc import upper_pow2, previous_pow2, upper_pow2_or_3 from hysop.tools.units import bytes2str from hysop.constants import AutotunerFlags from hysop.backend.device.opencl import cl, clTools @@ -31,7 +31,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): def autotune(self, is_inplace, input_field, output_field, - axes, name=None, **kwds): + axes, hardcode_arrays, + name=None, **kwds): """Autotune this kernel with specified axes, inputs and outputs.""" check_instance(axes, tuple, values=int) @@ -51,18 +52,46 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): assert axes != tuple(range(dim)) # check if is_inplace is allowed - assert (is_inplace == (input_field.dfield == output_field.dfield)) if is_inplace: - #Only 2D square matrix inplace transposition is supported - compute_inplace = (dim == 2) - compute_inplace &= all(shape[0]==shape) - else: - compute_inplace = False + can_compute_inplace = (dim == 2) + can_compute_inplace &= all(shape[0]==shape) + msg='Inplace was specified but this is only possible for 2D square arrays.' + if not can_compute_inplace: + raise ValueError(msg) + assert input_field.dfield == output_field.dfield + + # get vector size for strides + make_offset, offset_dtype = self.make_array_offset() + make_strides, strides_dtype = self.make_array_strides(input_field.dim, + hardcode_arrays=hardcode_arrays) + + # check that all component share strides and offsets + if hardcode_arrays: + assert all(npw.array_equal(input_field[i].offset, input_field[0].offset) + for i in xrange(input_field.nb_components)), 'Cannot hardcode mismatching array offsets.' + assert all(npw.array_equal(input_field[i].strides, input_field[0].strides) + for i in xrange(input_field.nb_components)), 'Cannot hardcode mismatching array strides.' 
+ assert all(npw.array_equal(output_field[i].offset, output_field[0].offset) + for i in xrange(output_field.nb_components)), 'Cannot hardcode mismatching array offsets.' + assert all(npw.array_equal(output_field[i].strides, output_field[0].strides) + for i in xrange(output_field.nb_components)), 'Cannot hardcode mismatching array strides.' + + kernel_args = {} + known_args = {} + target_args = known_args if hardcode_arrays else kernel_args - if compute_inplace: - kernel_args = dict(inout=input_field(0).data) + if is_inplace: + kernel_args['inout_base'] = input_field(0).data + target_args['inout_strides'] = make_strides(input_field(0).strides, input_field.dtype) + target_args['inout_offset'] = make_offset(input_field(0).offset, input_field.dtype) else: - kernel_args = dict(input=input_field(0).data, output=output_field(0).data) + kernel_args['in_base'] = input_field(0).data + target_args['in_strides'] = make_strides(input_field(0).strides, input_field.dtype) + target_args['in_offset'] = make_offset(input_field(0).offset, input_field.dtype) + + kernel_args['out_base'] = output_field(0).data + target_args['out_strides'] = make_strides(output_field(0).strides, output_field.dtype) + target_args['out_offset'] = make_offset(output_field(0).offset, output_field.dtype) if (name is None): name = 'transpose_{}_[{}]_{}'.format(ctype, @@ -75,11 +104,11 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): TransposeKernelGenerator.characterize_permutation(shape, axes, self.max_device_work_dim()) - # keyword arguments will be agregated into extra_kwds dictionnary return super(OpenClAutotunableTransposeKernel, self).autotune(name=name, kernel_args=kernel_args, - compute_inplace=compute_inplace, + known_args=known_args, hardcode_arrays=hardcode_arrays, + offset_dtype=offset_dtype, strides_dtype=strides_dtype, axes=axes, dtype=dtype, ctype=ctype, @@ -89,6 +118,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): tile_indices=tile_indices, work_dim=work_dim, work_size=work_shape, + is_inplace=is_inplace, last_axe_permuted=last_axe_permuted, **kwds) def compute_parameters(self, extra_kwds): @@ -188,7 +218,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): ## Extract usefull variables axes = extra_kwds['axes'] ctype = extra_kwds['ctype'] - is_inplace = extra_kwds['compute_inplace'] + is_inplace = extra_kwds['is_inplace'] + known_args = extra_kwds['known_args'] ## Get compile time OpenCL known variables known_vars = super(OpenClAutotunableTransposeKernel, self).generate_kernel_src( @@ -196,7 +227,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): local_work_size=local_work_size, extra_parameters=extra_parameters, extra_kwds=extra_kwds) - + known_vars.update(known_args) known_vars['shape'] = self.to_vecn(extra_kwds['shape'], 0) ## Generate OpenCL source code @@ -214,30 +245,31 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): return (kernel_name, kernel_src) def compute_args_mapping(self, extra_kwds, extra_parameters): - if extra_kwds['compute_inplace']: - args_mapping = { 'inout': (0, cl.MemoryObjectHolder) } + args_mapping = {} + offset_dtype = extra_kwds['offset_dtype'] + strides_dtype = extra_kwds['strides_dtype'] + hardcode_arrays = extra_kwds['hardcode_arrays'] + if extra_kwds['is_inplace']: + args_mapping['inout_base'] = (0, cl.MemoryObjectHolder) + if not hardcode_arrays: + args_mapping['inout_strides'] = (1, strides_dtype) + args_mapping['inout_offset'] = (2, offset_dtype) else: - args_mapping = { 'input' : (0, 
cl.MemoryObjectHolder), - 'output': (1, cl.MemoryObjectHolder) } - return args_mapping - - def format_best_candidate(self, **kwds): - """ - Post treatment callback for autotuner results. - Transform autotuner results in user friendly kernel wrappers. + args_mapping['in_base'] = (0, cl.MemoryObjectHolder) + if not hardcode_arrays: + args_mapping['in_strides'] = (1, strides_dtype) + args_mapping['in_offset'] = (2, offset_dtype) - Return a OpenClKernel with default_queue and default_args set to None. - Only default_global_size, default_local_size, and args_mapping are set. + args_mapping['out_base'] = (1 + 2*(not hardcode_arrays), cl.MemoryObjectHolder) + if not hardcode_arrays: + args_mapping['out_strides'] = (4, strides_dtype) + args_mapping['out_offset'] = (5, offset_dtype) + return args_mapping - Use the build_launcher method to build OpenClKernelLauncher from this OpenClKernel. - """ - res = super(OpenClAutotunableTransposeKernel, self).format_best_candidate(**kwds) - compute_inplace = kwds['extra_kwds']['compute_inplace'] - return res + (compute_inplace,) - def hash_extra_kwds(self, extra_kwds): """Hash extra_kwds dictionnary for caching purposes.""" - return hash((extra_kwds['ctype'], + return self.custom_hash(extra_kwds['ctype'], extra_kwds['axes'], - tuple(extra_kwds['shape'].tolist()), - extra_kwds['compute_inplace'])) + extra_kwds['shape'], + extra_kwds['is_inplace'], + extra_kwds['known_args']) diff --git a/hysop/backend/device/opencl/opencl_array_backend.py b/hysop/backend/device/opencl/opencl_array_backend.py index 61dea8d155234c29129162f2f18371f9e34911e5..29016dd56b484c2da549dad6f36aaecf8705a4e1 100644 --- a/hysop/backend/device/opencl/opencl_array_backend.py +++ b/hysop/backend/device/opencl/opencl_array_backend.py @@ -3175,7 +3175,8 @@ class OpenClArrayBackend(ArrayBackend): offset_str = typegen.dump(np.uint64(known_vars[offset])) else: offset_str = offset - init = '({})(({})({})+{})'.format(ctype_alias, char_alias, base, offset_str) + # init = '({})(({})({})+{})'.format(ctype_alias, char_alias, base, offset_str) + init = '{} $+ {}'.format(base, offset_str) array = CodegenVariable(name=name, typegen=typegen, ctype=ctype, ptr=ptr, const=const, diff --git a/hysop/backend/device/opencl/opencl_autotunable_kernel.py b/hysop/backend/device/opencl/opencl_autotunable_kernel.py index a261662ca08295bb7c2d4f1d0f4e141a76127ac5..5d84e363b0748c7270045ebea65f1309f92d3ee8 100644 --- a/hysop/backend/device/opencl/opencl_autotunable_kernel.py +++ b/hysop/backend/device/opencl/opencl_autotunable_kernel.py @@ -10,10 +10,11 @@ from hysop.tools.units import bytes2str from hysop.backend.device.kernel_autotuner import KernelGenerationError from hysop.backend.device.autotunable_kernel import AutotunableKernel, AutotunerWorkConfiguration -from hysop.backend.device.opencl import cl, clTools, clCharacterize +from hysop.backend.device.opencl import cl, clTools, clCharacterize, clArray from hysop.backend.device.opencl.opencl_env import OpenClEnvironment from hysop.backend.device.opencl.opencl_types import OpenClTypeGen from hysop.backend.device.opencl.opencl_kernel import OpenClKernel +from hysop.backend.device.opencl.opencl_array import OpenClArray from hysop.backend.device.opencl.opencl_kernel_statistics import OpenClKernelStatistics class OpenClAutotunableKernel(AutotunableKernel): @@ -211,4 +212,52 @@ class OpenClAutotunableKernel(AutotunableKernel): res = npw.full(shape=(vsize,), dtype=vec.dtype, fill_value=extend) res[:vec.size] = vec return res + + def make_array_offset(self): + offset_dtype = 
npw.uint64 + def make_offset(offset, dtype): + """Build an offset in number of elements instead of bytes.""" + msg='Unaligned offset {} for dtype {} (itemsize={}).'.format(offset, + dtype, dtype.itemsize) + assert (offset % dtype.itemsize) == 0 + return offset_dtype(offset // dtype.itemsize) + return make_offset, offset_dtype + + def make_array_strides(self, dim, hardcode_arrays): + """Build array strides in number of elements instead of bytes.""" + typegen = self.cl_env.typegen + ndim = upper_pow2_or_3(dim) + strides_dtype = typegen.uintn(ndim) + def make_strides(bstrides, dtype): + msg='Invalid strides {} for dtype {} (itemsize={}).'.format(bstrides, + dtype.__class__.__name__, dtype.itemsize) + assert (npw.mod(bstrides, dtype.itemsize) == 0).all(), msg + data = typegen.make_uintn( + vals=tuple(x//dtype.itemsize for x in bstrides[::-1]), + n=ndim, dval=0) + if hardcode_arrays: + return data.tolist()[:ndim] + else: + return data + return make_strides, strides_dtype + + def build_array_args(self, hardcode_arrays=False, **arrays): + kernel_args = {} + for name, data in arrays.iteritems(): + check_instance(data, (OpenClArray, clArray.Array)) + base = '{}_base'.format(name) + kernel_args[base] = data.base_data + if not hardcode_arrays: + dim = data.ndim + make_offset, _ = self.make_array_offset() + make_strides, _ = self.make_array_strides(dim=dim, + hardcode_arrays=hardcode_arrays) + offset = '{}_offset'.format(name) + strides = '{}_strides'.format(name) + kernel_args[offset] = make_offset(data.offset, data.dtype) + kernel_args[strides] = make_strides(data.strides, data.dtype) + return kernel_args + + + diff --git a/hysop/backend/device/opencl/opencl_kernel.py b/hysop/backend/device/opencl/opencl_kernel.py index 3154d6398a33c8f2b6b24c88b9993e8e40fe88f7..360ebe3c060c34c8c57c81706f24ef4ebbe46d8f 100644 --- a/hysop/backend/device/opencl/opencl_kernel.py +++ b/hysop/backend/device/opencl/opencl_kernel.py @@ -100,7 +100,7 @@ class OpenClKernel(object): raise ValueError(msg) for argname, (argpos, argtype) in args_mapping.iteritems(): assert isinstance(argpos, int) - if not isinstance(argtype, type): + if not isinstance(argtype, (type, npw.dtype)): check_instance(argtype, tuple, values=type) if argname in default_args: argval = default_args[argname] @@ -188,7 +188,20 @@ class OpenClKernel(object): msg=msg.format(arg_name, ', '.join(args_mapping.keys())) raise ValueError(msg) (arg_index, arg_types) = args_mapping[arg_name] - if not isinstance(arg_value, arg_types): + if isinstance(arg_types, npw.dtype): + msg=None + if not isinstance(arg_value, npw.ndarray): + msg='Argument {} at position {} should be a np.ndarray, got a {}.' + msg=msg.format(arg_name, arg_index, type(arg_value)) + elif not arg_value.dtype == arg_types: + msg='Argument {} at position {} is a np.ndarray of wrong dtype, got a {}, expected a {}.' + msg=msg.format(arg_name, arg_index, type(arg_value), arg_types) + elif not arg_value.size == 1: + msg='Argument {} at position {} is not a scalar np.ndarray, shape={}, size={}.' + msg=msg.format(arg_name, arg_index, arg_value.shape, arg_value.size) + if (msg is not None): + raise ValueError(msg) + elif not isinstance(arg_value, arg_types): msg='Argument {} at position {} should be of type {} but got a {}.' 
msg=msg.format(arg_name, arg_index, arg_types, type(arg_value)) raise TypeError(msg) diff --git a/hysop/backend/device/opencl/operator/directional/advection_dir.py b/hysop/backend/device/opencl/operator/directional/advection_dir.py index fd7acfe208973b1ad40b803c1c5aff5510f7034e..cd5c9b28a2fb53d6b81aeb23e35a3c9259cbd13d 100644 --- a/hysop/backend/device/opencl/operator/directional/advection_dir.py +++ b/hysop/backend/device/opencl/operator/directional/advection_dir.py @@ -66,8 +66,8 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper self.relax_min_particles = relax_min_particles self.remesh_criteria_eps = remesh_criteria_eps - self._force_autotuner_verbose = False - self._force_autotuner_debug = False + self._force_autotuner_verbose = None + self._force_autotuner_debug = None @debug def get_work_properties(self): diff --git a/hysop/backend/device/opencl/operator/transpose.py b/hysop/backend/device/opencl/operator/transpose.py index ba9a415a6551713196d2a078d53289bb393a6d4c..6c20bda7d6a45d0810f0a5b9b9a1d395c0417b2d 100644 --- a/hysop/backend/device/opencl/operator/transpose.py +++ b/hysop/backend/device/opencl/operator/transpose.py @@ -11,10 +11,6 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator): def __init__(self, **kwds): super(OpenClTranspose, self).__init__(**kwds) - @debug - def discretize(self): - super(OpenClTranspose,self).discretize() - @debug def setup(self, work): super(OpenClTranspose, self).setup(work) @@ -36,21 +32,33 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator): kernel = OpenClAutotunableTransposeKernel(cl_env, build_opts, autotuner_config) - (transpose, _, compute_inplace) = kernel.autotune(axes=axes, - is_inplace=is_inplace, input_field=input_field, output_field=output_field) + if is_inplace: + #Only 2D square matrix inplace transposition is supported + compute_inplace = (input_field.dim == 2) + compute_inplace &= all(input_field.shape[0]==input_field.shape) + else: + compute_inplace = False + + hardcode_arrays = (compute_inplace or not is_inplace) + transpose, _ = kernel.autotune(axes=axes, force_debug=True, + hardcode_arrays=hardcode_arrays, + is_inplace=compute_inplace, input_field=input_field, output_field=output_field) launcher = OpenClKernelListLauncher(name=transpose.name) for i in xrange(self.nb_components): if compute_inplace: - launcher += transpose.build_launcher(inout=input_field[i].data) + assert hardcode_arrays + launcher += transpose.build_launcher(inout_base=input_field[i].base_data) elif is_inplace: - launcher += transpose.build_launcher(input=input_field[i].data, - output=self.dtmp.data) + assert not hardcode_arrays + kernel_kargs = kernel.build_array_args(**{'in':input_field[i], 'out':self.dtmp}) + launcher += transpose.build_launcher(**kernel_kargs) launcher.push_copy_device_to_device(varname='tmp', src=self.dtmp, dst=input_field[i]) else: - launcher += transpose.build_launcher(input=input_field[i].data, - output=output_field[i].data) + assert hardcode_arrays + launcher += transpose.build_launcher(in_base=input_field[i].base_data, + out_base=output_field[i].base_data) self._kernel_launcher = launcher def enqueue_copy_kernel(self, _dst, _src, queue):
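
A minimal sketch of the byte-to-element conversion performed by the new make_array_offset()/make_array_strides() helpers introduced above in opencl_autotunable_kernel.py, using plain numpy in place of the HySoP typegen. The local upper_pow2_or_3 reimplementation, the names element_offset/element_strides and the uint32 vector dtype are illustrative assumptions, not the HySoP API; the patch itself builds the strides vector with typegen.make_uintn and a typegen-provided uintN dtype.

    # Sketch only: mirrors how the patch turns byte offsets/strides into
    # element offsets/strides before passing them as kernel arguments.
    import numpy as np

    def upper_pow2_or_3(n):
        # Same intent as hysop.tools.misc.upper_pow2_or_3 (assumed here):
        # 3 stays 3, everything else rounds up to the next power of two.
        if n == 3:
            return 3
        p = 1
        while p < n:
            p *= 2
        return p

    def element_offset(byte_offset, dtype):
        # Offsets are passed to the kernel in elements, not bytes.
        assert byte_offset % dtype.itemsize == 0, 'unaligned offset'
        return np.uint64(byte_offset // dtype.itemsize)

    def element_strides(byte_strides, dtype, dim):
        # Strides are reversed (contiguous axis first) and zero-padded to a
        # power-of-two (or 3) vector length, matching an OpenCL uintN value.
        assert all(s % dtype.itemsize == 0 for s in byte_strides), 'unaligned strides'
        n = upper_pow2_or_3(dim)
        strides = [s // dtype.itemsize for s in byte_strides[::-1]]
        strides += [0] * (n - len(strides))
        return np.asarray(strides, dtype=np.uint32)

    if __name__ == '__main__':
        a = np.zeros((17, 32), dtype=np.float32)
        view = a[1:, 2:]  # non-contiguous view with a non-zero byte offset
        byte_offset = view.__array_interface__['data'][0] - a.__array_interface__['data'][0]
        print(element_offset(byte_offset, view.dtype))            # 34 elements
        print(element_strides(view.strides, view.dtype, a.ndim))  # [ 1 32]

In the patched kernels these per-array values are either hardcoded as compile-time known_vars (hardcode_arrays=True) or bound at launch time through args_mapping entries such as ('name_base', 'name_strides', 'name_offset'), which is why the autotuner and OpenClKernel argument checks above now also accept numpy dtypes as argument types.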