diff --git a/hysop/backend/device/opencl/autotunable_kernels/transpose.py b/hysop/backend/device/opencl/autotunable_kernels/transpose.py index 5a040be34a49a0d9021c8d83cb3297c6c267cde4..d9a16ea93ab765e5921216fd3d3f5ae3a06a1eac 100644 --- a/hysop/backend/device/opencl/autotunable_kernels/transpose.py +++ b/hysop/backend/device/opencl/autotunable_kernels/transpose.py @@ -1,4 +1,7 @@ +import math +import itertools as it + from hysop.tools.numpywrappers import npw from hysop.tools.types import check_instance from hysop.tools.misc import upper_pow2, previous_pow2, upper_pow2_or_3 @@ -12,13 +15,18 @@ from hysop.backend.device.kernel_autotuner import KernelGenerationError class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): """Autotunable interface for transpose kernel code generators.""" - def _max_tile_size(self, shape, dtype, tile_indexes): + def _max_tile_size(self, shape, dtype, tile_indexes, is_inplace): """Compute max tile size that will fit in device cache.""" nbytes = dtype.itemsize - max_cache_elems = int(self.usable_cache_bytes_per_wg / nbytes) + factor = 2.0 if is_inplace else 1.0 + max_cache_elems = int(self.usable_cache_bytes_per_wg / (factor*nbytes)) if len(tile_indexes)==2: - max_ts_cache = int(npw.sqrt(max_cache_elems))/2 + x = int(npw.sqrt(max_cache_elems)) + #while x*(x+1) > max_cache_elems: + #x-=1 + # tile offsetting will just trigger the usual cache exception + max_ts_cache = x else: # no cache is used max_ts_cache = npw.inf @@ -27,7 +35,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): max_ts_shape = max(tile_shape) max_tile_size = min(max_ts_cache, max_ts_shape) - return upper_pow2(max_tile_size) + return max_tile_size def autotune(self, is_inplace, input_buffer, output_buffer, @@ -132,8 +140,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): tile_indices = extra_kwds['tile_indices'] dtype = extra_kwds['dtype'] shape = extra_kwds['shape'] + is_inplace = extra_kwds['is_inplace'] last_axe_permuted = extra_kwds['last_axe_permuted'] - max_tile_size = self._max_tile_size(shape, dtype, tile_indices) flag = self.autotuner_config.autotuner_flag vectorization = (1,) @@ -142,21 +150,13 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): use_diagonal_coordinates += (True,) tile_padding = (0,1,) - tile_sizes = (max_tile_size,) - tile_size = max_tile_size - while tile_size>1: - tile_size = previous_pow2(tile_size) - tile_sizes += (tile_size,) - if flag == AutotunerFlags.ESTIMATE: - ntiles = 1 - elif flag == AutotunerFlags.MEASURE: - ntiles = 2 - elif flag == AutotunerFlags.PATIENT: - ntiles = 4 - elif flag == AutotunerFlags.EXHAUSTIVE: - ntiles = len(tile_sizes) - ntiles = min(ntiles, len(tile_sizes)) - tile_sizes = tile_sizes[:ntiles] + max_tile_size = self._max_tile_size(shape, dtype, tile_indices, is_inplace) + imax = int(math.log(max_tile_size, 2)) + jmax = int(math.log(max_tile_size, 3)) if flag in (AutotunerFlags.EXHAUSTIVE,) else 0 + tile_sizes = tuple( int((2**i)*(3**j)) + for (i,j) in it.product(range(0,imax+1), range(0,jmax+1))) + tile_sizes = (max_tile_size,) + tuple(sorted(tile_sizes, reverse=True)) + tile_sizes = tuple(filter(lambda x: (x>=8) and (x<=max_tile_size), tile_sizes)) params.register_extra_parameter('vectorization', vectorization) params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates) diff --git a/hysop/operator/misc.py b/hysop/operator/misc.py index 56faaf95c89ea00d4f0cb58bd59f7538cace8196..3f715b906a0cf289efa2c5e1c91ef1686cccd54e 100644 --- a/hysop/operator/misc.py +++ b/hysop/operator/misc.py @@ -9,7 +9,7 @@ from hysop.fields.continuous_field import Field class Noop(ComputationalGraphOperator): """An operator that does nothing and implements apply as noop.""" - def apply(self): + def apply(self, **kwds): """This is a noop.""" pass