Skip to content
Snippets Groups Projects
Commit b4db021e authored by Jean-Baptiste Keck
Browse files

fix misc and permutation regression

parent 2eb2e61d
No related branches found
No related tags found
1 merge request: !16 "MPI operators"
Pipeline #29408 failed
import math
import itertools as it
from hysop.tools.numpywrappers import npw from hysop.tools.numpywrappers import npw
from hysop.tools.types import check_instance from hysop.tools.types import check_instance
from hysop.tools.misc import upper_pow2, previous_pow2, upper_pow2_or_3 from hysop.tools.misc import upper_pow2, previous_pow2, upper_pow2_or_3
...@@ -12,13 +15,18 @@ from hysop.backend.device.kernel_autotuner import KernelGenerationError ...@@ -12,13 +15,18 @@ from hysop.backend.device.kernel_autotuner import KernelGenerationError
class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
"""Autotunable interface for transpose kernel code generators.""" """Autotunable interface for transpose kernel code generators."""
def _max_tile_size(self, shape, dtype, tile_indexes): def _max_tile_size(self, shape, dtype, tile_indexes, is_inplace):
"""Compute max tile size that will fit in device cache.""" """Compute max tile size that will fit in device cache."""
nbytes = dtype.itemsize nbytes = dtype.itemsize
max_cache_elems = int(self.usable_cache_bytes_per_wg / nbytes) factor = 2.0 if is_inplace else 1.0
max_cache_elems = int(self.usable_cache_bytes_per_wg / (factor*nbytes))
if len(tile_indexes)==2: if len(tile_indexes)==2:
max_ts_cache = int(npw.sqrt(max_cache_elems))/2 x = int(npw.sqrt(max_cache_elems))
#while x*(x+1) > max_cache_elems:
#x-=1
# tile offsetting will just trigger the usual cache exception
max_ts_cache = x
else: else:
# no cache is used # no cache is used
max_ts_cache = npw.inf max_ts_cache = npw.inf
...@@ -27,7 +35,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): ...@@ -27,7 +35,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
max_ts_shape = max(tile_shape) max_ts_shape = max(tile_shape)
max_tile_size = min(max_ts_cache, max_ts_shape) max_tile_size = min(max_ts_cache, max_ts_shape)
return upper_pow2(max_tile_size) return max_tile_size
def autotune(self, is_inplace, def autotune(self, is_inplace,
input_buffer, output_buffer, input_buffer, output_buffer,
...@@ -132,8 +140,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): ...@@ -132,8 +140,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
tile_indices = extra_kwds['tile_indices'] tile_indices = extra_kwds['tile_indices']
dtype = extra_kwds['dtype'] dtype = extra_kwds['dtype']
shape = extra_kwds['shape'] shape = extra_kwds['shape']
is_inplace = extra_kwds['is_inplace']
last_axe_permuted = extra_kwds['last_axe_permuted'] last_axe_permuted = extra_kwds['last_axe_permuted']
max_tile_size = self._max_tile_size(shape, dtype, tile_indices)
flag = self.autotuner_config.autotuner_flag flag = self.autotuner_config.autotuner_flag
vectorization = (1,) vectorization = (1,)
...@@ -142,21 +150,13 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): ...@@ -142,21 +150,13 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
use_diagonal_coordinates += (True,) use_diagonal_coordinates += (True,)
tile_padding = (0,1,) tile_padding = (0,1,)
tile_sizes = (max_tile_size,) max_tile_size = self._max_tile_size(shape, dtype, tile_indices, is_inplace)
tile_size = max_tile_size imax = int(math.log(max_tile_size, 2))
while tile_size>1: jmax = int(math.log(max_tile_size, 3)) if flag in (AutotunerFlags.EXHAUSTIVE,) else 0
tile_size = previous_pow2(tile_size) tile_sizes = tuple( int((2**i)*(3**j))
tile_sizes += (tile_size,) for (i,j) in it.product(range(0,imax+1), range(0,jmax+1)))
if flag == AutotunerFlags.ESTIMATE: tile_sizes = (max_tile_size,) + tuple(sorted(tile_sizes, reverse=True))
ntiles = 1 tile_sizes = tuple(filter(lambda x: (x>=8) and (x<=max_tile_size), tile_sizes))
elif flag == AutotunerFlags.MEASURE:
ntiles = 2
elif flag == AutotunerFlags.PATIENT:
ntiles = 4
elif flag == AutotunerFlags.EXHAUSTIVE:
ntiles = len(tile_sizes)
ntiles = min(ntiles, len(tile_sizes))
tile_sizes = tile_sizes[:ntiles]
params.register_extra_parameter('vectorization', vectorization) params.register_extra_parameter('vectorization', vectorization)
params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates) params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates)
......
...@@ -9,7 +9,7 @@ from hysop.fields.continuous_field import Field ...@@ -9,7 +9,7 @@ from hysop.fields.continuous_field import Field
class Noop(ComputationalGraphOperator): class Noop(ComputationalGraphOperator):
"""An operator that does nothing and implements apply as noop.""" """An operator that does nothing and implements apply as noop."""
def apply(self): def apply(self, **kwds):
"""This is a noop.""" """This is a noop."""
pass pass
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment