Skip to content
Snippets Groups Projects
Commit b4db021e authored by Jean-Baptiste Keck
Browse files

fix misc and permutation regression

parent 2eb2e61d
No related branches found
No related tags found
1 merge request: !16 "MPI operators"
Pipeline #29408 failed
import math
import itertools as it
from hysop.tools.numpywrappers import npw
from hysop.tools.types import check_instance
from hysop.tools.misc import upper_pow2, previous_pow2, upper_pow2_or_3
......@@ -12,13 +15,18 @@ from hysop.backend.device.kernel_autotuner import KernelGenerationError
class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
"""Autotunable interface for transpose kernel code generators."""
def _max_tile_size(self, shape, dtype, tile_indexes):
def _max_tile_size(self, shape, dtype, tile_indexes, is_inplace):
"""Compute max tile size that will fit in device cache."""
nbytes = dtype.itemsize
max_cache_elems = int(self.usable_cache_bytes_per_wg / nbytes)
factor = 2.0 if is_inplace else 1.0
max_cache_elems = int(self.usable_cache_bytes_per_wg / (factor*nbytes))
if len(tile_indexes)==2:
max_ts_cache = int(npw.sqrt(max_cache_elems))/2
x = int(npw.sqrt(max_cache_elems))
#while x*(x+1) > max_cache_elems:
#x-=1
# tile offsetting will just trigger the usual cache exception
max_ts_cache = x
else:
# no cache is used
max_ts_cache = npw.inf
......@@ -27,7 +35,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
max_ts_shape = max(tile_shape)
max_tile_size = min(max_ts_cache, max_ts_shape)
return upper_pow2(max_tile_size)
return max_tile_size
def autotune(self, is_inplace,
input_buffer, output_buffer,
......@@ -132,8 +140,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
tile_indices = extra_kwds['tile_indices']
dtype = extra_kwds['dtype']
shape = extra_kwds['shape']
is_inplace = extra_kwds['is_inplace']
last_axe_permuted = extra_kwds['last_axe_permuted']
max_tile_size = self._max_tile_size(shape, dtype, tile_indices)
flag = self.autotuner_config.autotuner_flag
vectorization = (1,)
......@@ -142,21 +150,13 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
use_diagonal_coordinates += (True,)
tile_padding = (0,1,)
tile_sizes = (max_tile_size,)
tile_size = max_tile_size
while tile_size>1:
tile_size = previous_pow2(tile_size)
tile_sizes += (tile_size,)
if flag == AutotunerFlags.ESTIMATE:
ntiles = 1
elif flag == AutotunerFlags.MEASURE:
ntiles = 2
elif flag == AutotunerFlags.PATIENT:
ntiles = 4
elif flag == AutotunerFlags.EXHAUSTIVE:
ntiles = len(tile_sizes)
ntiles = min(ntiles, len(tile_sizes))
tile_sizes = tile_sizes[:ntiles]
max_tile_size = self._max_tile_size(shape, dtype, tile_indices, is_inplace)
imax = int(math.log(max_tile_size, 2))
jmax = int(math.log(max_tile_size, 3)) if flag in (AutotunerFlags.EXHAUSTIVE,) else 0
tile_sizes = tuple( int((2**i)*(3**j))
for (i,j) in it.product(range(0,imax+1), range(0,jmax+1)))
tile_sizes = (max_tile_size,) + tuple(sorted(tile_sizes, reverse=True))
tile_sizes = tuple(filter(lambda x: (x>=8) and (x<=max_tile_size), tile_sizes))
params.register_extra_parameter('vectorization', vectorization)
params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates)
......
......@@ -9,7 +9,7 @@ from hysop.fields.continuous_field import Field
class Noop(ComputationalGraphOperator):
"""An operator that does nothing and implements apply as noop."""
def apply(self):
def apply(self, **kwds):
"""This is a noop."""
pass
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment