From a6c9db115e97ffaf59eb48389be342fb1f14c699 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Keck <jean-baptiste.keck@imag.fr>
Date: Sun, 1 Oct 2017 13:27:58 +0200
Subject: [PATCH] custom hashes for autotunable kernels

---
 hysop/__init__.py                                  |  2 +-
 hysop/backend/device/autotunable_kernel.py         | 48 ++++++++++++++++++-
 hysop/backend/device/kernel_autotuner.py           | 11 +++--
 .../autotunable_kernels/advection_dir.py           | 10 ++--
 .../opencl/autotunable_kernels/remesh_dir.py       | 10 ++--
 .../opencl/autotunable_kernels/transpose.py        |  4 +-
 .../opencl/opencl_autotunable_kernel.py            |  5 +-
 hysop/backend/device/opencl/opencl_env.py          | 14 +++---
 .../operator/directional/advection_dir.py          |  2 +-
 hysop/operator/base/redistribute_operator.py       |  4 +-
 10 files changed, 81 insertions(+), 29 deletions(-)

diff --git a/hysop/__init__.py b/hysop/__init__.py
index 279e86abe..d877538ec 100644
--- a/hysop/__init__.py
+++ b/hysop/__init__.py
@@ -20,7 +20,7 @@
 __VERBOSE__ = False
 __DEBUG__ = False
 __TRACE__ = False
 __TRACE_WARNINGS__ = False
-__KERNEL_DEBUG__ = True
+__KERNEL_DEBUG__ = False
 __PROFILE__ = True
 __ENABLE_LONG_TESTS__ = "OFF" is "ON"
diff --git a/hysop/backend/device/autotunable_kernel.py b/hysop/backend/device/autotunable_kernel.py
index 13403e47d..12916e206 100644
--- a/hysop/backend/device/autotunable_kernel.py
+++ b/hysop/backend/device/autotunable_kernel.py
@@ -21,9 +21,55 @@ class AutotunableKernel(object):
         self.dump_src = first_not_None(dump_src, autotuner_config.debug)
         self.symbolic_mode = first_not_None(symbolic_mode, autotuner_config.debug)
+
+    def custom_hash(self, *args, **kwds):
+        assert args or kwds, 'no arguments to be hashed.'
+        def _hash_arg(a):
+            if isinstance(a, list):
+                return hash(tuple(_hash_arg(x) for x in a))
+            elif isinstance(a, set):
+                return hash(tuple(_hash_arg(x) for x in a))
+            elif isinstance(a, dict):
+                return hash(tuple((_hash_arg(k), _hash_arg(v)) for (k,v) in a.items()))
+            elif isinstance(a, npw.ndarray):
+                assert a.ndim == 1
+                assert a.size < 17, 'Only parameters up to size 16 are allowed.'
+                return hash(tuple(a.tolist()))
+            else:
+                return hash(a)
+        def _hash_karg(k,v):
+            if k == 'mesh_info_vars':
+                # for mesh infos we just hash the code generated constants that
+                # may alter the code branching.
+                from hysop.backend.device.codegen.base.variables import CodegenStruct
+                check_instance(v, dict, keys=str, values=CodegenStruct)
+                mesh_infos = tuple(str(v[k]) for k in sorted(v.keys()))
+                h = hash(mesh_infos)
+                return h
+            else:
+                msg='Unknown custom hash key \'{}\'.'.format(k)
+                raise KeyError(msg)
+
+        h = None
+        if args:
+            h = _hash_arg(args[0])
+            for arg in args[1:]:
+                h ^= _hash_arg(arg)
+        if kwds:
+            items = kwds.items()
+            if h is None:
+                h = _hash_karg(*items[0])
+            else:
+                h ^= _hash_karg(*items[0])
+            for it in items[1:]:
+                h ^= _hash_karg(*it)
+        return h
+
     @abstractmethod
-    def autotune(self, name, kernel_args, **extra_kwds):
+    def autotune(self, name, kernel_args,
+            force_verbose=False, force_debug=False,
+            **extra_kwds):
         """Autotune this kernel with given name and extra_kwds."""
         pass
 
diff --git a/hysop/backend/device/kernel_autotuner.py b/hysop/backend/device/kernel_autotuner.py
index 58ba6ef68..4ce75bb7d 100644
--- a/hysop/backend/device/kernel_autotuner.py
+++ b/hysop/backend/device/kernel_autotuner.py
@@ -150,14 +150,16 @@ class KernelAutotuner(object):
         src_hash = hasher.hexdigest()
 
         if (kernel_name != cached_kernel_name):
-            msg='\nCached kernel name did not match the benched one:\n {}\n {}'
-            msg=msg.format(kernel_name, cached_kernel_name)
+            msg='\nCached kernel name did not match the benched one:\n {}\n {}\n'
+            msg+='\nThis might be due to a faulty implementation of {}.hash_extra_kwds().'
+            msg=msg.format(kernel_name, cached_kernel_name, type(tkernel).__name__)
             warnings.warn(msg, CodeGeneratorWarning)
             return None
 
         if (src_hash != cached_src_hash):
             msg='\nCached kernel source hash did not match the benched one.\n {}\n {}'
-            msg=msg.format(src_hash, cached_src_hash)
+            msg+='\nThis might be due to a faulty implementation of {}.hash_extra_kwds().'
+            msg=msg.format(src_hash, cached_src_hash, type(tkernel).__name__)
             warnings.warn(msg, CodeGeneratorWarning)
             return None
 
@@ -235,7 +237,8 @@ class KernelAutotuner(object):
                 (cache_src_hash, cache_stats) = results[run_key]
                 if cache_src_hash != src_hash:
                     msg='\nCached parameters candidate did not match the benched one.\n {}\n {}'
-                    msg=msg.format(src_hash, cache_src_hash)
+                    msg+='\nThis might be due to a faulty implementation of {}.hash_extra_kwds().'
+                    msg=msg.format(src_hash, cache_src_hash, type(tunable_kernel).__name__)
                     warnings.warn(msg, CodeGeneratorWarning)
                     old_stats = None
                 else:
diff --git a/hysop/backend/device/opencl/autotunable_kernels/advection_dir.py b/hysop/backend/device/opencl/autotunable_kernels/advection_dir.py
index 274e3fdf8..99e43a432 100644
--- a/hysop/backend/device/opencl/autotunable_kernels/advection_dir.py
+++ b/hysop/backend/device/opencl/autotunable_kernels/advection_dir.py
@@ -13,7 +13,7 @@ class OpenClAutotunableDirectionalAdvectionKernel(OpenClAutotunableKernel):
     """Autotunable interface for directional advection kernel code generators."""
 
     def autotune(self, direction, time_integrator, velocity_cfl,
-                       velocity, position, precision):
+                       velocity, position, precision, **kwds):
         """Autotune this kernel with specified configuration."""
 
         dim = velocity.dim
@@ -67,7 +67,7 @@ class OpenClAutotunableDirectionalAdvectionKernel(OpenClAutotunableKernel):
                 rk_scheme=time_integrator, kernel_args=kernel_args,
                 cache_ghosts=cache_ghosts, vboundaries=vboundaries,
                 precision=precision, ftype=ftype,
                 mesh_info_vars=mesh_info_vars, work_dim=dim,
-                work_size=position.compute_resolution)
+                work_size=position.compute_resolution, **kwds)
 
     def compute_args_mapping(self, extra_kwds, extra_parameters):
@@ -161,7 +161,7 @@ class OpenClAutotunableDirectionalAdvectionKernel(OpenClAutotunableKernel):
 
     def hash_extra_kwds(self, extra_kwds):
         """Hash extra_kwds dictionnary for caching purposes."""
         kwds = ('rk_scheme', 'ftype', 'work_dim',
-                'vboundaries', 'cache_ghosts')
-        return hash(tuple(extra_kwds[kwd] for kwd in kwds)) ^ \
-            hash(tuple(extra_kwds['work_size'].tolist()))
+                'vboundaries', 'cache_ghosts', 'work_size')
+        return self.custom_hash(*tuple(extra_kwds[kwd] for kwd in kwds),
+                mesh_info_vars=extra_kwds['mesh_info_vars'])
diff --git a/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py b/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py
index f0ea8a914..8e55ca03d 100644
--- a/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py
+++ b/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py
@@ -20,7 +20,7 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel):
     def autotune(self, precision, direction, scalar_cfl,
                        position, scalars_in, scalars_out, is_inplace,
                        remesh_kernel, remesh_criteria_eps,
-                       force_atomics, relax_min_particles):
+                       force_atomics, relax_min_particles, **kwds):
         """Autotune this kernel with specified configuration."""
         check_instance(scalars_in, tuple, values=CartesianDiscreteFieldView)
         check_instance(scalars_out, tuple, values=CartesianDiscreteFieldView)
@@ -161,7 +161,7 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel):
                 remesh_kernel=remesh_kernel, remesh_criteria_eps=remesh_criteria_eps,
                 force_atomics=force_atomics, min_nparticles=min_nparticles,
                 ftype=ftype, scalar_cfl=scalar_cfl, kernel_args=kernel_args,
                 mesh_info_vars=mesh_info_vars,
-                work_dim=work_dim, work_size=work_size, min_wg_size=min_wg_size)
+                work_dim=work_dim, work_size=work_size, min_wg_size=min_wg_size, **kwds)
 
     def compute_args_mapping(self, extra_kwds, extra_parameters):
@@ -339,7 +339,7 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel):
 
     def hash_extra_kwds(self, extra_kwds):
         """Hash extra_kwds dictionnary for caching purposes."""
         kwds = ('remesh_criteria_eps', 'nscalars', 'ftype',
-                'is_inplace', 'remesh_kernel')
-        return hash(tuple(extra_kwds[kwd] for kwd in kwds)) ^ \
-            hash(tuple(extra_kwds['work_size'].tolist()))
+                'is_inplace', 'remesh_kernel', 'work_size')
+        return self.custom_hash(*tuple(extra_kwds[kwd] for kwd in kwds),
+                mesh_info_vars=extra_kwds['mesh_info_vars'])
diff --git a/hysop/backend/device/opencl/autotunable_kernels/transpose.py b/hysop/backend/device/opencl/autotunable_kernels/transpose.py
index 14dea8a47..78d282253 100644
--- a/hysop/backend/device/opencl/autotunable_kernels/transpose.py
+++ b/hysop/backend/device/opencl/autotunable_kernels/transpose.py
@@ -31,7 +31,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
 
     def autotune(self, is_inplace,
                        input_field, output_field,
-                       axes, name=None):
+                       axes, name=None, **kwds):
         """Autotune this kernel with specified axes, inputs and outputs."""
 
         check_instance(axes, tuple, values=int)
@@ -89,7 +89,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
                 tile_indices=tile_indices,
                 work_dim=work_dim,
                 work_size=work_shape,
-                last_axe_permuted=last_axe_permuted)
+                last_axe_permuted=last_axe_permuted, **kwds)
 
     def compute_parameters(self, extra_kwds):
         """Register extra parameters to optimize."""
diff --git a/hysop/backend/device/opencl/opencl_autotunable_kernel.py b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
index 56d1974c3..a261662ca 100644
--- a/hysop/backend/device/opencl/opencl_autotunable_kernel.py
+++ b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
@@ -28,11 +28,12 @@ class OpenClAutotunableKernel(AutotunableKernel):
         self.cl_env = cl_env
         self.usable_cache_bytes_per_wg = clCharacterize.usable_local_mem_size(cl_env.device)
 
-    def autotune(self, name, **extra_kwds):
+    def autotune(self, name, force_verbose=False, force_debug=False, **extra_kwds):
         from hysop.backend.device.opencl.opencl_kernel_autotuner import OpenClKernelAutotuner
         autotuner = OpenClKernelAutotuner(name=name, tunable_kernel=self)
 
-        best_candidate_results = autotuner.autotune(extra_kwds=extra_kwds)
+        best_candidate_results = autotuner.autotune(extra_kwds=extra_kwds,
+                force_verbose=force_verbose, force_debug=force_debug)
         check_instance(best_candidate_results, dict)
 
         return self.format_best_candidate(name=name, extra_kwds=extra_kwds,
diff --git a/hysop/backend/device/opencl/opencl_env.py b/hysop/backend/device/opencl/opencl_env.py
index d9324eb3e..bd0921b08 100644
--- a/hysop/backend/device/opencl/opencl_env.py
+++ b/hysop/backend/device/opencl/opencl_env.py
@@ -587,6 +587,14 @@ Dumped OpenCL Kernel '{}'
             dump_folder=IO.default_path()+'/'+OPENCL_KERNEL_DUMP_FOLDER
             if not os.path.exists(dump_folder):
                 os.makedirs(dump_folder)
+
+        if DEBUG:
+            # dump kernel source while in debug mode
+            dump_file=dump_folder+'/'+'{}_dump.cl'.format(kernel_name)
+            print('Dumping kernel src at \'{}\'.'.format(dump_file))
+            with open(dump_file, 'w+') as f:
+                f.write(gpu_src)
+            #build_opts += ' '+' '.join(['-g', '-s "{}"'.format(dump_file)])
 
         # Build OpenCL program
         try:
@@ -607,12 +615,6 @@ Dumped OpenCL Kernel '{}'
                 build.get_build_info(self.device, cl.program_build_info.STATUS))
             vprint('Compiler log: ',
                 build.get_build_info(self.device, cl.program_build_info.LOG))
-            if DEBUG:
-                # dump kernel source while in debug mode
-                dump_file=dump_folder+'/'+'{}_dump.cl'.format(kernel_name)
-                print('Dumping kernel src at \'{}\'.'.format(dump_file))
-                with open(dump_file, 'w+') as f:
-                    f.write(gpu_src)
         if VERBOSE:
             print("===\n")
 
diff --git a/hysop/backend/device/opencl/operator/directional/advection_dir.py b/hysop/backend/device/opencl/operator/directional/advection_dir.py
index 854c1c784..4b9950563 100644
--- a/hysop/backend/device/opencl/operator/directional/advection_dir.py
+++ b/hysop/backend/device/opencl/operator/directional/advection_dir.py
@@ -107,7 +107,7 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper
         kwds['velocity_cfl'] = self.velocity_cfl
         kwds['time_integrator'] = self.time_integrator
 
-        (advec_kernel, args_dict) = kernel.autotune(**kwds)
+        (advec_kernel, args_dict) = kernel.autotune(force_debug=True, **kwds)
         args_dict.pop('dt')
 
         advec_launcher = advec_kernel.build_launcher(**args_dict)
diff --git a/hysop/operator/base/redistribute_operator.py b/hysop/operator/base/redistribute_operator.py
index 9bf6d91d8..4f6360bd6 100644
--- a/hysop/operator/base/redistribute_operator.py
+++ b/hysop/operator/base/redistribute_operator.py
@@ -69,12 +69,12 @@ class RedistributeOperatorBase(ComputationalGraphOperator):
 
         for field in self.input_vars:
             _, req = reqs.get_input_requirement(field)
-            req.transposition_states = None
+            req.axes = None
            req.basis = None
 
         for field in self.output_vars:
            _, req = reqs.get_output_requirement(field)
-            req.transposition_states = None
+            req.axes = None
            req.basis = None
 
        return reqs
--
GitLab
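
A minimal sketch of the hashing strategy that the new AutotunableKernel.custom_hash (diff above) relies on for hash_extra_kwds: each configuration value is reduced to a hashable form, and the per-value hashes are XOR-combined into a single autotuner cache key. The snippet uses plain numpy instead of hysop's npw wrapper; the helper name combined_hash and the example values are illustrative assumptions, not hysop API.

import numpy as np

def combined_hash(*args):
    """XOR-combine the hashes of heterogeneous kernel parameters (sketch)."""
    def _hash_one(a):
        if isinstance(a, (list, set)):
            # containers are flattened to tuples so they become hashable
            return hash(tuple(_hash_one(x) for x in a))
        elif isinstance(a, dict):
            return hash(tuple((_hash_one(k), _hash_one(v)) for (k, v) in a.items()))
        elif isinstance(a, np.ndarray):
            # only small 1D parameter arrays (e.g. a work size) are expected
            assert a.ndim == 1 and a.size < 17
            return hash(tuple(a.tolist()))
        else:
            return hash(a)
    assert args, 'no arguments to be hashed.'
    h = _hash_one(args[0])
    for a in args[1:]:
        h ^= _hash_one(a)   # XOR keeps each argument's contribution independent
    return h

# hypothetical usage: derive an autotuner cache key from a kernel configuration
key = combined_hash('euler', 'float', 3, np.asarray([64, 64, 64]))

The keyword path of custom_hash serves the same purpose for mesh_info_vars: it hashes the stringified code-generation constants, so two configurations that would generate identical kernel code (same branching constants, same work size) map to the same cache entry, while anything that changes the generated source invalidates it.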