diff --git a/examples/example_utils.py b/examples/example_utils.py
index 7812f433e6ef8ec91d06a94a2efcc2e4d409ffe4..91d1a45ee17cd6c9fa54ae04cfc2dd41657f84e1 100644
--- a/examples/example_utils.py
+++ b/examples/example_utils.py
@@ -183,8 +183,12 @@ class HysopArgParser(argparse.ArgumentParser):
         self._rmfiles(dump_dir, 'h5')
         self._rmfiles(dump_dir, 'xmf')
         self._rmfiles(dump_dir, 'out')
+        self._rmfiles(dump_dir, 'cl')
         self._rmfiles(dump_dir, 'txt')
         self._rmfiles(dump_dir, 'png')
+        self._rmfiles(dump_dir, 'sim')
+        self._rmfiles(dump_dir, 'xml')
+        self._rmfiles(dump_dir, 'json')
         self._rmfiles(dump_dir, 'pdf')
         self._rmfiles(dump_dir, 'npz')
         self._rmfiles(dump_dir, 'pklz')
@@ -261,8 +265,10 @@ class HysopArgParser(argparse.ArgumentParser):
         return msg

     @staticmethod
-    def _mkdir(path):
-        path = os.path.dirname(os.path.realpath(path))
+    def _mkdir(path, dirname=True):
+        path = os.path.realpath(path)
+        if dirname:
+            path = os.path.dirname(path)
         try:
             os.makedirs(path)
         except OSError as e:
@@ -758,6 +764,13 @@ class HysopArgParser(argparse.ArgumentParser):

     def _add_autotuner_args(self):
         autotuner = self.add_argument_group('Kernel autotuner parameters')
+        autotuner.add_argument('--autotuner-dump-dir', type=str, default=None,
+                dest='autotuner_dump_dir',
+                help='Configure kernel autotuner dump directory.')
+        autotuner.add_argument('--autotuner-cache-override',
+                action='store_true',
+                dest='autotuner_cache_override',
+                help='Override kernel autotuner cached data. Best kernel candidates will be stored in a temporary directory instead of the persistent system-wide cache directory.')
         autotuner.add_argument('--autotuner-flag', type=str, default=None,
                 dest='autotuner_flag',
                 help=('Configure kernel autotuner rigor flag'
@@ -774,21 +787,22 @@ class HysopArgParser(argparse.ArgumentParser):
         autotuner.add_argument('--autotuner-verbose', type=int, default=None,
                 dest='autotuner_verbose',
                 help='Configure kernel autotuner kernel verbosity (0 to 5).')
-        autotuner.add_argument('--autotuner-debug', type=bool, default=None,
+        autotuner.add_argument('--autotuner-debug',
+                action='store_true',
                 dest='autotuner_debug',
                 help='Configure kernel autotuner kernel debug flag.')
-        autotuner.add_argument('--autotuner-dump-kernels', type=bool, default=None,
+        autotuner.add_argument('--autotuner-dump-kernels',
+                action='store_true',
                 dest='autotuner_dump_kernels',
                 help='Configure kernel autotuner kernel source dumping.')
-        autotuner.add_argument('--autotuner-dump-isolation', type=bool, default=None,
+        autotuner.add_argument('--autotuner-dump-isolation',
+                action='store_true',
                 dest='autotuner_dump_isolation',
-                help='Configure kernel autotuner kernel isolation file generation.')
-        autotuner.add_argument('--autotuner-cache-override', type=bool, default=None,
-                dest='autotuner_cache_override',
-                help='Override kernel autotuner cached data.')
-        autotuner.add_argument('--autotuner-dump-dir', type=str, default=None,
-                dest='autotuner_dump_dir',
-                help='Configure kernel autotuner dump directory.')
+                help='Configure kernel autotuner to generate oclgrind kernel isolation files for each optimal kernel.')
+        autotuner.add_argument('--autotuner-dump-hash-logs',
+                action='store_true',
+                dest='autotuner_dump_hash_logs',
+                help='Configure kernel autotuner to generate kernel extra keywords hash logs for debugging kernel caching.')
         autotuner.add_argument('--autotuner-plot-statistics', action='store_true',
                 dest='autotuner_plot_statistics',
@@ -796,16 +810,22 @@ class HysopArgParser(argparse.ArgumentParser):
         autotuner.add_argument('--autotuner-bench-kernels', action='store_true',
                 dest='autotuner_bench_kernels',
-                help='Bench mode for kernels, enables exhaustive search without max candidates and disable prune threshold.')
+                help='Enable standard bench mode for kernels: search without max candidates, at maximum verbosity, with cache override and nruns=8. The prune threshold and the autotuner flag are left unchanged.')
+        autotuner.add_argument('--autotuner-postprocess-kernels', type=str, default=None,
+                dest='autotuner_postprocess_kernels',
+                help=('Run a custom command after each final generated kernel: '
+                     +'command FILE_BASENAME FROM_CACHE AUTOTUNER_DUMP_DIR AUTOTUNER_NAME KERNEL_NAME MEAN_EXECUTION_TIME_NS MIN_EXECUTION_TIME_NS MAX_EXECUTION_TIME_NS KERNEL_SOURCE_FILE KERNEL_ISOLATION_FILE KERNEL_HASH_LOGS_FILE VENDOR_NAME DEVICE_NAME WORK_SIZE WORK_LOAD GLOBAL_WORK_SIZE LOCAL_WORK_SIZE EXTRA_PARAMETERS EXTRA_KWDS_HASH SRC_HASH. '
+                     +'See hysop/tools/postprocess_kernel.sh for an example of post-processing script.'))
         return autotuner

     def _check_autotuner_args(self, args):
-        self._check_default(args, ('autotuner_flag', 'autotuner_dump_dir'),
+        self._check_default(args, ('autotuner_flag', 'autotuner_dump_dir', 'autotuner_postprocess_kernels'),
                 str, allow_none=True)
         self._check_default(args, ('autotuner_nruns', 'autotuner_max_candidates',
                                    'autotuner_verbose'),
                 int, allow_none=True)
         self._check_default(args, ('autotuner_dump_kernels', 'autotuner_dump_isolation',
+                                   'autotuner_dump_hash_logs',
                                    'autotuner_bench_kernels',
                                    'autotuner_plot_statistics'),
                 bool, allow_none=True)
@@ -824,8 +844,6 @@ class HysopArgParser(argparse.ArgumentParser):
             args.autotuner_nruns = 8
             args.autotuner_max_candidates = np.iinfo(np.int64).max
             args.autotuner_verbose = np.iinfo(np.int64).max
-            args.autotuner_flag = self._convert_autotuner_flag('autotuner_flag', 'exhaustive')
-            args.autotuner_prune_threshold = 2.0
             args.autotuner_cache_override = True
             args.autotuner_plot_statistics = True
@@ -843,10 +861,12 @@ class HysopArgParser(argparse.ArgumentParser):
                 verbose=args.autotuner_verbose,
                 debug=args.autotuner_debug,
                 dump_kernels=args.autotuner_dump_kernels,
+                dump_hash_logs=args.autotuner_dump_hash_logs,
                 generate_isolation_file=args.autotuner_dump_isolation,
                 plot_statistics=args.autotuner_plot_statistics,
                 override_cache=override_cache,
-                dump_folder=args.autotuner_dump_dir)
+                dump_folder=args.autotuner_dump_dir,
+                postprocess_kernels=args.autotuner_postprocess_kernels)
         return autotuner_config

     def _add_file_io_args(self, default_dump_dir, generate_io_params):
@@ -1286,6 +1306,8 @@ class HysopArgParser(argparse.ArgumentParser):
             msg='{} directory \'{}\' cannot be stored on a network file system.'
             msg=msg.format(argname, argvalue)
             self.error(msg)
+
+        self._mkdir(argvalue, dirname=False)
         setattr(args, argname, os.path.realpath(argvalue))

     def _setup_hysop_env(self, args):
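Note on the argument changes above: replacing type=bool with action='store_true' is a real fix, not a style change. argparse applies type=bool to the raw argument string, and bool() of any non-empty string is True, so '--autotuner-debug False' would still have enabled the flag. A minimal standalone sketch of the pitfall (option names here are illustrative):

    import argparse

    p = argparse.ArgumentParser()
    # Buggy variant: bool('False') is True, so any provided value enables the flag.
    p.add_argument('--buggy-debug', type=bool, default=None)
    # Fixed variant: the flag stays False unless present on the command line.
    p.add_argument('--debug', action='store_true')

    args = p.parse_args(['--buggy-debug', 'False'])
    assert args.buggy_debug is True   # surprising, hence the change above
    args = p.parse_args([])
    assert args.debug is False
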
diff --git a/hysop/backend/device/autotunable_kernel.py b/hysop/backend/device/autotunable_kernel.py
index 20ef8c74b02abbe5df3a94735cb9cc6106ba1a95..b70bcf0dd35dcdaba49a087a8c3b22b110d0f874 100644
--- a/hysop/backend/device/autotunable_kernel.py
+++ b/hysop/backend/device/autotunable_kernel.py
@@ -23,7 +23,7 @@ class AutotunableKernel(object):
         self.symbolic_mode = first_not_None(symbolic_mode, autotuner_config.debug)

     def custom_hash(self, *args, **kwds):
-        HASH_DEBUG=False
+        HASH_DEBUG=self.autotuner_config.dump_hash_logs
         assert args or kwds, 'no arguments to be hashed.'

         def _hash_arg(a):
diff --git a/hysop/backend/device/codegen/kernels/transpose.py b/hysop/backend/device/codegen/kernels/transpose.py
index 4d4701de045b93cf0c6f7e8424394aae6606c5fc..15ab1b9640d018835cbfa633d75456255c36ecf0 100644
--- a/hysop/backend/device/codegen/kernels/transpose.py
+++ b/hysop/backend/device/codegen/kernels/transpose.py
@@ -25,8 +25,8 @@ class TransposeKernelGenerator(KernelCodeGenerator):
                     use_diagonal_coordinates):
         pdim = len(axes)
         axes = [ str(j) if i!=j else 'X' for i,j in enumerate(axes) ]
-        return '{}transpose_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
-                'diag_' if use_diagonal_coordinates else '',
+        return 'transpose{}_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
+                '_dc' if use_diagonal_coordinates else '_nc',
                 'inplace' if is_inplace else 'out_of_place',
                 ctype.replace(' ','_'), pdim,
                 vectorization, tile_size, tile_padding, '_'.join(axes))
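The transpose rename above replaces the optional 'diag_' prefix with a mandatory '_nc'/'_dc' (natural/diagonal coordinates) suffix, so every kernel name now has the same shape. A quick standalone sketch that mirrors the new format string (sample parameter values only):

    def transpose_kernel_name(axes, ctype, vectorization, tile_size, tile_padding,
                              is_inplace, use_diagonal_coordinates):
        # Mirrors the renamed format string in TransposeKernelGenerator.
        pdim = len(axes)
        axes = [str(j) if i != j else 'X' for i, j in enumerate(axes)]
        return 'transpose{}_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
                '_dc' if use_diagonal_coordinates else '_nc',
                'inplace' if is_inplace else 'out_of_place',
                ctype.replace(' ', '_'), pdim,
                vectorization, tile_size, tile_padding, '_'.join(axes))

    # A 3D out-of-place float transpose swapping the two innermost axes:
    print(transpose_kernel_name((0, 2, 1), 'float', 4, 16, 1, False, False))
    # -> transpose_nc_out_of_place_float_3d__N4__T16__P1__X_2_1
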
diff --git a/hysop/backend/device/kernel_autotuner.py b/hysop/backend/device/kernel_autotuner.py
index 1fce4b597188274c40bcdb70318cc6b58c4b559a..316579c87d17c162af3926164347c2cde25968fb 100644
--- a/hysop/backend/device/kernel_autotuner.py
+++ b/hysop/backend/device/kernel_autotuner.py
@@ -28,12 +28,15 @@ class KernelAutotuner(object):
     @staticmethod
     def _hash_func():
         return hashlib.new('sha256')
-
-    @staticmethod
-    def cache_dir():
-        cache_dir = IO.cache_path() + '/kernel_autotuner'
-        return cache_dir
-
+
+    def use_tmp_cache(self):
+        self._cache_dir = IO.get_tmp_dir('kernel_autotuner')
+
+    def use_system_cache(self):
+        self._cache_dir = IO.cache_path() + '/kernel_autotuner'
+
+    def cache_dir(self):
+        assert (self._cache_dir is not None)
+        return self._cache_dir

     def cache_file(self):
         cache_file = '{}/{}.pklz'.format(self.cache_dir(), self.name.replace(' ','_'))
         return cache_file
@@ -80,12 +83,25 @@ class KernelAutotuner(object):
         self.indent = lambda i: ' '*i
         self.verbose = self.autotuner_config.verbose

-        self.prg_idx = 4
-        self.knl_idx = 5
-        self.stats_idx = 6
-        self.src_idx = 7
-        self.src_hash_idx = 9
-        self.logs_idx = 10
+        self.result_keys = (
+                'extra_parameters',     #00
+                'work_size',            #01
+                'work_load',            #02
+                'global_work_size',     #03
+                'local_work_size',      #04
+                'program',              #05
+                'kernel',               #06
+                'kernel_statistics',    #07
+                'kernel_src',           #08
+                'kernel_name',          #09
+                'src_hash',             #10
+                'extra_kwds_hash',      #11
+                'extra_kwds_hash_logs'  #12
+        )
+        for (i, pname) in enumerate(self.result_keys):
+            setattr(self, '{}_idx'.format(pname), i)
+
+        self._cache_dir = None

     def autotune(self, extra_kwds, first_working=False,
@@ -105,34 +121,42 @@ class KernelAutotuner(object):
         autotuner_config = self.autotuner_config

         extra_kwds_hash, extra_kwds_hash_logs = tkernel.hash_extra_kwds(extra_kwds)
+        hasher = self._hash_func()
+        hasher.update(str(extra_kwds_hash))
+        extra_kwds_hash = hasher.hexdigest()
+
         check_instance(extra_kwds_hash, str)
         check_instance(extra_kwds_hash_logs, str)
+        file_basename = '{}_{}'.format(self.name, extra_kwds_hash[:4])

         self._print_header(extra_kwds)

-        results = self._reload_cache(extra_kwds_hash)
-
         if autotuner_config.override_cache:
             if self.verbose:
-                print self.indent(1)+'>Ignoring cached results, benching all kernels.'
-            best_candidate = None
-        elif first_working:
+                print self.indent(1)+'>Using temporary cache folder, benching all new kernels.'
+            self.use_tmp_cache()
+        else:
+            self.use_system_cache()
+        results = self._reload_cache(extra_kwds_hash)
+
+        if first_working:
             best_candidate = None
         else:
             best_candidate = self._load_results_from_cache(tkernel, results, extra_kwds,
-                    force_verbose, force_debug, extra_kwds_hash_logs)
+                    force_verbose, force_debug, extra_kwds_hash, extra_kwds_hash_logs, file_basename)

         if (best_candidate is None):
             best_candidate = self._autotune_kernels(tkernel, results, extra_kwds,
-                    force_verbose, force_debug, first_working, extra_kwds_hash_logs)
+                    force_verbose, force_debug, first_working,
+                    extra_kwds_hash, extra_kwds_hash_logs, file_basename)
+            from_cache = False
+        else:
+            from_cache = True

-        result_keys = ('extra_parameters', 'work_load', 'global_work_size', 'local_work_size',
-                'program', 'kernel', 'kernel_statistics', 'kernel_src', 'kernel_name',
-                'src_hash', 'hash_logs')
-        assert len(result_keys) == len(best_candidate)
-        return dict(zip(result_keys, best_candidate))
+        assert len(self.result_keys) == len(best_candidate)
+        return dict(zip(self.result_keys, best_candidate)), file_basename, from_cache

     def _load_results_from_cache(self, tkernel, results, extra_kwds,
-            force_verbose, force_debug, extra_kwds_hash_logs):
+            force_verbose, force_debug, extra_kwds_hash, extra_kwds_hash_logs, file_basename):
         if (self.FULL_RESULTS_KEY not in results):
             if self.verbose:
                 print (' >No best candidate was cached for this configuration, '
@@ -147,10 +171,19 @@ class KernelAutotuner(object):
         # pyopencl kernel and program objects.
         best_candidate = copy.deepcopy(results[self.FULL_RESULTS_KEY])

-        (extra_parameters, work_load, global_work_size, local_work_size,
-            prg, kernel, statistics, cached_kernel_src,
-            cached_kernel_name, cached_src_hash,
-            cached_kernel_hash_logs) = best_candidate
+        (extra_parameters,
+            work_size, work_load, global_work_size, local_work_size,
+            prg, kernel, statistics, cached_kernel_src,
+            cached_kernel_name, cached_src_hash,
+            cached_kernel_hash, cached_kernel_hash_logs) = best_candidate
+
+        if (cached_kernel_hash != extra_kwds_hash):
+            msg='\nCached kernel extra_kwds hash did not match the benched one:\n {}\n {}\n'
+            msg+='\nThis might be due to an upgrade of the generated code or '
+            msg+='a faulty implementation of {}.hash_extra_kwds().'
+            msg=msg.format(cached_kernel_hash, extra_kwds_hash, type(tkernel).__name__)
+            warnings.warn(msg, CodeGeneratorWarning)
+            return None

         assert prg is None
         assert kernel is None
@@ -213,15 +246,16 @@ class KernelAutotuner(object):
                 global_work_size=global_work_size,
                 local_work_size=local_work_size)

-        best_candidate[self.prg_idx] = prg
-        best_candidate[self.knl_idx] = kernel
-        best_candidate[self.src_idx] = kernel_src
-        best_candidate[self.logs_idx] = extra_kwds_hash_logs
+        best_candidate[self.program_idx] = prg
+        best_candidate[self.kernel_idx] = kernel
+        best_candidate[self.kernel_src_idx] = kernel_src
+        best_candidate[self.extra_kwds_hash_logs_idx] = extra_kwds_hash_logs
         return tuple(best_candidate)

     def _autotune_kernels(self, tkernel, results, extra_kwds,
-            force_verbose, force_debug, first_working, extra_kwds_hash_logs):
+            force_verbose, force_debug, first_working,
+            extra_kwds_hash, extra_kwds_hash_logs, file_basename):
         autotuner_config = self.autotuner_config
         if first_working:
             nruns = 1
@@ -238,6 +272,7 @@ class KernelAutotuner(object):
         ks = AutotunedKernelStatistics(tkernel, extra_kwds)
         ks.max_candidates = max_candidates
         ks.nruns = nruns
+        ks.file_basename = file_basename

         with Timer() as timer:
             params = tkernel.compute_parameters(extra_kwds=extra_kwds)
@@ -265,6 +300,7 @@ class KernelAutotuner(object):
                         preferred_work_group_size_multiple=preferred_work_group_size_multiple,
                         extra_parameters=extra_parameters,
                         extra_kwds=extra_kwds)
+                work_size = work_bounds.work_size

                 self._print_parameters(extra_parameters, work_bounds)
@@ -306,7 +342,7 @@ class KernelAutotuner(object):
                     hasher.update(kernel_src)
                     src_hash = hasher.hexdigest()

-                    if (not autotuner_config.override_cache) and (run_key in results):
+                    if (run_key in results):
                         (cache_src_hash, cache_stats) = results[run_key]
                         if (cache_src_hash != src_hash):
                             msg='\nCached parameters candidate did not match the '
@@ -354,19 +390,21 @@ class KernelAutotuner(object):
                             local_best = False

                         candidate = (extra_parameters,
+                                tuple(work_size),
                                 tuple(work_load),
                                 tuple(global_work_size),
                                 tuple(local_work_size),
                                 prg, kernel, statistics, kernel_src, kernel_name,
-                                src_hash, extra_kwds_hash_logs)
+                                src_hash, extra_kwds_hash, extra_kwds_hash_logs)

                         results[run_key] = (src_hash, statistics)
                         bench_results[run_key] = candidate
                         pks.push_run_statistics(run_key,
-                                work_load=work_load, local_work_size=local_work_size,
-                                global_work_size=global_work_size, statistics=statistics,
-                                pruned=pruned, local_best=local_best, error=None)
+                                work_size=work_size, work_load=work_load,
+                                local_work_size=local_work_size, global_work_size=global_work_size,
+                                statistics=statistics, pruned=pruned,
+                                local_best=local_best, error=None)
                 except KernelGenerationError as e:
                     if __KERNEL_DEBUG__:
                         sys.stderr.write(str(e)+'\n')
@@ -374,10 +412,10 @@ class KernelAutotuner(object):
                     statistics = None
                     from_cache=False
                     pks.push_run_statistics(run_key,
-                            work_load=work_load, local_work_size=local_work_size,
-                            global_work_size=global_work_size,
-                            statistics=None,
-                            pruned=None, local_best=None, error=e)
+                            work_size=work_size, work_load=work_load,
+                            local_work_size=local_work_size, global_work_size=global_work_size,
+                            statistics=None, pruned=None,
+                            local_best=None, error=e)
                 total_count += 1
                 abort = (max_candidates is not None) and \
                         ((pruned_count + kept_count) >= max_candidates)
@@ -412,7 +450,7 @@ class KernelAutotuner(object):
             keep_only = max(previous_pow2(kept_count),1)
            self._print_first_step_results(total_count, kept_count, pruned_count,
                     failed_count, keep_only)
-            candidates = sorted(bench_results.items(), key=lambda x: x[1][self.stats_idx])
+            candidates = sorted(bench_results.items(), key=lambda x: x[1][self.kernel_statistics_idx])
             candidates = candidates[:keep_only]
             while(len(candidates)>1):
                 step_count += 1
@@ -420,16 +458,16 @@ class KernelAutotuner(object):
                 self._print_step(step_count, '{} BEST'.format(len(candidates)), nruns)

                 for (run_key, run_params) in candidates:
-                    (extra_params, work_load, global_work_size, local_work_size,
-                        _, kernel, old_stats, _, _, _, _) = run_params
+                    (extra_params, work_size, work_load, global_work_size, local_work_size,
+                        _, kernel, old_stats, _, _, _, _, _) = run_params
                     self.bench_one_from_binary(kernel=kernel,
                             target_nruns=nruns, old_stats=old_stats,
                             best_stats=best_stats,
                             global_work_size=global_work_size,
                             local_work_size=local_work_size)

-                candidates = sorted(candidates, key=lambda x: x[1][self.stats_idx])
-                self._print_step_results(candidates, self.stats_idx)
+                candidates = sorted(candidates, key=lambda x: x[1][self.kernel_statistics_idx])
+                self._print_step_results(candidates, self.kernel_statistics_idx)
                 candidates = candidates[:max(previous_pow2(len(candidates)),1)]
                 ks.push_step(step_count, candidates)
             best_candidate = candidates[0][1]
@@ -437,11 +475,12 @@ class KernelAutotuner(object):

         ks.exec_time = timer.interval
         ks.best_candidate = best_candidate
-        ks.kernel_name = kernel_name
+        ks.kernel_name = self.name
         ks.kept_count = kept_count
         ks.pruned_count = pruned_count
         ks.failed_count = failed_count
         ks.total_count = total_count
+        ks.extra_kwds_hash = best_candidate[self.extra_kwds_hash_idx]
         if autotuner_config.plot_statistics and not first_working:
             ks.plot()
@@ -452,10 +491,10 @@ class KernelAutotuner(object):

         # Export best candidate results
         if not self.STORE_FULL_KERNEL_SOURCES:
-            best_candidate[self.src_idx] = None
-            best_candidate[self.logs_idx] = None
-            best_candidate[self.prg_idx] = None
-            best_candidate[self.knl_idx] = None
+            best_candidate[self.kernel_src_idx] = None
+            best_candidate[self.extra_kwds_hash_logs_idx] = None
+            best_candidate[self.program_idx] = None
+            best_candidate[self.kernel_idx] = None
         results[self.FULL_RESULTS_KEY] = best_candidate
         self._dump_cache()
@@ -463,8 +502,8 @@ class KernelAutotuner(object):

     def _build_final_kernel(self, tkernel, best_candidate, extra_kwds):
-        (extra_parameters, work_load, global_work_size, local_work_size,
-            _, _, _, _, _, _, _) = best_candidate
+        (extra_parameters, work_size, work_load, global_work_size, local_work_size,
+            _, _, _, _, _, _, _, _) = best_candidate

         global_work_size = npw.asintegerarray(global_work_size)
         local_work_size = npw.asintegerarray(local_work_size)
@@ -490,10 +529,10 @@ class KernelAutotuner(object):
                 global_work_size=global_work_size,
                 local_work_size=local_work_size)

-        best_candidate[self.prg_idx] = prg
-        best_candidate[self.knl_idx] = kernel
-        best_candidate[self.src_idx] = kernel_src
-        best_candidate[self.src_hash_idx] = src_hash
+        best_candidate[self.program_idx] = prg
+        best_candidate[self.kernel_idx] = kernel
+        best_candidate[self.kernel_src_idx] = kernel_src
+        best_candidate[self.src_hash_idx] = src_hash
         return best_candidate

     def _compute_args_list(self, args_mapping, **kernel_args):
@@ -730,17 +769,17 @@ class KernelAutotuner(object):
             print config

-    def _print_step_results(self, sorted_candidates, stats_idx):
+    def _print_step_results(self, sorted_candidates, kernel_statistics_idx):
         if self.verbose==2:
             best = sorted_candidates[0][1]
             worst = sorted_candidates[-1][1]
-            print self.indent(2)+'worst candidate: {}'.format(worst[stats_idx])
-            print self.indent(2)+'best candidate:  {}'.format(best[stats_idx])
+            print self.indent(2)+'worst candidate: {}'.format(worst[kernel_statistics_idx])
+            print self.indent(2)+'best candidate:  {}'.format(best[kernel_statistics_idx])

     def _print_footer(self, ellapsed, best_candidate):
         if self.verbose:
-            (best_extra_params, best_work_load, best_global_size, best_local_size,
-                _, _, best_stats, _, _, _, _) = best_candidate
+            (best_extra_params, best_work_size, best_work_load, best_global_size, best_local_size,
+                _, _, best_stats, _, _, _, _, _) = best_candidate
             if self.verbose>1:
                 if ellapsed is not None:
                     self._print_separator()
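The key structural change in kernel_autotuner.py is that the hard-coded candidate indices (prg_idx, knl_idx, stats_idx, ...) are replaced by a single result_keys tuple from which every '<key>_idx' attribute is derived, so inserting a new field such as work_size touches one place only. A reduced standalone sketch of the pattern, with illustrative candidate data:

    class CandidateIndexing(object):
        # One tuple is the single source of truth for the candidate layout.
        result_keys = ('extra_parameters', 'work_size', 'kernel_statistics')

        def __init__(self):
            for (i, pname) in enumerate(self.result_keys):
                setattr(self, '{}_idx'.format(pname), i)

    idx = CandidateIndexing()
    candidates = [({'unroll': 2}, (64, 64), 1.8),
                  ({'unroll': 4}, (64, 64), 1.2)]
    # Sort candidates on their statistics entry, as _autotune_kernels does.
    best = sorted(candidates, key=lambda c: c[idx.kernel_statistics_idx])[0]
    assert best[idx.extra_parameters_idx] == {'unroll': 4}
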
diff --git a/hysop/backend/device/kernel_autotuner_config.py b/hysop/backend/device/kernel_autotuner_config.py
index b3b4c7710904d8e5b93ba2e1f6ec3e141a335a2a..d6f0fc731f65620309ea3d7be500c6f4ef322015 100644
--- a/hysop/backend/device/kernel_autotuner_config.py
+++ b/hysop/backend/device/kernel_autotuner_config.py
@@ -16,17 +16,19 @@ class KernelAutotunerConfig(object):
     }

     def __init__(self,
-            dump_folder = None,
-            autotuner_flag = None,
-            prune_threshold = None,
-            max_candidates = None,
-            verbose = None,
-            debug = None,
-            dump_kernels = None,
+            dump_folder             = None,
+            autotuner_flag          = None,
+            prune_threshold         = None,
+            max_candidates          = None,
+            verbose                 = None,
+            debug                   = None,
+            dump_kernels            = None,
+            dump_hash_logs          = None,
             generate_isolation_file = None,
-            override_cache = None,
-            nruns = None,
-            plot_statistics = None):
+            override_cache          = None,
+            nruns                   = None,
+            plot_statistics         = None,
+            postprocess_kernels     = None):

         dump_folder = first_not_None(dump_folder, self.default_dump_folder())
         autotuner_flag = first_not_None(autotuner_flag, DEFAULT_AUTOTUNER_FLAG)
@@ -34,10 +36,12 @@ class KernelAutotunerConfig(object):
         max_candidates = first_not_None(max_candidates, 4)
         verbose = first_not_None(verbose, 2*__VERBOSE__)
         debug = first_not_None(debug, __KERNEL_DEBUG__)
-        dump_kernels = first_not_None(dump_kernels, __KERNEL_DEBUG__)
+        dump_kernels   = first_not_None(dump_kernels, __KERNEL_DEBUG__)
+        dump_hash_logs = first_not_None(dump_hash_logs, __KERNEL_DEBUG__)
         generate_isolation_file = first_not_None(generate_isolation_file, __KERNEL_DEBUG__)
         override_cache = first_not_None(override_cache, False)
         plot_statistics = first_not_None(plot_statistics, False)
+        postprocess_kernels = first_not_None(postprocess_kernels, False)

         if (nruns is None):
             nruns = self._default_initial_runs[autotuner_flag]
@@ -60,9 +64,11 @@ class KernelAutotunerConfig(object):
         self.nruns = nruns
         self.dump_folder = dump_folder
         self.dump_kernels = dump_kernels
+        self.dump_hash_logs = dump_hash_logs
         self.max_candidates = max_candidates
         self.generate_isolation_file = generate_isolation_file
         self.plot_statistics = plot_statistics
+        self.postprocess_kernels = postprocess_kernels

     @abstractmethod
     def default_dump_folder(self):
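As with the other dump switches, dump_hash_logs defaults to the global __KERNEL_DEBUG__ flag through a first_not_None chain. A stand-in sketch of that helper's semantics (the real helper is provided by hysop's tools module; the code below is illustrative only):

    def first_not_None(*args):
        # Return the first argument that is not None (stand-in for hysop's helper).
        for a in args:
            if a is not None:
                return a
        return None

    __KERNEL_DEBUG__ = False                               # illustrative global
    dump_hash_logs = first_not_None(None, __KERNEL_DEBUG__)
    assert dump_hash_logs is False                         # unset option -> debug default
    assert first_not_None(True, __KERNEL_DEBUG__) is True  # explicit option wins
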
diff --git a/hysop/backend/device/kernel_autotuner_statistics.py b/hysop/backend/device/kernel_autotuner_statistics.py
index 57efe67f2d6c5a73eb475081c8ae33c379e51140..f372f5565a1abbcc6d9336e717bff6311f397b3e 100644
--- a/hysop/backend/device/kernel_autotuner_statistics.py
+++ b/hysop/backend/device/kernel_autotuner_statistics.py
@@ -8,9 +8,11 @@ class AutotunedKernelStatistics(dict):
     class AutotunedParameterStatistics(dict):
         class AutotunedRunStatistics(object):
             def __init__(self,
-                    work_load, local_work_size,
-                    global_work_size, statistics,
-                    pruned, local_best, error):
+                    work_size, work_load,
+                    local_work_size, global_work_size,
+                    statistics, pruned,
+                    local_best, error):
+                self.work_size = work_size
                 self.work_load = work_load
                 self.local_work_size = local_work_size
                 self.global_work_size = global_work_size
@@ -42,6 +44,7 @@ class AutotunedKernelStatistics(dict):
         self.pruned_count = None
         self.failed_count = None
         self.total_count = None
+        self.file_basename = None
         self.steps = {}
     def push_parameters(self, extra_param_hash, **kwds):
         return self.setdefault(extra_param_hash, self.AutotunedParameterStatistics(**kwds))
@@ -49,23 +52,11 @@ class AutotunedKernelStatistics(dict):
         self.steps[step_id] = candidates

     def plot(self):
+        self.collect_exec_times()
         self.plot_histogram()

     def plot_histogram(self):
-        exec_times = self.collect_exec_times()
-
-    def collect_exec_times(self):
-        run_times = ()
-        for (extra_param_hash, parameter_statistics) in self.iteritems():
-            if not parameter_statistics.good():
-                continue
-            for (run_key, run_statistics) in parameter_statistics.iteritems():
-                if not run_statistics.good():
-                    continue
-                run_time = run_statistics.statistics.mean
-                run_times += (run_time,)
-                #run_times += run_statistics.statistics.data[:self.nruns]
-        run_times = np.asarray(run_times, dtype=np.float64)
+        run_times = self.run_times.copy()
         for unit in ('ns', 'us', 'ms', 's'):
             if run_times.min() < 1e2:
                 break
@@ -77,8 +68,6 @@ class AutotunedKernelStatistics(dict):
         imax = int(np.ceil(np.log10(vnmax)))
         xmin = 10.0**imin
         xmax = 10.0**imax
-        #N = 10
-        #logbins = tuple(j*(10//N)*(10.0**i) for i in xrange(imin, imax) for j in xrange(1,N) ) + (xmax,)
         logbins = np.geomspace(xmin, xmax, (imax-imin+1)*10)
         fig, axe = plt.subplots()
         fig.suptitle(self.kernel_name, weight='bold')
@@ -97,9 +86,20 @@ class AutotunedKernelStatistics(dict):
         axe.axvline(x=vnmean, label=r'median: ${:.1f} {unit}$ (x{:.1f})'.format(vmean, vnmean/vnmin, unit=unit), color='darkorange')
         axe.axvline(x=vnmax, label=r'worst: ${:.1f} {unit}$ (x{:.1f})'.format(vmax, vnmax/vnmin, unit=unit), color='r')
         axe.legend(framealpha=1.0, title='Execution times')
-        fig.savefig('{}/histo_{}.png'.format(
-            self.tkernel.autotuner_config.dump_folder,
-            self.kernel_name))
-        import sys
-        #sys.exit(1)
+        fig.savefig('{}/{}_histo.png'.format(
+            self.tkernel.autotuner_config.dump_folder, self.file_basename))
+
+    def collect_exec_times(self):
+        run_times = ()
+        for (extra_param_hash, parameter_statistics) in self.iteritems():
+            if not parameter_statistics.good():
+                continue
+            for (run_key, run_statistics) in parameter_statistics.iteritems():
+                if not run_statistics.good():
+                    continue
+                run_time = run_statistics.statistics.mean
+                run_times += (run_time,)
+                #run_times += run_statistics.statistics.data[:self.nruns]
+        run_times = np.asarray(run_times, dtype=np.float64)
+        self.run_times = run_times
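plot_histogram() now reads the pre-collected self.run_times array and bins it on a logarithmic axis with decade-aligned edges from np.geomspace, ten bins per decade. A self-contained sketch of the binning with synthetic timings (values are illustrative; imin is computed from the minimum the same way imax is computed from the maximum):

    import numpy as np

    # Synthetic per-candidate mean run times (nanoseconds, illustrative values).
    run_times = np.asarray([120.0, 150.0, 380.0, 2200.0], dtype=np.float64)

    vnmin, vnmax = run_times.min(), run_times.max()
    imin = int(np.floor(np.log10(vnmin)))     # first decade edge: 10**2
    imax = int(np.ceil(np.log10(vnmax)))      # last decade edge:  10**4
    logbins = np.geomspace(10.0**imin, 10.0**imax, (imax-imin+1)*10)
    hist, edges = np.histogram(run_times, bins=logbins)
    assert hist.sum() == run_times.size       # every sample falls in some bin
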
diff --git a/hysop/backend/device/opencl/opencl_autotunable_kernel.py b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
index c9f4c6913113734ab9cdd47bcf77827a2d25a583..a6e003b18ea5d40cd7bc55067ab0771578560001 100644
--- a/hysop/backend/device/opencl/opencl_autotunable_kernel.py
+++ b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
@@ -1,3 +1,4 @@
+import subprocess
 from abc import ABCMeta, abstractmethod
 from hysop import __KERNEL_DEBUG__
 from hysop.deps import os
@@ -34,7 +35,7 @@ class OpenClAutotunableKernel(AutotunableKernel):
         from hysop.backend.device.opencl.opencl_kernel_autotuner import OpenClKernelAutotuner
         autotuner = OpenClKernelAutotuner(name=name, tunable_kernel=self)

-        best_candidate_results = autotuner.autotune(extra_kwds=extra_kwds,
+        best_candidate_results, file_basename, from_cache = autotuner.autotune(extra_kwds=extra_kwds,
                 force_verbose=force_verbose, force_debug=force_debug)
         check_instance(best_candidate_results, dict)
@@ -44,7 +45,8 @@ class OpenClAutotunableKernel(AutotunableKernel):
                 **extra_kwds['kernel_args'])

         return self.format_best_candidate(name=name, extra_kwds=extra_kwds,
-                args_mapping=args_mapping, args_list=args_list,
+                args_mapping=args_mapping, args_list=args_list, autotuner=autotuner,
+                file_basename=file_basename, from_cache=from_cache,
                 **best_candidate_results)

     def compute_global_work_size(self, work, local_work_size,
@@ -82,11 +84,15 @@ class OpenClAutotunableKernel(AutotunableKernel):
         })
         return known_vars

-    def format_best_candidate(self, name, extra_kwds, extra_parameters, work_load,
+    def format_best_candidate(self, autotuner,
+            file_basename, from_cache, name,
+            extra_kwds, extra_parameters,
+            work_size, work_load,
             global_work_size, local_work_size,
             args_mapping, args_list,
             program, kernel, kernel_name, kernel_src,
-            kernel_statistics, src_hash, hash_logs):
+            kernel_statistics, src_hash,
+            extra_kwds_hash, extra_kwds_hash_logs):
         """
         Post treatment callback for autotuner results.
         Transform autotuner results in user friendly kernel wrappers.
@@ -96,6 +102,7 @@ class OpenClAutotunableKernel(AutotunableKernel):
         Use the build_launcher method to build OpenClKernelLauncher from
         this OpenClKernel.
         """
+        check_instance(from_cache, bool)
         check_instance(extra_parameters, dict, keys=str)
         check_instance(extra_kwds, dict, keys=str)
         check_instance(work_load, tuple, values=npw.int32)
@@ -107,23 +114,66 @@ class OpenClAutotunableKernel(AutotunableKernel):
         check_instance(kernel_name, str)
         check_instance(kernel_statistics, OpenClKernelStatistics)
         check_instance(src_hash, str)
-        check_instance(hash_logs, str)
+        check_instance(extra_kwds_hash, str)
+        check_instance(extra_kwds_hash_logs, str)
+
+        kernel_hash_logs = self.generate_hash_logs(file_basename, extra_kwds_hash_logs)

-        isolation_params = extra_kwds['isolation_params']
-
-        kernel_source = self.generate_source_file(kernel_name, kernel_src)
+        kernel_source = self.generate_source_file(file_basename, kernel_src)
         kernel_isolation = self.generate_oclgrind_isolation_file(kernel,
-                kernel_name, kernel_source,
+                file_basename, kernel_source,
                 global_work_size, local_work_size,
-                args_list, args_mapping, isolation_params)
+                args_list, args_mapping,
+                extra_kwds['isolation_params'])

-        kernel = OpenClKernel(name=kernel_name, program=program,
+        kernel = OpenClKernel(name=autotuner.name, program=program,
                 args_mapping=args_mapping,
                 default_queue=None,
                 default_global_work_size=global_work_size,
                 default_local_work_size=local_work_size,
                 default_args=None)
+
+        autotuner_config = autotuner.autotuner_config
+        if autotuner_config.postprocess_kernels:
+            # execute command FILE_BASENAME FROM_CACHE
+            #                 AUTOTUNER_DUMP_DIR AUTOTUNER_NAME KERNEL_NAME
+            #                 MEAN_EXECUTION_TIME_NS MIN_EXECUTION_TIME_NS MAX_EXECUTION_TIME_NS
+            #                 KERNEL_SOURCE_FILE KERNEL_ISOLATION_FILE KERNEL_HASH_LOGS_FILE
+            #                 VENDOR_NAME DEVICE_NAME
+            #                 WORK_SIZE WORK_LOAD
+            #                 GLOBAL_WORK_SIZE LOCAL_WORK_SIZE
+            #                 EXTRA_PARAMETERS EXTRA_KWDS_HASH SRC_HASH
+            command = [str(autotuner_config.postprocess_kernels),
+                       str(file_basename),
+                       '1' if from_cache else '0',
+                       str(autotuner_config.dump_folder),
+                       str(autotuner.name),
+                       str(kernel_name),
+                       str(kernel_statistics.mean),
+                       str(kernel_statistics.min),
+                       str(kernel_statistics.max),
+                       str(kernel_source),
+                       str(kernel_isolation),
+                       str(kernel_hash_logs),
+                       str(kernel._kernel.context.devices[0].platform.name.strip()),
+                       str(kernel._kernel.context.devices[0].name.strip()),
+                       str(work_size),
+                       str(work_load),
+                       str(global_work_size),
+                       str(local_work_size),
+                       str(extra_parameters),
+                       str(extra_kwds_hash),
+                       str(src_hash)]
+            if autotuner_config.debug:
+                print('POSTPROCESSING KERNEL {}:\n'.format(autotuner.name) + ' '.join(command))
+            try:
+                subprocess.check_call(command)
+            except:
+                msg='\nFATAL ERROR: Failed to call autotuner postprocessing command.\n{}\n'
+                msg=msg.format(' '.join(command))
+                print(msg)
+                raise

         args_dict = extra_kwds['kernel_args']
         return (kernel, args_dict)
@@ -135,8 +185,8 @@ class OpenClAutotunableKernel(AutotunableKernel):

         # dump the best kernel
         dump_folder = self.autotuner_config.dump_folder
-        dump_file=dump_folder+'/rk{}_{}.cl'.format(
-                main_rank, kernel_name.replace(' ', '_'))
+        dump_file=dump_folder+'/{}__{}.cl'.format(
+                kernel_name.replace(' ', '_'), main_rank)
         if not os.path.exists(dump_folder) and (main_rank == 0):
             os.makedirs(dump_folder)
         with open(dump_file, 'w+') as f:
             if self.autotuner_config.verbose:
                 print ' >Saving OpenCL kernel source to \'{}\'.'.format(dump_file)
             f.write(kernel_src)
         return dump_file
+
+    def generate_hash_logs(self, kernel_name, hash_logs, force=False):
+        if (not force) and (not self.autotuner_config.dump_hash_logs):
+            return None
+
+        # dump the hash logs of the best kernel
+        dump_folder = self.autotuner_config.dump_folder
+        dump_file=dump_folder+'/{}__{}_hash_logs.txt'.format(
+                kernel_name.replace(' ', '_'), main_rank)
+        if not os.path.exists(dump_folder) and (main_rank == 0):
+            os.makedirs(dump_folder)
+        with open(dump_file, 'w+') as f:
+            if self.autotuner_config.verbose:
+                print ' >Saving hash logs to \'{}\'.'.format(dump_file)
+            f.write(hash_logs)
+        return dump_file

     def generate_oclgrind_isolation_file(self, kernel, kernel_name, kernel_source,
             global_work_size, local_work_size,
@@ -160,8 +226,8 @@ class OpenClAutotunableKernel(AutotunableKernel):
         assert len(sorted_args) == len(args_list)

         dump_folder = self.autotuner_config.dump_folder
-        dump_file=dump_folder+'/rk{}_{}.sim'.format(
-                main_rank, kernel_name.replace(' ', '_'))
+        dump_file=dump_folder+'/{}__{}.sim'.format(
+                kernel_name.replace(' ', '_'), main_rank)
         with open(dump_file, 'w+') as f:
             msg ='# Isolation configuration file for kernel {}.'.format(kernel_name)
             msg+='\n# See https://github.com/jrprice/Oclgrind/wiki/Running-Kernels-in-Isolation '
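The post-processing hook receives exactly twenty positional arguments, in the order listed in the comment above the command construction. Any executable can serve as the hook; the following is a hypothetical minimal Python stand-in for hysop/tools/postprocess_kernel.sh that validates the argument count and echoes name/value pairs:

    #!/usr/bin/env python
    import sys

    ARG_NAMES = ('FILE_BASENAME', 'FROM_CACHE',
                 'AUTOTUNER_DUMP_DIR', 'AUTOTUNER_NAME', 'KERNEL_NAME',
                 'MEAN_EXECUTION_TIME_NS', 'MIN_EXECUTION_TIME_NS', 'MAX_EXECUTION_TIME_NS',
                 'KERNEL_SOURCE_FILE', 'KERNEL_ISOLATION_FILE', 'KERNEL_HASH_LOGS_FILE',
                 'VENDOR_NAME', 'DEVICE_NAME',
                 'WORK_SIZE', 'WORK_LOAD', 'GLOBAL_WORK_SIZE', 'LOCAL_WORK_SIZE',
                 'EXTRA_PARAMETERS', 'EXTRA_KWDS_HASH', 'SRC_HASH')

    if len(sys.argv) != len(ARG_NAMES) + 1:
        sys.stderr.write('Expected {} parameters.\n'.format(len(ARG_NAMES)))
        sys.exit(1)
    for (name, value) in zip(ARG_NAMES, sys.argv[1:]):
        print('{}={}'.format(name, value))
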
diff --git a/hysop/tools/io_utils.py b/hysop/tools/io_utils.py
index 76c56c5a82bc48acff0a1a474b22f8e1c3d5ab7c..34c85ec3eed063df4b8bd8d35b942a9972df607a 100755
--- a/hysop/tools/io_utils.py
+++ b/hysop/tools/io_utils.py
@@ -8,7 +8,7 @@
 * :class:`~XMF`, tools to prepare/write xmf files.
 """

-import os, h5py, psutil, warnings, tempfile, socket
+import os, h5py, psutil, warnings, tempfile, socket, shutil, atexit
 import numpy as np
 import subprocess32 as subprocess
 from collections import namedtuple
@@ -29,6 +29,7 @@ class IO(object):

     _default_path = None
     _cache_path = None
+    _tmp_dirs = {}

     HDF5 = 998
     """HDF5 format id"""
@@ -127,6 +128,23 @@ class IO(object):
             IO.set_cache_path(IO.default_cache_path())
         return IO._cache_path

+    @classmethod
+    def get_tmp_dir(cls, key):
+        """
+        Create or get an existing temporary directory.
+        """
+        if (key in cls._tmp_dirs):
+            tmp_dir = cls._tmp_dirs[key]
+        else:
+            tmp_dir = tempfile.mkdtemp()
+            cls._tmp_dirs[key] = tmp_dir
+        return tmp_dir
+
+    @classmethod
+    def _remove_tmp_dirs(cls):
+        for f in cls._tmp_dirs.values():
+            shutil.rmtree(f, ignore_errors=True, onerror=None)
+
     @classmethod
     def set_cache_path(cls, path):
         if cls.is_shared_fs(path):
@@ -533,3 +551,6 @@ class XMF(object):
         xml_grid += " </Attribute>\n"
         xml_grid += " </Grid>\n"
         return xml_grid
+
+atexit.register(IO._remove_tmp_dirs)
+
diff --git a/hysop/tools/postprocess_kernel.sh b/hysop/tools/postprocess_kernel.sh
new file mode 100755
index 0000000000000000000000000000000000000000..61ed13cc18646b199802a3cf7c6aaf9fd46cd835
--- /dev/null
+++ b/hysop/tools/postprocess_kernel.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Example of an autotuner post-processing script.
+# Input arguments are:
+#   FILE_BASENAME FROM_CACHE
+#   AUTOTUNER_DUMP_DIR AUTOTUNER_NAME KERNEL_NAME
+#   MEAN_EXECUTION_TIME_NS MIN_EXECUTION_TIME_NS MAX_EXECUTION_TIME_NS
+#   KERNEL_SOURCE_FILE KERNEL_ISOLATION_FILE KERNEL_HASH_LOGS_FILE
+#   VENDOR_NAME DEVICE_NAME
+#   WORK_SIZE WORK_LOAD
+#   GLOBAL_WORK_SIZE LOCAL_WORK_SIZE
+#   EXTRA_PARAMETERS EXTRA_KWDS_HASH SRC_HASH
+# See the HysopArgParser interface in examples/example_utils.py and the
+# '--autotuner-postprocess-kernels' argument.
+
+set -e
+if [ "$#" -ne 20 ]; then
+    echo "Script expected 20 parameters."
+    exit 1
+fi
+
+FILE_BASENAME=${1}
+FROM_CACHE=${2}
+AUTOTUNER_DUMP_DIR=${3}
+AUTOTUNER_NAME=${4}
+KERNEL_NAME=${5}
+MEAN_EXECUTION_TIME_NS=${6}
+MIN_EXECUTION_TIME_NS=${7}
+MAX_EXECUTION_TIME_NS=${8}
+KERNEL_SOURCE_FILE=${9}
+KERNEL_ISOLATION_FILE=${10}
+KERNEL_HASH_LOGS_FILE=${11}
+VENDOR_NAME=${12}
+DEVICE_NAME=${13}
+WORK_SIZE=${14}
+WORK_LOAD=${15}
+GLOBAL_WORK_SIZE=${16}
+LOCAL_WORK_SIZE=${17}
+EXTRA_PARAMETERS=${18}
+EXTRA_KWDS_HASH=${19}
+SRC_HASH=${20}
+
+echo "Successfully postprocessed kernel '$AUTOTUNER_NAME'."
+exit 0
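For context, the temporary autotuner cache introduced above rests on the small keyed-directory registry added to IO: one mkdtemp() directory per key, reused on later calls and removed at interpreter exit. A reduced standalone sketch of the same pattern (class name is illustrative):

    import atexit, shutil, tempfile

    class TmpDirs(object):
        # Reduced stand-in for IO._tmp_dirs: one temporary directory per key.
        _tmp_dirs = {}

        @classmethod
        def get_tmp_dir(cls, key):
            if key not in cls._tmp_dirs:
                cls._tmp_dirs[key] = tempfile.mkdtemp()
            return cls._tmp_dirs[key]

        @classmethod
        def _remove_tmp_dirs(cls):
            for d in cls._tmp_dirs.values():
                shutil.rmtree(d, ignore_errors=True)

    atexit.register(TmpDirs._remove_tmp_dirs)

    cache_dir = TmpDirs.get_tmp_dir('kernel_autotuner')
    assert cache_dir == TmpDirs.get_tmp_dir('kernel_autotuner')  # reused, not recreated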