diff --git a/examples/example_utils.py b/examples/example_utils.py
index 7812f433e6ef8ec91d06a94a2efcc2e4d409ffe4..91d1a45ee17cd6c9fa54ae04cfc2dd41657f84e1 100644
--- a/examples/example_utils.py
+++ b/examples/example_utils.py
@@ -183,8 +183,12 @@ class HysopArgParser(argparse.ArgumentParser):
                 self._rmfiles(dump_dir, 'h5')
                 self._rmfiles(dump_dir, 'xmf')
                 self._rmfiles(dump_dir, 'out')
+                self._rmfiles(dump_dir, 'cl')
                 self._rmfiles(dump_dir, 'txt')
                 self._rmfiles(dump_dir, 'png')
+                self._rmfiles(dump_dir, 'sim')
+                self._rmfiles(dump_dir, 'xml')
+                self._rmfiles(dump_dir, 'json')
                 self._rmfiles(dump_dir, 'pdf')
                 self._rmfiles(dump_dir, 'npz')
                 self._rmfiles(dump_dir, 'pklz')
@@ -261,8 +265,10 @@ class HysopArgParser(argparse.ArgumentParser):
             return msg
     
     @staticmethod
-    def _mkdir(path):
-        path = os.path.dirname(os.path.realpath(path))
+    def _mkdir(path, dirname=True):
+        path = os.path.realpath(path)
+        if dirname:
+            path = os.path.dirname(path)
         try:
             os.makedirs(path)
         except OSError as e:
@@ -758,6 +764,13 @@ class HysopArgParser(argparse.ArgumentParser):
 
     def _add_autotuner_args(self):
         autotuner = self.add_argument_group('Kernel autotuner parameters')
+        autotuner.add_argument('--autotuner-dump-dir', type=str, default=None, 
+                dest='autotuner_dump_dir',
+                help='Configure kernel autotuner dump directory.')
+        autotuner.add_argument('--autotuner-cache-override', 
+                action='store_true',
+                dest='autotuner_cache_override',
+                help='Override kernel autotuner cached data. Best kernel candidates will be stored in a temporary directory instead of the persistent system-wide cache directory.')
         autotuner.add_argument('--autotuner-flag', type=str, default=None, 
                 dest='autotuner_flag', 
                 help=('Configure kernel autotuner rigor flag'
@@ -774,21 +787,22 @@ class HysopArgParser(argparse.ArgumentParser):
         autotuner.add_argument('--autotuner-verbose', type=int, default=None,
                 dest='autotuner_verbose',
                 help='Configure kernel autotuner kernel verbosity (0 to 5).')
-        autotuner.add_argument('--autotuner-debug', type=bool, default=None, 
+        autotuner.add_argument('--autotuner-debug', 
+                action='store_true',
                 dest='autotuner_debug',
                 help='Configure kernel autotuner kernel debug flag.')
-        autotuner.add_argument('--autotuner-dump-kernels', type=bool, default=None, 
+        autotuner.add_argument('--autotuner-dump-kernels', 
+                action='store_true',
                 dest='autotuner_dump_kernels',
                 help='Configure kernel autotuner kernel source dumping.')
-        autotuner.add_argument('--autotuner-dump-isolation', type=bool, default=None, 
+        autotuner.add_argument('--autotuner-dump-isolation', 
+                action='store_true',
                 dest='autotuner_dump_isolation',
-                help='Configure kernel autotuner kernel isolation file generation.')
-        autotuner.add_argument('--autotuner-cache-override', type=bool, default=None, 
-                dest='autotuner_cache_override',
-                help='Override kernel autotuner cached data.')
-        autotuner.add_argument('--autotuner-dump-dir', type=str, default=None, 
-                dest='autotuner_dump_dir',
-                help='Configure kernel autotuner dump directory.')
+                help='Configure kernel autotuner to generate Oclgrind kernel isolation files for each optimal kernel.')
+        autotuner.add_argument('--autotuner-dump-hash-logs', 
+                action='store_true',
+                dest='autotuner_dump_hash_logs',
+                help='Configure kernel autotuner to dump kernel extra keywords hash logs, for debugging kernel caching.')
         autotuner.add_argument('--autotuner-plot-statistics',
                 action='store_true',
                 dest='autotuner_plot_statistics',
@@ -796,16 +810,22 @@ class HysopArgParser(argparse.ArgumentParser):
         autotuner.add_argument('--autotuner-bench-kernels',
                 action='store_true',
                 dest='autotuner_bench_kernels',
-                help='Bench mode for kernels, enables exhaustive search without max candidates and disable prune threshold.')
+                help='Enable standard bench mode for kernels: search without a max candidate limit, at maximum verbosity, with cache override and nruns=8. The prune threshold and autotuner flag are left unchanged.')
+        autotuner.add_argument('--autotuner-postprocess-kernels', type=str, default=None,
+                dest='autotuner_postprocess_kernels',
+                help=('Run a custom command after each final kernel is generated: '
+                +'command  FILE_BASENAME  FROM_CACHE  AUTOTUNER_DUMP_DIR  AUTOTUNER_NAME  KERNEL_NAME  MEAN_EXECUTION_TIME_NS  MIN_EXECUTION_TIME_NS  MAX_EXECUTION_TIME_NS  KERNEL_SOURCE_FILE  KERNEL_ISOLATION_FILE  KERNEL_HASH_LOGS_FILE  VENDOR_NAME  DEVICE_NAME  WORK_SIZE  WORK_LOAD  GLOBAL_WORK_SIZE  LOCAL_WORK_SIZE  EXTRA_PARAMETERS  EXTRA_KWDS_HASH  SRC_HASH. '
+                +'See hysop/tools/postprocess_kernel.sh for an example post-processing script.'))
         return autotuner
     
     def _check_autotuner_args(self, args):
-        self._check_default(args, ('autotuner_flag', 'autotuner_dump_dir'), 
+        self._check_default(args, ('autotuner_flag', 'autotuner_dump_dir', 'autotuner_postprocess_kernels'), 
                                         str, allow_none=True)
         self._check_default(args, ('autotuner_nruns', 'autotuner_max_candidates', 
                                    'autotuner_verbose'), int, allow_none=True)
         self._check_default(args, ('autotuner_dump_kernels', 
                                    'autotuner_dump_isolation',
+                                   'autotuner_dump_hash_logs',
                                    'autotuner_bench_kernels',
                                    'autotuner_plot_statistics'), 
                                    bool, allow_none=True)
@@ -824,8 +844,6 @@ class HysopArgParser(argparse.ArgumentParser):
             args.autotuner_nruns = 8
             args.autotuner_max_candidates = np.iinfo(np.int64).max
             args.autotuner_verbose = np.iinfo(np.int64).max
-            args.autotuner_flag = self._convert_autotuner_flag('autotuner_flag', 'exhaustive')
-            args.autotuner_prune_threshold = 2.0
             args.autotuner_cache_override = True
             args.autotuner_plot_statistics = True
     
@@ -843,10 +861,12 @@ class HysopArgParser(argparse.ArgumentParser):
                 verbose=args.autotuner_verbose,
                 debug=args.autotuner_debug,
                 dump_kernels=args.autotuner_dump_kernels,
+                dump_hash_logs=args.autotuner_dump_hash_logs,
                 generate_isolation_file=args.autotuner_dump_isolation,
                 plot_statistics=args.autotuner_plot_statistics,
                 override_cache=override_cache,
-                dump_folder=args.autotuner_dump_dir)
+                dump_folder=args.autotuner_dump_dir,
+                postprocess_kernels=args.autotuner_postprocess_kernels)
         return autotuner_config
 
     def _add_file_io_args(self, default_dump_dir, generate_io_params):
@@ -1286,6 +1306,8 @@ class HysopArgParser(argparse.ArgumentParser):
                 msg='{} directory \'{}\' cannot be stored on a network file system.'
                 msg=msg.format(argname, argvalue)
                 self.error(msg)
+
+            self._mkdir(argvalue, dirname=False)
             setattr(args, argname, os.path.realpath(argvalue))
 
     def _setup_hysop_env(self, args):
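
Note on the reworked _mkdir helper above: it now creates either the path itself
(dirname=False) or, by default, the parent directory of the given path. A minimal
standalone sketch of the same semantics, assuming the pre-existing except clause
(whose body is outside this hunk) ignores EEXIST errors:

    import os, errno

    def _mkdir(path, dirname=True):
        path = os.path.realpath(path)
        if dirname:
            path = os.path.dirname(path)  # keep only the parent directory of a file path
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST:   # assumed: already existing directories are fine
                raise

    _mkdir('/tmp/hysop_demo/dump', dirname=False)  # creates the directory itself
    _mkdir('/tmp/hysop_demo/dump/data.h5')         # creates only the parent directory
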
diff --git a/hysop/backend/device/autotunable_kernel.py b/hysop/backend/device/autotunable_kernel.py
index 20ef8c74b02abbe5df3a94735cb9cc6106ba1a95..b70bcf0dd35dcdaba49a087a8c3b22b110d0f874 100644
--- a/hysop/backend/device/autotunable_kernel.py
+++ b/hysop/backend/device/autotunable_kernel.py
@@ -23,7 +23,7 @@ class AutotunableKernel(object):
         self.symbolic_mode = first_not_None(symbolic_mode, autotuner_config.debug)
 
     def custom_hash(self, *args, **kwds):
-        HASH_DEBUG=False
+        HASH_DEBUG=self.autotuner_config.dump_hash_logs
         assert args or kwds, 'no arguments to be hashed.'
 
         def _hash_arg(a):
diff --git a/hysop/backend/device/codegen/kernels/transpose.py b/hysop/backend/device/codegen/kernels/transpose.py
index 4d4701de045b93cf0c6f7e8424394aae6606c5fc..15ab1b9640d018835cbfa633d75456255c36ecf0 100644
--- a/hysop/backend/device/codegen/kernels/transpose.py
+++ b/hysop/backend/device/codegen/kernels/transpose.py
@@ -25,8 +25,8 @@ class TransposeKernelGenerator(KernelCodeGenerator):
             use_diagonal_coordinates):
         pdim = len(axes)
         axes = [ str(j) if i!=j else 'X' for i,j in enumerate(axes) ]
-        return '{}transpose_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
-                'diag_' if use_diagonal_coordinates else '',
+        return 'transpose{}_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
+                '_dc' if use_diagonal_coordinates else '_nc',
                 'inplace' if is_inplace else 'out_of_place',
                 ctype.replace(' ','_'), pdim, vectorization, tile_size, tile_padding, 
                 '_'.join(axes))
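
The transpose kernel basename above now encodes the coordinate mode as a
'_nc'/'_dc' marker appended to 'transpose' instead of an optional 'diag_' prefix.
A quick illustration of the resulting name, with made-up parameters (axes,
vectorization, tile size and padding are arbitrary):

    axes = (0, 2, 1)                  # hypothetical permutation: last two axes swapped
    pdim = len(axes)
    axes = [str(j) if i != j else 'X' for (i, j) in enumerate(axes)]
    name = 'transpose{}_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
            '_nc',                    # '_dc' if use_diagonal_coordinates else '_nc'
            'out_of_place', 'float', pdim, 4, 16, 1, '_'.join(axes))
    print(name)  # transpose_nc_out_of_place_float_3d__N4__T16__P1__X_2_1
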
diff --git a/hysop/backend/device/kernel_autotuner.py b/hysop/backend/device/kernel_autotuner.py
index 1fce4b597188274c40bcdb70318cc6b58c4b559a..316579c87d17c162af3926164347c2cde25968fb 100644
--- a/hysop/backend/device/kernel_autotuner.py
+++ b/hysop/backend/device/kernel_autotuner.py
@@ -28,12 +28,15 @@ class KernelAutotuner(object):
     @staticmethod 
     def _hash_func():
         return hashlib.new('sha256')
-
-    @staticmethod
-    def cache_dir():
-        cache_dir = IO.cache_path() + '/kernel_autotuner'
-        return cache_dir
-
+    
+    def use_tmp_cache(self):
+        self._cache_dir = IO.get_tmp_dir('kernel_autotuner')
+    def use_system_cache(self):
+        self._cache_dir = IO.cache_path() + '/kernel_autotuner'
+
+    def cache_dir(self):
+        assert (self._cache_dir is not None)
+        return self._cache_dir
     def cache_file(self):
         cache_file = '{}/{}.pklz'.format(self.cache_dir(), self.name.replace(' ','_'))
         return cache_file
@@ -80,12 +83,25 @@ class KernelAutotuner(object):
         self.indent = lambda i: '  '*i
         self.verbose = self.autotuner_config.verbose
             
-        self.prg_idx      = 4
-        self.knl_idx      = 5
-        self.stats_idx    = 6
-        self.src_idx      = 7
-        self.src_hash_idx = 9
-        self.logs_idx     = 10
+        self.result_keys = (
+                'extra_parameters',    #00
+                'work_size',           #01
+                'work_load',           #02
+                'global_work_size',    #03
+                'local_work_size',     #04
+                'program',             #05
+                'kernel',              #06
+                'kernel_statistics',   #07
+                'kernel_src',          #08
+                'kernel_name',         #09
+                'src_hash',            #10
+                'extra_kwds_hash',     #11
+                'extra_kwds_hash_logs' #12
+            )
+        for (i, pname) in enumerate(self.result_keys):
+            setattr(self, '{}_idx'.format(pname), i)
+        
+        self._cache_dir = None
             
     def autotune(self, extra_kwds,
             first_working=False, 
@@ -105,34 +121,42 @@ class KernelAutotuner(object):
         autotuner_config = self.autotuner_config
 
         extra_kwds_hash, extra_kwds_hash_logs = tkernel.hash_extra_kwds(extra_kwds)
+        hasher = self._hash_func()
+        hasher.update(str(extra_kwds_hash))
+        extra_kwds_hash = hasher.hexdigest()
+        check_instance(extra_kwds_hash, str)
         check_instance(extra_kwds_hash_logs, str)
+        file_basename = '{}_{}'.format(self.name, extra_kwds_hash[:4])
         
         self._print_header(extra_kwds)
-        results = self._reload_cache(extra_kwds_hash)
-        
         if autotuner_config.override_cache:
             if self.verbose:
-                print self.indent(1)+'>Ignoring cached results, benching all kernels.'
-            best_candidate = None
-        elif first_working:
+                print self.indent(1)+'>Using temporary cache folder, benching all new kernels.'
+            self.use_tmp_cache()
+        else:
+            self.use_system_cache()
+        results = self._reload_cache(extra_kwds_hash)
+        
+        if first_working:
             best_candidate = None
         else:
             best_candidate = self._load_results_from_cache(tkernel, results, extra_kwds,
-                    force_verbose, force_debug, extra_kwds_hash_logs)
+                    force_verbose, force_debug, extra_kwds_hash, extra_kwds_hash_logs, file_basename)
 
         if (best_candidate is None):
             best_candidate = self._autotune_kernels(tkernel, results, extra_kwds,
-                    force_verbose, force_debug, first_working, extra_kwds_hash_logs)
+                    force_verbose, force_debug, first_working, 
+                    extra_kwds_hash, extra_kwds_hash_logs, file_basename)
+            from_cache = False
+        else:
+            from_cache = True
         
-        result_keys = ('extra_parameters', 'work_load', 'global_work_size', 'local_work_size', 
-                        'program', 'kernel', 'kernel_statistics', 'kernel_src', 'kernel_name', 
-                        'src_hash', 'hash_logs')
-        assert len(result_keys) == len(best_candidate)
-        return dict(zip(result_keys, best_candidate))
+        assert len(self.result_keys) == len(best_candidate)
+        return dict(zip(self.result_keys, best_candidate)), file_basename, from_cache
 
     
     def _load_results_from_cache(self, tkernel, results, extra_kwds,
-            force_verbose, force_debug, extra_kwds_hash_logs):
+            force_verbose, force_debug, extra_kwds_hash, extra_kwds_hash_logs, file_basename):
         if (self.FULL_RESULTS_KEY not in results):
             if self.verbose:
                 print ('  >No best candidate was cached for this configuration, '
@@ -147,10 +171,19 @@ class KernelAutotuner(object):
         # pyopencl kernel and program objects.
         best_candidate = copy.deepcopy(results[self.FULL_RESULTS_KEY])
         
-        (extra_parameters, work_load, global_work_size, local_work_size, 
-            prg, kernel, statistics, cached_kernel_src, 
-            cached_kernel_name, cached_src_hash, 
-            cached_kernel_hash_logs) = best_candidate
+        (extra_parameters, 
+          work_size, work_load, global_work_size, local_work_size, 
+          prg, kernel, statistics, cached_kernel_src, 
+          cached_kernel_name, cached_src_hash, 
+          cached_kernel_hash, cached_kernel_hash_logs) = best_candidate
+        
+        if (cached_kernel_hash != extra_kwds_hash):
+            msg='\nCached kernel extra_kwds hash did not match the benched one:\n {}\n {}\n'
+            msg+='\nThis might be due to an upgrade of the generated code or '
+            msg+='a faulty implementation of {}.hash_extra_kwds().'
+            msg=msg.format(cached_kernel_hash, extra_kwds_hash, type(tkernel).__name__)
+            warnings.warn(msg, CodeGeneratorWarning)
+            return None
 
         assert prg is None
         assert kernel is None
@@ -213,15 +246,16 @@ class KernelAutotuner(object):
                 global_work_size=global_work_size, 
                 local_work_size=local_work_size)
             
-        best_candidate[self.prg_idx]  = prg
-        best_candidate[self.knl_idx]  = kernel
-        best_candidate[self.src_idx]  = kernel_src
-        best_candidate[self.logs_idx] = extra_kwds_hash_logs
+        best_candidate[self.program_idx]    = prg
+        best_candidate[self.kernel_idx]     = kernel
+        best_candidate[self.kernel_src_idx] = kernel_src
+        best_candidate[self.extra_kwds_hash_logs_idx]  = extra_kwds_hash_logs
         return tuple(best_candidate)
 
 
     def _autotune_kernels(self, tkernel, results, extra_kwds, 
-            force_verbose, force_debug, first_working, extra_kwds_hash_logs):
+            force_verbose, force_debug, first_working, 
+            extra_kwds_hash, extra_kwds_hash_logs, file_basename):
         autotuner_config = self.autotuner_config 
         if first_working:
             nruns = 1
@@ -238,6 +272,7 @@ class KernelAutotuner(object):
         ks = AutotunedKernelStatistics(tkernel, extra_kwds)
         ks.max_candidates = max_candidates
         ks.nruns = nruns
+        ks.file_basename = file_basename
         
         with Timer() as timer:
             params = tkernel.compute_parameters(extra_kwds=extra_kwds)
@@ -265,6 +300,7 @@ class KernelAutotuner(object):
                                                           preferred_work_group_size_multiple=preferred_work_group_size_multiple,
                                                           extra_parameters=extra_parameters,
                                                           extra_kwds=extra_kwds)
+                work_size = work_bounds.work_size
                 
                 self._print_parameters(extra_parameters, work_bounds)
 
@@ -306,7 +342,7 @@ class KernelAutotuner(object):
                             hasher.update(kernel_src)
                             src_hash = hasher.hexdigest()
 
-                            if (not autotuner_config.override_cache) and (run_key in results):
+                            if (run_key in results):
                                 (cache_src_hash, cache_stats) = results[run_key]
                                 if (cache_src_hash != src_hash):
                                     msg='\nCached parameters candidate did not match the '
@@ -354,19 +390,21 @@ class KernelAutotuner(object):
                                 local_best = False
                             
                             candidate =  (extra_parameters, 
+                                    tuple(work_size),
                                     tuple(work_load), 
                                     tuple(global_work_size), 
                                     tuple(local_work_size), 
                                     prg, kernel, statistics, 
                                     kernel_src, kernel_name, 
-                                    src_hash, extra_kwds_hash_logs)
+                                    src_hash, extra_kwds_hash, extra_kwds_hash_logs)
                             
                             results[run_key] = (src_hash, statistics)
                             bench_results[run_key] = candidate
                             pks.push_run_statistics(run_key,
-                                    work_load=work_load, local_work_size=local_work_size, 
-                                    global_work_size=global_work_size, statistics=statistics, 
-                                    pruned=pruned, local_best=local_best, error=None)
+                                    work_size=work_size, work_load=work_load,
+                                    local_work_size=local_work_size, global_work_size=global_work_size, 
+                                    statistics=statistics, pruned=pruned, 
+                                    local_best=local_best, error=None)
                         except KernelGenerationError as e:
                             if __KERNEL_DEBUG__:
                                 sys.stderr.write(str(e)+'\n')
@@ -374,10 +412,10 @@ class KernelAutotuner(object):
                             statistics = None
                             from_cache=False
                             pks.push_run_statistics(run_key,
-                                    work_load=work_load, local_work_size=local_work_size, 
-                                    global_work_size=global_work_size, 
-                                    statistics=None,
-                                    pruned=None, local_best=None, error=e)
+                                    work_size=work_size, work_load=work_load, 
+                                    local_work_size=local_work_size, global_work_size=global_work_size, 
+                                    statistics=None, pruned=None, 
+                                    local_best=None, error=e)
                         total_count += 1
                         abort = (max_candidates is not None) and \
                                 ((pruned_count + kept_count) >= max_candidates)
@@ -412,7 +450,7 @@ class KernelAutotuner(object):
             keep_only = max(previous_pow2(kept_count),1)
             self._print_first_step_results(total_count, kept_count, pruned_count, 
                     failed_count, keep_only)
-            candidates = sorted(bench_results.items(), key=lambda x: x[1][self.stats_idx])
+            candidates = sorted(bench_results.items(), key=lambda x: x[1][self.kernel_statistics_idx])
             candidates = candidates[:keep_only]
             while(len(candidates)>1):
                 step_count += 1
@@ -420,16 +458,16 @@ class KernelAutotuner(object):
                 
                 self._print_step(step_count, '{} BEST'.format(len(candidates)), nruns)
                 for (run_key, run_params) in candidates:
-                    (extra_params, work_load, global_work_size, local_work_size, 
-                            _, kernel, old_stats, _, _, _, _) = run_params
+                    (extra_params, work_size, work_load, global_work_size, local_work_size, 
+                            _, kernel, old_stats, _, _, _, _, _) = run_params
                     self.bench_one_from_binary(kernel=kernel,
                                              target_nruns=nruns, 
                                              old_stats=old_stats,
                                              best_stats=best_stats,
                                              global_work_size=global_work_size,
                                              local_work_size=local_work_size)
-                candidates = sorted(candidates, key=lambda x: x[1][self.stats_idx])
-                self._print_step_results(candidates, self.stats_idx)
+                candidates = sorted(candidates, key=lambda x: x[1][self.kernel_statistics_idx])
+                self._print_step_results(candidates, self.kernel_statistics_idx)
                 candidates = candidates[:max(previous_pow2(len(candidates)),1)]
                 ks.push_step(step_count, candidates)
             best_candidate = candidates[0][1]
@@ -437,11 +475,12 @@ class KernelAutotuner(object):
         
         ks.exec_time = timer.interval
         ks.best_candidate = best_candidate
-        ks.kernel_name = kernel_name
+        ks.kernel_name = self.name
         ks.kept_count = kept_count
         ks.pruned_count = pruned_count
         ks.failed_count = failed_count
         ks.total_count = total_count
+        ks.extra_kwds_hash = best_candidate[self.extra_kwds_hash_idx]
         if autotuner_config.plot_statistics and not first_working: 
             ks.plot()
         
@@ -452,10 +491,10 @@ class KernelAutotuner(object):
 
         # Export best candidate results
         if not self.STORE_FULL_KERNEL_SOURCES:
-            best_candidate[self.src_idx]  = None
-            best_candidate[self.logs_idx] = None
-        best_candidate[self.prg_idx] = None
-        best_candidate[self.knl_idx] = None
+            best_candidate[self.kernel_src_idx] = None
+            best_candidate[self.extra_kwds_hash_logs_idx]  = None
+        best_candidate[self.program_idx] = None
+        best_candidate[self.kernel_idx] = None
         results[self.FULL_RESULTS_KEY] = best_candidate
         self._dump_cache()
 
@@ -463,8 +502,8 @@ class KernelAutotuner(object):
     
     def _build_final_kernel(self, tkernel, best_candidate,
             extra_kwds):
-        (extra_parameters, work_load, global_work_size, local_work_size, 
-            _, _, _, _, _, _, _) = best_candidate
+        (extra_parameters, work_size, work_load, global_work_size, local_work_size, 
+            _, _, _, _, _, _, _, _) = best_candidate
         
         global_work_size = npw.asintegerarray(global_work_size)
         local_work_size  = npw.asintegerarray(local_work_size)
@@ -490,10 +529,10 @@ class KernelAutotuner(object):
                 global_work_size=global_work_size, 
                 local_work_size=local_work_size)
          
-        best_candidate[self.prg_idx] = prg
-        best_candidate[self.knl_idx] = kernel
-        best_candidate[self.src_idx] = kernel_src
-        best_candidate[self.src_hash_idx] = src_hash
+        best_candidate[self.program_idx]    = prg
+        best_candidate[self.kernel_idx]     = kernel
+        best_candidate[self.kernel_src_idx] = kernel_src
+        best_candidate[self.src_hash_idx]   = src_hash
         return best_candidate
 
     def _compute_args_list(self, args_mapping, **kernel_args):
@@ -730,17 +769,17 @@ class KernelAutotuner(object):
             print config
 
 
-    def _print_step_results(self, sorted_candidates, stats_idx):
+    def _print_step_results(self, sorted_candidates, kernel_statistics_idx):
         if self.verbose==2:
             best  = sorted_candidates[0][1]
             worst = sorted_candidates[-1][1]
-            print self.indent(2)+'worst candidate: {}'.format(worst[stats_idx])
-            print self.indent(2)+'best  candidate: {}'.format(best[stats_idx])
+            print self.indent(2)+'worst candidate: {}'.format(worst[kernel_statistics_idx])
+            print self.indent(2)+'best  candidate: {}'.format(best[kernel_statistics_idx])
             
     def _print_footer(self, ellapsed, best_candidate):
         if self.verbose:
-            (best_extra_params, best_work_load, best_global_size, best_local_size, 
-                    _, _, best_stats, _, _, _, _) = best_candidate
+            (best_extra_params, best_work_size, best_work_load, best_global_size, best_local_size, 
+                    _, _, best_stats, _, _, _, _, _) = best_candidate
             if self.verbose>1:
                 if ellapsed is not None:
                     self._print_separator()
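
The magic tuple indices used before (prg_idx, knl_idx, stats_idx, ...) are
replaced above by a single result_keys tuple from which matching <key>_idx
attributes are generated. A minimal sketch of this pattern, with a shortened
key set for illustration:

    class Results(object):
        result_keys = ('work_size', 'kernel_statistics', 'src_hash')
        def __init__(self):
            for (i, pname) in enumerate(self.result_keys):
                setattr(self, '{}_idx'.format(pname), i)

    r = Results()
    candidate = ((16, 16), 1.25e3, 'deadbeef')
    assert candidate[r.kernel_statistics_idx] == 1.25e3
    assert dict(zip(r.result_keys, candidate))['src_hash'] == 'deadbeef'

The order of result_keys must stay in sync with the candidate tuples built in
_autotune_kernels and with the tuples cached on disk, which the length asserts
above help to enforce.
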
diff --git a/hysop/backend/device/kernel_autotuner_config.py b/hysop/backend/device/kernel_autotuner_config.py
index b3b4c7710904d8e5b93ba2e1f6ec3e141a335a2a..d6f0fc731f65620309ea3d7be500c6f4ef322015 100644
--- a/hysop/backend/device/kernel_autotuner_config.py
+++ b/hysop/backend/device/kernel_autotuner_config.py
@@ -16,17 +16,19 @@ class KernelAutotunerConfig(object):
     }
 
     def __init__(self,
-            dump_folder     = None,
-            autotuner_flag  = None,
-            prune_threshold = None,
-            max_candidates  = None,
-            verbose         = None,
-            debug           = None,
-            dump_kernels    = None,
+            dump_folder             = None,
+            autotuner_flag          = None,
+            prune_threshold         = None,
+            max_candidates          = None,
+            verbose                 = None,
+            debug                   = None,
+            dump_kernels            = None,
+            dump_hash_logs          = None,
             generate_isolation_file = None,
-            override_cache  = None,
-            nruns           = None,
-            plot_statistics = None):
+            override_cache          = None,
+            nruns                   = None,
+            plot_statistics         = None,
+            postprocess_kernels     = None):
 
         dump_folder     = first_not_None(dump_folder, self.default_dump_folder())
         autotuner_flag  = first_not_None(autotuner_flag,  DEFAULT_AUTOTUNER_FLAG)
@@ -34,10 +36,12 @@ class KernelAutotunerConfig(object):
         max_candidates  = first_not_None(max_candidates, 4)
         verbose         = first_not_None(verbose,  2*__VERBOSE__)
         debug           = first_not_None(debug, __KERNEL_DEBUG__)
-        dump_kernels    = first_not_None(dump_kernels, __KERNEL_DEBUG__)
+        dump_kernels    = first_not_None(dump_kernels,   __KERNEL_DEBUG__)
+        dump_hash_logs  = first_not_None(dump_hash_logs, __KERNEL_DEBUG__)
         generate_isolation_file = first_not_None(generate_isolation_file, __KERNEL_DEBUG__)
         override_cache = first_not_None(override_cache, False)
         plot_statistics = first_not_None(plot_statistics, False)
+        postprocess_kernels = first_not_None(postprocess_kernels, False)
 
         if (nruns is None):
             nruns = self._default_initial_runs[autotuner_flag]
@@ -60,9 +64,11 @@ class KernelAutotunerConfig(object):
         self.nruns = nruns
         self.dump_folder = dump_folder
         self.dump_kernels = dump_kernels
+        self.dump_hash_logs = dump_hash_logs
         self.max_candidates = max_candidates
         self.generate_isolation_file = generate_isolation_file
         self.plot_statistics = plot_statistics
+        self.postprocess_kernels = postprocess_kernels
 
     @abstractmethod
     def default_dump_folder(self):
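
The defaults above are resolved through first_not_None, which returns its first
argument that is not None, so falsy values such as False or 0 are preserved. A
minimal equivalent, assuming that semantic:

    def first_not_None(*args):
        for a in args:
            if a is not None:
                return a
        return None

    assert first_not_None(None, False, 3) is False  # falsy but not None: kept
    assert first_not_None(None, None) is None

This is why postprocess_kernels can default to False above while still carrying
a command string when --autotuner-postprocess-kernels is given.
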
diff --git a/hysop/backend/device/kernel_autotuner_statistics.py b/hysop/backend/device/kernel_autotuner_statistics.py
index 57efe67f2d6c5a73eb475081c8ae33c379e51140..f372f5565a1abbcc6d9336e717bff6311f397b3e 100644
--- a/hysop/backend/device/kernel_autotuner_statistics.py
+++ b/hysop/backend/device/kernel_autotuner_statistics.py
@@ -8,9 +8,11 @@ class AutotunedKernelStatistics(dict):
     class AutotunedParameterStatistics(dict):
         class AutotunedRunStatistics(object):
             def __init__(self,
-                        work_load, local_work_size, 
-                        global_work_size, statistics, 
-                        pruned, local_best, error):
+                        work_size, work_load, 
+                        local_work_size, global_work_size, 
+                        statistics, pruned, 
+                        local_best, error):
+                self.work_size = work_size
                 self.work_load = work_load
                 self.local_work_size = local_work_size
                 self.global_work_size = global_work_size
@@ -42,6 +44,7 @@ class AutotunedKernelStatistics(dict):
         self.pruned_count = None
         self.failed_count = None
         self.total_count = None
+        self.file_basename = None
         self.steps = {}
     def push_parameters(self, extra_param_hash, **kwds):
         return self.setdefault(extra_param_hash, self.AutotunedParameterStatistics(**kwds))
@@ -49,23 +52,11 @@ class AutotunedKernelStatistics(dict):
         self.steps[step_id] = candidates
 
     def plot(self):
+        self.collect_exec_times()
         self.plot_histogram()
 
     def plot_histogram(self):
-        exec_times = self.collect_exec_times()
-
-    def collect_exec_times(self):
-        run_times = ()
-        for (extra_param_hash, parameter_statistics) in self.iteritems():
-            if not parameter_statistics.good():
-                continue
-            for (run_key, run_statistics) in parameter_statistics.iteritems():
-                if not run_statistics.good():
-                    continue
-                run_time = run_statistics.statistics.mean
-                run_times += (run_time,)
-                #run_times += run_statistics.statistics.data[:self.nruns]
-        run_times = np.asarray(run_times, dtype=np.float64)
+        run_times = self.run_times.copy()
         for unit in ('ns', 'us', 'ms', 's'):
             if run_times.min() < 1e2:
                 break
@@ -77,8 +68,6 @@ class AutotunedKernelStatistics(dict):
         imax = int(np.ceil(np.log10(vnmax)))
         xmin = 10.0**imin
         xmax = 10.0**imax
-        #N = 10
-        #logbins = tuple(j*(10//N)*(10.0**i) for i in xrange(imin, imax) for j in xrange(1,N) ) + (xmax,)
         logbins = np.geomspace(xmin, xmax, (imax-imin+1)*10)
         fig, axe = plt.subplots()
         fig.suptitle(self.kernel_name, weight='bold')
@@ -97,9 +86,20 @@ class AutotunedKernelStatistics(dict):
         axe.axvline(x=vnmean, label=r'median: ${:.1f} {unit}$ (x{:.1f})'.format(vmean, vnmean/vnmin, unit=unit), color='darkorange')
         axe.axvline(x=vnmax,  label=r'worst: ${:.1f} {unit}$ (x{:.1f})'.format(vmax, vnmax/vnmin, unit=unit), color='r')
         axe.legend(framealpha=1.0, title='Execution times')
-        fig.savefig('{}/histo_{}.png'.format(
-            self.tkernel.autotuner_config.dump_folder, 
-            self.kernel_name))
-        import sys
-        #sys.exit(1)
+        fig.savefig('{}/{}_histo.png'.format(
+            self.tkernel.autotuner_config.dump_folder, self.file_basename))
+
+    def collect_exec_times(self):
+        run_times = ()
+        for (extra_param_hash, parameter_statistics) in self.iteritems():
+            if not parameter_statistics.good():
+                continue
+            for (run_key, run_statistics) in parameter_statistics.iteritems():
+                if not run_statistics.good():
+                    continue
+                run_time = run_statistics.statistics.mean
+                run_times += (run_time,)
+                #run_times += run_statistics.statistics.data[:self.nruns]
+        run_times = np.asarray(run_times, dtype=np.float64)
+        self.run_times = run_times
 
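
plot_histogram above bins the collected mean run times on a logarithmic axis
built with np.geomspace. A self-contained sketch of the same binning scheme on
synthetic timings (the distribution, sample size and output path are made up):

    import numpy as np
    import matplotlib
    matplotlib.use('Agg')  # headless backend; an assumption, not required upstream
    import matplotlib.pyplot as plt

    run_times = np.random.lognormal(mean=3.0, sigma=0.5, size=256)  # fake times
    imin = int(np.floor(np.log10(run_times.min())))
    imax = int(np.ceil(np.log10(run_times.max())))
    logbins = np.geomspace(10.0**imin, 10.0**imax, (imax - imin + 1) * 10)
    fig, axe = plt.subplots()
    axe.hist(run_times, bins=logbins)
    axe.set_xscale('log')
    fig.savefig('/tmp/demo_histo.png')
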
diff --git a/hysop/backend/device/opencl/opencl_autotunable_kernel.py b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
index c9f4c6913113734ab9cdd47bcf77827a2d25a583..a6e003b18ea5d40cd7bc55067ab0771578560001 100644
--- a/hysop/backend/device/opencl/opencl_autotunable_kernel.py
+++ b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
@@ -1,3 +1,4 @@
+import subprocess
 from abc import ABCMeta, abstractmethod
 from hysop import __KERNEL_DEBUG__
 from hysop.deps import os
@@ -34,7 +35,7 @@ class OpenClAutotunableKernel(AutotunableKernel):
         from hysop.backend.device.opencl.opencl_kernel_autotuner import OpenClKernelAutotuner
         autotuner = OpenClKernelAutotuner(name=name, tunable_kernel=self)
 
-        best_candidate_results = autotuner.autotune(extra_kwds=extra_kwds,
+        best_candidate_results, file_basename, from_cache = autotuner.autotune(extra_kwds=extra_kwds,
                 force_verbose=force_verbose, force_debug=force_debug)
         check_instance(best_candidate_results, dict)
 
@@ -44,7 +45,8 @@ class OpenClAutotunableKernel(AutotunableKernel):
                 **extra_kwds['kernel_args'])
 
         return self.format_best_candidate(name=name, extra_kwds=extra_kwds,
-                args_mapping=args_mapping, args_list=args_list,
+                args_mapping=args_mapping, args_list=args_list, autotuner=autotuner,
+                file_basename=file_basename, from_cache=from_cache,
                 **best_candidate_results)
 
     def compute_global_work_size(self, work, local_work_size,
@@ -82,11 +84,15 @@ class OpenClAutotunableKernel(AutotunableKernel):
             })
         return known_vars
 
-    def format_best_candidate(self, name, extra_kwds, extra_parameters, work_load,
+    def format_best_candidate(self, autotuner,
+            file_basename, from_cache, name,
+            extra_kwds, extra_parameters, 
+            work_size, work_load,
             global_work_size, local_work_size,
             args_mapping, args_list,
             program, kernel, kernel_name, kernel_src,
-            kernel_statistics, src_hash, hash_logs):
+            kernel_statistics, src_hash, 
+            extra_kwds_hash, extra_kwds_hash_logs):
         """
         Post treatment callback for autotuner results.
         Transform autotuner results in user friendly kernel wrappers.
@@ -96,6 +102,7 @@ class OpenClAutotunableKernel(AutotunableKernel):
 
         Use the build_launcher method to build OpenClKernelLauncher from this OpenClKernel.
         """
+        check_instance(from_cache, bool)
         check_instance(extra_parameters, dict, keys=str)
         check_instance(extra_kwds, dict, keys=str)
         check_instance(work_load, tuple, values=npw.int32)
@@ -107,23 +114,66 @@ class OpenClAutotunableKernel(AutotunableKernel):
         check_instance(kernel_name, str)
         check_instance(kernel_statistics, OpenClKernelStatistics)
         check_instance(src_hash, str)
-        check_instance(hash_logs, str)
+        check_instance(extra_kwds_hash, str)
+        check_instance(extra_kwds_hash_logs, str)
+        
+        kernel_hash_logs = self.generate_hash_logs(file_basename, extra_kwds_hash_logs)
 
-        isolation_params = extra_kwds['isolation_params']
-
-        kernel_source = self.generate_source_file(kernel_name, kernel_src)
+        kernel_source = self.generate_source_file(file_basename, kernel_src)
 
         kernel_isolation = self.generate_oclgrind_isolation_file(kernel,
-                kernel_name, kernel_source,
+                file_basename, kernel_source,
                 global_work_size, local_work_size,
-                args_list, args_mapping, isolation_params)
+                args_list, args_mapping, 
+                extra_kwds['isolation_params'])
 
-        kernel = OpenClKernel(name=kernel_name, program=program,
+        kernel = OpenClKernel(name=autotuner.name, program=program,
                 args_mapping=args_mapping,
                 default_queue=None,
                 default_global_work_size=global_work_size,
                 default_local_work_size=local_work_size,
                 default_args=None)
+        
+        autotuner_config = autotuner.autotuner_config
+        if autotuner_config.postprocess_kernels:
+            # execute command FILE_BASENAME FROM_CACHE
+            # AUTOTUNER_DUMP_DIR  AUTOTUNER_NAME  KERNEL_NAME
+            # MEAN_EXECUTION_TIME_NS  MIN_EXECUTION_TIME_NS  MAX_EXECUTION_TIME_NS
+            # KERNEL_SOURCE_FILE  KERNEL_ISOLATION_FILE  KERNEL_HASH_LOGS_FILE
+            # VENDOR_NAME  DEVICE_NAME  
+            # WORK_SIZE  WORK_LOAD  
+            # GLOBAL_WORK_SIZE  LOCAL_WORK_SIZE 
+            # EXTRA_PARAMETERS  EXTRA_KWDS_HASH  SRC_HASH
+            command = [str(autotuner_config.postprocess_kernels), 
+                       str(file_basename),
+                       '1' if from_cache else '0',
+                       str(autotuner_config.dump_folder), 
+                       str(autotuner.name),
+                       str(kernel_name), 
+                       str(kernel_statistics.mean), 
+                       str(kernel_statistics.min), 
+                       str(kernel_statistics.max), 
+                       str(kernel_source), 
+                       str(kernel_isolation), 
+                       str(kernel_hash_logs),
+                       str(kernel._kernel.context.devices[0].platform.name.strip()),
+                       str(kernel._kernel.context.devices[0].name.strip()),
+                       str(work_size),
+                       str(work_load), 
+                       str(global_work_size),
+                       str(local_work_size), 
+                       str(extra_parameters),
+                       str(extra_kwds_hash), 
+                       str(src_hash)]
+            if autotuner_config.debug:
+                print('POSTPROCESSING KERNEL {}:\n'.format(autotuner.name) + ' '.join(command))
+            try:
+                subprocess.check_call(command)
+            except:
+                msg='\nFATAL ERROR: Failed to call autotuner postprocessing command.\n{}\n'
+                msg=msg.format(' '.join(command))
+                print(msg)
+                raise
 
         args_dict = extra_kwds['kernel_args']
         return (kernel, args_dict)
@@ -135,8 +185,8 @@ class OpenClAutotunableKernel(AutotunableKernel):
 
         # dump the best kernel
         dump_folder = self.autotuner_config.dump_folder
-        dump_file=dump_folder+'/rk{}_{}.cl'.format(
-            main_rank, kernel_name.replace(' ', '_'))
+        dump_file=dump_folder+'/{}__{}.cl'.format(
+            kernel_name.replace(' ', '_'), main_rank)
         if not os.path.exists(dump_folder) and (main_rank == 0):
             os.makedirs(dump_folder)
         with open(dump_file, 'w+') as f:
@@ -144,6 +194,22 @@ class OpenClAutotunableKernel(AutotunableKernel):
                 print '  >Saving OpenCL kernel source to \'{}\'.'.format(dump_file)
             f.write(kernel_src)
         return dump_file
+    
+    def generate_hash_logs(self, kernel_name, hash_logs, force=False):
+        if (not force) and (not self.autotuner_config.dump_hash_logs):
+            return None
+
+        # dump the best kernel hash logs
+        dump_folder = self.autotuner_config.dump_folder
+        dump_file=dump_folder+'/{}__{}_hash_logs.txt'.format(
+            kernel_name.replace(' ', '_'), main_rank)
+        if not os.path.exists(dump_folder) and (main_rank == 0):
+            os.makedirs(dump_folder)
+        with open(dump_file, 'w+') as f:
+            if self.autotuner_config.verbose:
+                print '  >Saving hash logs to \'{}\'.'.format(dump_file)
+            f.write(hash_logs)
+        return dump_file
 
     def generate_oclgrind_isolation_file(self, kernel, kernel_name, kernel_source,
             global_work_size, local_work_size,
@@ -160,8 +226,8 @@ class OpenClAutotunableKernel(AutotunableKernel):
         assert len(sorted_args) == len(args_list)
 
         dump_folder = self.autotuner_config.dump_folder
-        dump_file=dump_folder+'/rk{}_{}.sim'.format(
-            main_rank, kernel_name.replace(' ', '_'))
+        dump_file=dump_folder+'/{}__{}.sim'.format(
+            kernel_name.replace(' ', '_'), main_rank)
         with open(dump_file, 'w+') as f:
             msg ='# Isolation configuration file for kernel {}.'.format(kernel_name)
             msg+='\n# See https://github.com/jrprice/Oclgrind/wiki/Running-Kernels-in-Isolation '
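
The post-processing hook above passes every field to the configured command as
one positional string argument and treats any non-zero exit status as fatal. A
trimmed stand-in using /bin/echo so that it actually runs (a real invocation
carries the 20 arguments documented in the --autotuner-postprocess-kernels help):

    import subprocess

    # stand-in for [postprocess_kernels, file_basename, from_cache, dump_folder, ...]
    command = ['/bin/echo', 'demo_kernel_ab12', '0', '/tmp/hysop_dump']
    try:
        subprocess.check_call(command)  # raises CalledProcessError on non-zero exit
    except subprocess.CalledProcessError:
        raise                           # mirrors the fatal-error path above
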
diff --git a/hysop/tools/io_utils.py b/hysop/tools/io_utils.py
index 76c56c5a82bc48acff0a1a474b22f8e1c3d5ab7c..34c85ec3eed063df4b8bd8d35b942a9972df607a 100755
--- a/hysop/tools/io_utils.py
+++ b/hysop/tools/io_utils.py
@@ -8,7 +8,7 @@
 * :class:`~XMF`, tools to prepare/write xmf files.
 
 """
-import os, h5py, psutil, warnings, tempfile, socket
+import os, h5py, psutil, warnings, tempfile, socket, shutil, atexit
 import numpy as np
 import subprocess32 as subprocess
 from collections import namedtuple
@@ -29,6 +29,7 @@ class IO(object):
 
     _default_path = None
     _cache_path = None
+    _tmp_dirs = {}
 
     HDF5 = 998
     """HDF5 format id"""
@@ -127,6 +128,23 @@ class IO(object):
             IO.set_cache_path(IO.default_cache_path())
         return IO._cache_path
 
+    @classmethod
+    def get_tmp_dir(cls, key):
+        """
+        Create or get an existing temporary directory.
+        """
+        if (key in cls._tmp_dirs):
+            tmp_dir = cls._tmp_dirs[key]
+        else:
+            tmp_dir = tempfile.mkdtemp()
+            cls._tmp_dirs[key] = tmp_dir
+        return tmp_dir
+
+    @classmethod
+    def _remove_tmp_dirs(cls):
+        for f in cls._tmp_dirs.values():
+            shutil.rmtree(f, ignore_errors=True, onerror=None)
+
     @classmethod
     def set_cache_path(cls, path):
         if cls.is_shared_fs(path):
@@ -533,3 +551,6 @@ class XMF(object):
             xml_grid += "    </Attribute>\n"
         xml_grid += "   </Grid>\n"
         return xml_grid
+
+atexit.register(IO._remove_tmp_dirs)
+
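
get_tmp_dir and _remove_tmp_dirs above implement a per-key temporary directory
registry cleaned up at interpreter exit; this is what backs the autotuner's
use_tmp_cache when --autotuner-cache-override is set. A minimal standalone
sketch of the same pattern:

    import atexit, shutil, tempfile

    _tmp_dirs = {}

    def get_tmp_dir(key):
        if key not in _tmp_dirs:
            _tmp_dirs[key] = tempfile.mkdtemp()  # created once per key, then reused
        return _tmp_dirs[key]

    def _remove_tmp_dirs():
        for d in _tmp_dirs.values():
            shutil.rmtree(d, ignore_errors=True)  # best-effort cleanup at exit

    atexit.register(_remove_tmp_dirs)
    assert get_tmp_dir('kernel_autotuner') == get_tmp_dir('kernel_autotuner')
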
diff --git a/hysop/tools/postprocess_kernel.sh b/hysop/tools/postprocess_kernel.sh
new file mode 100755
index 0000000000000000000000000000000000000000..61ed13cc18646b199802a3cf7c6aaf9fd46cd835
--- /dev/null
+++ b/hysop/tools/postprocess_kernel.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Example of an autotuner post-processing script.
+# Input arguments are:
+#    FILE_BASENAME  FROM_CACHE
+#    AUTOTUNER_DUMP_DIR  AUTOTUNER_NAME  KERNEL_NAME
+#    MEAN_EXECUTION_TIME_NS  MIN_EXECUTION_TIME_NS  MAX_EXECUTION_TIME_NS
+#    KERNEL_SOURCE_FILE  KERNEL_ISOLATION_FILE  KERNEL_HASH_LOGS_FILE
+#    VENDOR_NAME  DEVICE_NAME  
+#    WORK_SIZE  WORK_LOAD  
+#    GLOBAL_WORK_SIZE  LOCAL_WORK_SIZE 
+#    EXTRA_PARAMETERS  EXTRA_KWDS_HASH  SRC_HASH
+# See the examples/example_utils.py argument parser and its '--autotuner-postprocess-kernels' argument.
+
+set -e
+if [ "$#" -ne 20 ]; then 
+    echo "Script expected 20 parameters."
+    exit 1
+fi
+
+FILE_BASENAME=${1}
+FROM_CACHE=${2}
+AUTOTUNER_DUMP_DIR=${3}
+AUTOTUNER_NAME=${4}
+KERNEL_NAME=${5}
+MEAN_EXECUTION_TIME_NS=${6}
+MIN_EXECUTION_TIME_NS=${7}
+MAX_EXECUTION_TIME_NS=${8}
+KERNEL_SOURCE_FILE=${9}
+KERNEL_ISOLATION_FILE=${10}
+KERNEL_HASH_LOGS_FILE=${11}
+VENDOR_NAME=${12}
+DEVICE_NAME=${13}
+WORK_SIZE=${14}
+WORK_LOAD=${15}
+GLOBAL_WORK_SIZE=${16}
+LOCAL_WORK_SIZE=${17}
+EXTRA_PARAMETERS=${18}
+EXTRA_KWDS_HASH=${19}
+SRC_HASH=${20}
+
+echo "Successfully postprocessed kernel '$AUTOTUNER_NAME'."
+exit 0
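
A hypothetical end-to-end invocation tying the new options together (the driver
script name is illustrative; the flags are the ones added in
examples/example_utils.py above):

    python examples/my_example.py --autotuner-dump-kernels --autotuner-dump-hash-logs --autotuner-postprocess-kernels hysop/tools/postprocess_kernel.sh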