From 64899172d3b8e725ca3eaf827dd6142084d4de3d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Keck <Jean-Baptiste.Keck@imag.fr>
Date: Tue, 19 Sep 2017 17:01:32 +0200
Subject: [PATCH] working inplace and out of place transpose operators

---
 hysop/__init__.py                             |  1 -
 .../device/codegen/kernels/transpose.py       | 83 ++++------------
 hysop/backend/device/kernel_autotuner.py      | 11 ++-
 .../backend/device/kernel_autotuner_config.py |  1 +
 .../opencl/autotunable_kernels/transpose.py   | 89 +++++++++--------
 .../opencl/opencl_autotunable_kernel.py       | 22 ++++-
 hysop/backend/device/opencl/opencl_copy.py    | 99 ++++++++++++++-----
 hysop/backend/device/opencl/opencl_kernel.py  |  8 ++
 .../opencl/opencl_kernel_autotuner_config.py  |  4 +-
 .../device/opencl/opencl_kernel_launcher.py   | 71 ++++++++++---
 .../device/opencl/operator/transpose.py       | 22 ++---
 hysop/core/arrays/array_backend.py            |  4 +
 hysop/core/graph/computational_operator.py    |  1 +
 hysop/core/graph/node_generator.py            |  4 +-
 hysop/fields/cartesian_discrete_field.py      |  6 +-
 hysop/operator/base/transpose_operator.py     |  2 +
 hysop/operator/tests/test_transpose.py        | 73 ++++++++------
 hysop/operator/transpose.py                   |  5 +-
 hysop/tools/io_utils.py                       | 36 +------
 19 files changed, 310 insertions(+), 232 deletions(-)

diff --git a/hysop/__init__.py b/hysop/__init__.py
index bfab7f120..3afa33583 100644
--- a/hysop/__init__.py
+++ b/hysop/__init__.py
@@ -28,7 +28,6 @@ __ENABLE_LONG_TESTS__ = False
 __DEFAULT_PLATFORM_ID__ = 1
 __DEFAULT_DEVICE_ID__   = 0
 
-
 if __MPI_ENABLED__:
     from hysop.core.mpi import MPI, main_rank, main_size, \
                                host_rank, interhost_size, \
diff --git a/hysop/backend/device/codegen/kernels/transpose.py b/hysop/backend/device/codegen/kernels/transpose.py
index d4199d1a8..05f8df051 100644
--- a/hysop/backend/device/codegen/kernels/transpose.py
+++ b/hysop/backend/device/codegen/kernels/transpose.py
@@ -37,8 +37,8 @@ class TransposeKernelGenerator(KernelCodeGenerator):
         pdim = len(axes)
         assert pdim>=2
         assert set(axes)==set(range(pdim))
-        last_axe_permuted = (axes[-1] != (pdim-1))
-        if last_axe_permuted:
+        contiguous_permutation = (axes[-1] != (pdim-1))
+        if contiguous_permutation:
             tile_indexes = (pdim-1, axes[-1])
         else:
             tile_indexes = (pdim-1,)
@@ -59,7 +59,7 @@ class TransposeKernelGenerator(KernelCodeGenerator):
             else:
                 continue
             j+=1
-        return (last_axe_permuted, wdim, work_shape, tile_indexes)
+        return (contiguous_permutation, wdim, work_shape, tile_indexes)
     
     @classmethod
     def max_local_worksize(cls, shape, work_dim, tile_size, vectorization, axes):
@@ -91,15 +91,10 @@ class TransposeKernelGenerator(KernelCodeGenerator):
         return max_local_worksize
 
     @classmethod
-    def shape_to_worksize(cls, shape, tile_size, 
-            vectorization, axes, local_work_size, 
-            return_tile_indexes=False, convert_to_numpy_axes=False):
-        if convert_to_numpy_axes:
-            # numpy axes are fortan contiguous (ie. axe 0 is the one with the greatest stride)
-            axes = np.asarray(axes)
-            axes = (axes.size - axes - 1)[::-1]
-        nt = tile_size * vectorization
-        
+    def compute_global_size(cls, shape, tile_size, 
+            vectorization, axes, 
+            local_work_size, work_load):
+
         pdim = len(axes)
         contiguous_permutation = (axes[-1] != (pdim-1))
         if contiguous_permutation:
@@ -112,58 +107,25 @@ class TransposeKernelGenerator(KernelCodeGenerator):
         assert wdim <= pdim, 'workdim to big.'
         assert wdim >= (1 + int(contiguous_permutation)), 'workdim to small.'
 
-        work_size = np.empty(shape=(wdim,), dtype=np.int32)
+        ngroups = np.empty(shape=(wdim,), dtype=np.int32)
+        vts = tile_size * vectorization
+        ts  = tile_size
         j=0
         for i,Si in enumerate(shape):
             if i==0:
-                work_size[j]  = (Si+tile_size*vectorization-1)/(tile_size*vectorization)
-                work_size[j] *=local_work_size[j]
+                wl = work_load[j]
+                ngroups[j]  = (Si+vts*wl-1)/(vts*wl)
             elif i in tile_indexes:
-                work_size[j] = (Si+tile_size-1)/tile_size * local_work_size[j]
+                wl = work_load[j]
+                ngroups[j] = ((Si+ts*wl-1)/(ts*wl))
             elif i < (wdim - int(contiguous_permutation and tile_indexes[1]>wdim-1)):
-                work_size[j] = Si
+                wl = work_load[j]
+                ngroups[j] = (Si+wl-1)/wl
             else:
                 continue
             j+=1
         assert j==wdim, '{} != {}'.format(j, wdim)
-
-        if return_tile_indexes:
-            return (tile_indexes, work_size)
-        else:
-            return work_size
-    
-    @classmethod
-    def get_max_global_size(cls, work_size, work_load, **kargs):
-        """
-        Return global_work_size from effective work_size without
-        taking into account local_work_size alignment
-        """
-        
-        work_size = np.asarray(work_size).copy()
-        work_load = np.asarray(work_load).copy()
-        global_size = ((work_size+work_load-1)/work_load)
-            
-        return global_size
-
-    def get_global_size(self, work_size, local_work_size, work_load=None):
-        """
-        Return global_work_size from effective work_size and given local_work_size
-        global_work_size will be a multiple of local_work_size
-        """
-        work_dim        = self.work_dim
-        work_load       = [1]*work_dim if (work_load is None) else work_load
-
-        work_size       = np.asarray(work_size)
-        work_load       = np.asarray(work_load)
-        local_work_size = np.asarray(local_work_size)
-
-        if 'local_size' in self.known_vars:
-            assert (self.known_vars['local_size'] == local_work_size[:work_dim]).all(),\
-                    'local_work_size mismatch!'
-
-        max_global_size = self.get_max_global_size(work_size, work_load)
-        global_size = ((max_global_size+local_work_size-1)/local_work_size) * local_work_size
-
+        global_size = ngroups * local_work_size
         return global_size
     
     def required_workgroup_cache_size(self):
@@ -192,7 +154,6 @@ class TransposeKernelGenerator(KernelCodeGenerator):
     def __init__(self, typegen, ctype, vectorization,
             axes, tile_size, tile_padding,
             use_diagonal_coordinates = True,
-            convert_to_numpy_axes = False,
             is_inplace = False,
             known_vars = None,
             debug_mode = False,
@@ -201,13 +162,10 @@ class TransposeKernelGenerator(KernelCodeGenerator):
         axes = np.asarray(axes)
         pdim = axes.size
         Pdim = upper_pow2_or_3(pdim)
-        
-        # numpy axes are fortan contiguous (ie. axe 0 is the one with the greatest stride)
-        if convert_to_numpy_axes:
-            axes = (pdim - axes - 1)[::-1]
         assert pdim <= 16, 'Maximal permutation dimension is 16.'
         assert Pdim in [1,2,3,4,8,16]
         assert vectorization in [1,2,4,8,16]
+        assert tile_padding >= 0
         
         # check permutation axes
         msg='Invalid permutation {} for dimension {}.'
@@ -250,8 +208,9 @@ class TransposeKernelGenerator(KernelCodeGenerator):
         tile_index_to_id = dict( (j,i) for (i,j) in enumerate(tile_indexes) )
         
         device = typegen.device
-        if device.max_work_item_dimensions < tdim:
-            msg='OpenCL device {} does not support {} working dimensions required to transpose whith axes {}.'
+        if (device.max_work_item_dimensions < tdim):
+            msg='OpenCL device {} does not support {} working dimensions required '
+            msg+='to transpose whith axes {}.'
             msg=msg.format(device.name, tdim, axes)
         work_dim = min(pdim, device.max_work_item_dimensions)
         
diff --git a/hysop/backend/device/kernel_autotuner.py b/hysop/backend/device/kernel_autotuner.py
index 9151b265e..a8323ea28 100644
--- a/hysop/backend/device/kernel_autotuner.py
+++ b/hysop/backend/device/kernel_autotuner.py
@@ -48,7 +48,6 @@ class KernelAutotuner(object):
         self.build_opts = tunable_kernel.build_opts
 
         self.indent = lambda i: '  '*i
-        self.autotuner_config.verbose = 1
         self.verbose = self.autotuner_config.verbose
         
         #self._init_and_load_cache()
@@ -148,7 +147,8 @@ class KernelAutotuner(object):
                                         tuple(work_load), 
                                         tuple(global_work_size), 
                                         tuple(local_work_size), 
-                                        prg, kernel, statistics, src_hash)
+                                        prg, kernel, statistics, 
+                                        kernel_src, src_hash)
                                 kept_count += 1
                         except KernelGenerationError as e:
                             if verbose>1:
@@ -179,7 +179,7 @@ class KernelAutotuner(object):
                 self._print_step(step_count, '{} BEST'.format(len(candidates)), nruns)
                 for (run_key, run_params) in candidates:
                     (extra_params, work_load, global_work_size, local_work_size, 
-                            prg, kernel, old_stats, src_hash) = run_params
+                            _, kernel, old_stats, _, _) = run_params
                             
                     self.bench_one_from_binary(kernel=kernel,
                                              target_nruns=nruns, 
@@ -195,7 +195,8 @@ class KernelAutotuner(object):
         self._print_footer(ellapsed=timer.interval, best_candidate=best_candidate)
         
         result_keys = ('extra_parameters', 'work_load', 'global_work_size', 'local_work_size', 
-                    'program', 'kernel', 'kernel_statistics', 'src_hash')
+                    'program', 'kernel', 'kernel_statistics', 'kernel_src', 'src_hash')
+        assert len(result_keys) == len(best_candidate[1])
         return dict(zip(result_keys, best_candidate[1]))
 
 
@@ -260,7 +261,7 @@ class KernelAutotuner(object):
     def _print_footer(self, ellapsed, best_candidate):
         if self.verbose:
             (best_extra_params, best_work_load, best_global_size, best_local_size, 
-                    _, _, best_stats, _) = best_candidate[1]
+                    _, _, best_stats, _, _) = best_candidate[1]
             if ellapsed is not None:
                 self._print_separator()
                 msg='\n|| AUTOTUNING SUCCESSFULLY FINISHED IN {}.'
diff --git a/hysop/backend/device/kernel_autotuner_config.py b/hysop/backend/device/kernel_autotuner_config.py
index 1bd102af8..00179a6f5 100644
--- a/hysop/backend/device/kernel_autotuner_config.py
+++ b/hysop/backend/device/kernel_autotuner_config.py
@@ -44,6 +44,7 @@ class KernelAutotunerConfig(object):
         self.debug   = debug
         self.override_cache  = override_cache
         self.nruns = nruns
+        self.dump_folder = dump_folder
 
     @abstractmethod
     def default_dump_folder(self):
diff --git a/hysop/backend/device/opencl/autotunable_kernels/transpose.py b/hysop/backend/device/opencl/autotunable_kernels/transpose.py
index 9121b87f5..88ba45921 100644
--- a/hysop/backend/device/opencl/autotunable_kernels/transpose.py
+++ b/hysop/backend/device/opencl/autotunable_kernels/transpose.py
@@ -1,8 +1,9 @@
 
 from hysop.tools.numpywrappers import npw
 from hysop.tools.types import check_instance
-from hysop.tools.misc import upper_pow2
+from hysop.tools.misc import upper_pow2, previous_pow2
 from hysop.tools.units import bytes2str
+from hysop.constants import AutotunerFlags
 from hysop.backend.device.opencl import cl, clTools
 from hysop.backend.device.opencl.opencl_autotunable_kernel import OpenClAutotunableKernel
 from hysop.backend.device.codegen.kernels.transpose import TransposeKernelGenerator
@@ -43,7 +44,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
 
         check_instance(axes, tuple, values=int)
         check_instance(is_inplace, bool)
-        self._check_cartesian_fields(input_field, output_field, check_res=True)
+        self._check_cartesian_fields(input_field, output_field, 
+                check_res=False, check_size=True)
             
         dim   = input_field.domain.dim
         dtype = input_field.dtype
@@ -57,13 +59,18 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
         assert axes != tuple(range(dim))
         
         # check if is_inplace is allowed
-        assert (is_inplace == (input_field == output_field))
+        assert (is_inplace == (input_field.dfield == output_field.dfield))
         if is_inplace:
             #Only 2D square matrix inplace transposition is supported
-            compute_inplace  = (self.dim == 2)
+            compute_inplace  = (dim == 2)
             compute_inplace &= all(shape[0]==shape)
         else:
             compute_inplace = False
+        
+        if compute_inplace:
+            kernel_args = (input_field(0).data,)
+        else:
+            kernel_args = (input_field(0).data, output_field(0).data)
 
         if (name is None):
             name = 'transpose_{}_[{}]_{}'.format(ctype,
@@ -75,8 +82,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
         (last_axe_permuted, work_dim, work_shape, tile_indices) = \
             TransposeKernelGenerator.characterize_permutation(shape, axes, 
                     self.max_work_dim())
-
-        kernel_args = (input_field(0).data, output_field(0).data)
+        
         
         # keyword arguments will be agregated into extra_kwds dictionnary
         return super(OpenClAutotunableTransposeKernel, self).autotune(name=name, 
@@ -96,19 +102,39 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
     def compute_parameters(self, extra_kwds): 
         """Register extra parameters to optimize."""
         check_instance(extra_kwds, dict, keys=str)
-        params = super(OpenClAutotunableTransposeKernel, self).compute_parameters(extra_kwds=extra_kwds)
+        params = super(OpenClAutotunableTransposeKernel, self).compute_parameters(
+                extra_kwds=extra_kwds)
 
         ## Register extra parameters   
         # compute max tile fize from device cache
         tile_indices = extra_kwds['tile_indices']
         dtype = extra_kwds['dtype']
         shape = extra_kwds['shape']
+        last_axe_permuted = extra_kwds['last_axe_permuted']
         max_tile_size = self._max_tile_size(shape, dtype, tile_indices)
-
+        
+        flag = self.autotuner_config.autotuner_flag
         vectorization = (1,)
         use_diagonal_coordinates = (False,)
-        tile_padding = (0,)
+        if last_axe_permuted:
+            use_diagonal_coordinates += (True,)
+        tile_padding = (0,1,)
+
         tile_sizes = (max_tile_size,)
+        tile_size = max_tile_size
+        while tile_size>1:
+            tile_size = previous_pow2(tile_size)
+            tile_sizes += (tile_size,)
+        if flag == AutotunerFlags.ESTIMATE:
+            ntiles = 1
+        elif flag == AutotunerFlags.MEASURE:
+            ntiles = 2
+        elif flag == AutotunerFlags.PATIENT:
+            ntiles = 4
+        elif flag == AutotunerFlags.EXHAUSTIVE:
+            ntiles = len(tile_sizes)
+        ntiles = min(ntiles, len(tile_sizes))
+        tile_sizes = tile_sizes[:ntiles]
         
         params.register_extra_parameter('vectorization', vectorization) 
         params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates)
@@ -117,39 +143,16 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
 
         return params
 
-        
-    def compute_work_bounds(self, extra_parameters, extra_kwds):
-        """
-        Configure workbounds (work_dim, work_size, max_work_load).
-        Return a WorkBoundsConfiguration object.
-        """
-        check_instance(extra_parameters, dict, keys=str)
-        check_instance(extra_kwds, dict, keys=str)
-        
-        tile_indices      = extra_kwds['tile_indices']
-        work_size         = extra_kwds['work_size']
-        last_axe_permuted = extra_kwds['last_axe_permuted']
-        
-        tile_size    = extra_parameters['tile_size']
-        tile_padding = extra_parameters['tile_padding']
-
-        assert npw.all(tile_size <= upper_pow2(work_size[tile_indices]))
-
-        work_bounds = super(OpenClAutotunableTransposeKernel, self).compute_work_bounds(
-                extra_parameters=extra_parameters,
-                extra_kwds=extra_kwds)
-        return work_bounds
-
-
     def compute_work_candidates(self, work_bounds, work_load, extra_parameters, extra_kwds):
         """
-        Configure work (global_size, local_size candidates) given a OpenClWorkBoundsConfiguration
-        object and a work_load.
+        Configure work (global_size, local_size candidates) given an
+        OpenClWorkBoundsConfiguration object and a work_load.
+
         Return a WorkConfiguration object.
         
         Notes
         -----
-        global_work_size can be set to None if it depends on local_work_size and will be set
+        global_work_size can be ignored if it depends on local_work_size and will be set
         in self.compute_global_work_size().
         """
         work = super(OpenClAutotunableTransposeKernel, self).compute_work_candidates(
@@ -171,8 +174,16 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
 
         return work
 
-    #def compute_global_work_size(self, local_work_size, work, extra_parameters, extra_kwds):
-        #return None
+    def compute_global_work_size(self, local_work_size, work, extra_parameters, extra_kwds):
+        shape = extra_kwds['shape']
+        axes = extra_kwds['axes']
+        vectorization = extra_parameters['vectorization']
+        tile_size     = extra_parameters['tile_size']
+
+        gs = TransposeKernelGenerator.compute_global_size(shape=shape, tile_size=tile_size,
+                vectorization=vectorization, axes=axes, local_work_size=local_work_size,
+                work_load=work.work_load)
+        return gs
 
     def generate_kernel_src(self, global_work_size, local_work_size,
         extra_parameters, extra_kwds,
@@ -223,7 +234,7 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
         if extra_kwds['compute_inplace']:
             args_mapping = { 'inout': (0, cl.MemoryObjectHolder) }
         else:
-            args_mapping = { 'input' :  (0, cl.MemoryObjectHolder), 
+            args_mapping = { 'input' : (0, cl.MemoryObjectHolder), 
                              'output': (1, cl.MemoryObjectHolder) }
         return args_mapping
 
diff --git a/hysop/backend/device/opencl/opencl_autotunable_kernel.py b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
index fe9c49eef..37c9ffbc1 100644
--- a/hysop/backend/device/opencl/opencl_autotunable_kernel.py
+++ b/hysop/backend/device/opencl/opencl_autotunable_kernel.py
@@ -1,5 +1,7 @@
 
 from abc import ABCMeta, abstractmethod
+from hysop import __KERNEL_DEBUG__
+from hysop.deps import os
 from hysop.constants import Backend
 from hysop.tools.numpywrappers import npw
 from hysop.tools.types import check_instance, first_not_None
@@ -22,7 +24,7 @@ class OpenClAutotunableKernel(AutotunableKernel):
 
         self.cl_env = cl_env
         self.usable_cache_bytes_per_wg = clCharacterize.usable_local_mem_size(cl_env.device)
-    
+        
     def autotune(self, name, **extra_kwds):
         from hysop.backend.device.opencl.opencl_kernel_autotuner import OpenClKernelAutotuner
         autotuner = OpenClKernelAutotuner(name=name, tunable_kernel=self)
@@ -67,7 +69,7 @@ class OpenClAutotunableKernel(AutotunableKernel):
    
     def format_best_candidate(self, name, extra_kwds, extra_parameters, work_load,
             global_work_size, local_work_size, 
-            program, kernel, kernel_statistics, src_hash):
+            program, kernel, kernel_src, kernel_statistics, src_hash):
         """
         Post treatment callback for autotuner results.
         Transform autotuner results in user friendly kernel wrappers.
@@ -84,9 +86,20 @@ class OpenClAutotunableKernel(AutotunableKernel):
         check_instance(local_work_size, tuple, values=npw.int32)
         check_instance(program, cl.Program)
         check_instance(kernel, cl.Kernel)
+        check_instance(kernel_src, str)
         check_instance(kernel_statistics, OpenClKernelStatistics)
         check_instance(src_hash, str)
 
+        if __KERNEL_DEBUG__:
+            # dump the best kernel
+            dump_folder = self.autotuner_config.dump_folder
+            dump_file=dump_folder+'/'+'{}.cl'.format(name.replace(' ', '_'))
+            if not os.path.exists(dump_folder):
+                os.makedirs(dump_folder)
+            with open(dump_file, 'w+') as f:
+                print '>Saving OpenCL kernel source to \'{}\'.'.format(dump_file)
+                f.write(kernel_src)
+
         args_mapping = self.compute_args_mapping(extra_kwds=extra_kwds, 
                                                  extra_parameters=extra_parameters)
         check_instance(args_mapping, dict, keys=str, values=tuple)
@@ -106,7 +119,10 @@ class OpenClAutotunableKernel(AutotunableKernel):
         return self.cl_env.device.max_work_group_size
     
     def max_work_item_sizes(self):
-        """Maximum number of work-items that can be specified in each dimension of the work-group."""
+        """
+        Maximum number of work-items that can be specified in each dimension 
+        of the work-group.
+        """
         return self.cl_env.device.max_work_item_sizes
 
     @classmethod
diff --git a/hysop/backend/device/opencl/opencl_copy.py b/hysop/backend/device/opencl/opencl_copy.py
index 470efd617..b8feebe56 100644
--- a/hysop/backend/device/opencl/opencl_copy.py
+++ b/hysop/backend/device/opencl/opencl_copy.py
@@ -4,12 +4,12 @@ from hysop.deps import np
 from hysop.tools.decorators import debug
 from hysop.tools.types import check_instance, first_not_None
 from hysop.tools.numpywrappers import npw
-from hysop.backend.device.opencl import cl
+from hysop.backend.device.opencl import cl, clArray
+from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelLauncher
 from hysop.backend.device.opencl.opencl_kernel_statistics import OpenClKernelStatistics
 
 class OpenClCopyKernelLauncher(OpenClKernelLauncher):
     """Interface to non-blocking OpenCL copy kernels."""
-    __metaclass__ = ABCMeta
     
     @debug
     def __init__(self, name, dst, src,
@@ -25,9 +25,11 @@ class OpenClCopyKernelLauncher(OpenClKernelLauncher):
         assert 'default_global_work_size' not in kwds
         assert 'default_local_work_size'  not in kwds
         assert 'is_blocking' not in kwds
+
         enqueue_copy_kwds['dest'] = dst
         enqueue_copy_kwds['src'] = src
-        enqueue_copy_kwds['is_blocking'] = False
+        if isinstance(src, np.ndarray) or isinstance(dst, np.ndarray):
+            enqueue_copy_kwds['is_blocking'] = False
 
         super(OpenClCopyKernelLauncher, self).__init__(name=name,
                 kernel=None, args_list=(), **kwds)
@@ -45,12 +47,14 @@ class OpenClCopyKernelLauncher(OpenClKernelLauncher):
     def __call__(self, queue=None, wait_for=None):
         queue = first_not_None(queue, self._default_queue)
         check_instance(queue, cl.CommandQueue)
-        dprint(self._apply_msg)
         evt = cl.enqueue_copy(queue=queue, **self._enqueue_copy_kwds)
+
+    def global_size_configured(self):
+        return True
     
     enqueue_copy_kwds = property(_get_enqueue_copy_kwds)
 
-class OpenClCopyBuffer(OpenClCopyKernelLauncher):
+class OpenClCopyBufferLauncher(OpenClCopyKernelLauncher):
     """Non-blocking OpenCL copy kernel between host buffers and/or opencl device buffers."""
     def __init__(self, varname, src, dst,
             src_device_offset=None, 
@@ -64,19 +68,19 @@ class OpenClCopyBuffer(OpenClCopyKernelLauncher):
         ----------
         varname: str
             Name of the variable copied for loggin purposes.
-        src: cl.MemoryObject or np.ndarray
+        src: cl.MemoryObjectHolder or np.ndarray
             The source buffer.
-        dst: cl.MemoryObject or np.ndarray
+        dst: cl.MemoryObjectHolder or np.ndarray
             The destination buffer.
         src_device_offset: int, optional
             Offset in the source buffer, only valid if 
-            source buffer is a cl.MemoryObject.
+            source buffer is a cl.MemoryObjectHolder.
         dst_device_offset: int, optional
             Offset in the source buffer, only valid if 
-            source buffer is a cl.MemoryObject.
+            destination buffer is a cl.MemoryObjectHolder.
         byte_count: int
             Byte count to copy if and only if source and destination
-            buffers are cl.MemoryObjects.
+            buffers are cl.MemoryObjectHolders.
         
         Notes
         -----
@@ -90,39 +94,88 @@ class OpenClCopyBuffer(OpenClCopyKernelLauncher):
         Device buffers cannot have views like np.ndarrays, an offset in bytes can
         be given as src_device_offset or dst_device_offset instead.
         """
-        check_instance(src, (cl.MemoryObject, np.ndarray))
-        check_instance(dst, (cl.MemoryObject, np.ndarray))
+        check_instance(src, (cl.MemoryObjectHolder, np.ndarray))
+        check_instance(dst, (cl.MemoryObjectHolder, np.ndarray))
         check_instance(src_device_offset, (int, np.integer), allow_none=True)
         check_instance(dst_device_offset, (int, np.integer), allow_none=True)
         check_instance(byte_count, (int, np.integer), allow_none=True)
 
+        msg='Host to host copy is not supported.'
+        assert not (isinstance(src, np.ndarray) and isinstance(dst, np.ndarray)), msg 
+
         enqueue_copy_kwds = {}
         if (src_device_offset is not None):
-            assert isinstance(src, cl.MemoryObject)
+            assert isinstance(src, cl.MemoryObjectHolder)
             enqueue_copy_kwds['src_offset'] = src_device_offset
         if (dst_device_offset is not None):
-            assert isinstance(dst, cl.MemoryObject)
+            assert isinstance(dst, cl.MemoryObjectHolder)
             enqueue_copy_kwds['dst_offset'] = dst_device_offset
         if (byte_count is not None):
-            assert isinstance(src, cl.MemoryObject)
-            assert isinstance(dst, cl.MemoryObject)
+            assert isinstance(src, cl.MemoryObjectHolder)
+            assert isinstance(dst, cl.MemoryObjectHolder)
             enqueue_copy_kwds['byte_count'] = byte_count
 
         shape = first_not_None((byte_count,), 
-                getattr(src, shape, None),
-                getattr(dst, shape, None),
+                getattr(src, 'shape', None),
+                getattr(dst, 'shape', None),
                 '...')
         
         assert 'name' not in kwds
         name = 'enqueue_copy_{}__{}_to_{}'.format(varname,
-                'host' is isinstance(src, np.ndarray) else 'device',
-                'host' is isinstance(dst, np.ndarray) else 'device')
+                'host' if isinstance(src, np.ndarray) else 'device',
+                'host' if isinstance(dst, np.ndarray) else 'device')
         apply_msg='{}<<<{}>>>'.format(name, shape)
 
-        super(OpenClCopyHostBuffer, self).__init__(dst=dst, src=src, 
+        super(OpenClCopyBufferLauncher, self).__init__(dst=dst, src=src, 
                 enqueue_copy_kwds=enqueue_copy_kwds, 
                 name=name, apply_msg=apply_msg, **kwds)
 
-class OpenClCopyHost2Device(OpenClCopyKernelLauncher):
-    pass
+    def _format_device_arg(self, arg, arg_offset):
+        from hysop.backend.device.opencl.opencl_array import OpenClArray
+        nbytes=None
+        if isinstance(arg, (OpenClArray, clArray.Array)):
+            arg_offset = first_not_None(arg_offset, 0)
+            arg_offset += arg.offset
+            nbytes = arg.nbytes
+            arg = arg.base_data
+        elif isinstance(arg, cl.MemoryObjectHolder):
+            pass
+        else:
+            msg='Unknown type {} to format device buffer arguments.'
+            msg=msg.format(type(arg))
+            raise TypeError(msg)
+        return (arg, arg_offset, nbytes)
+
+class OpenClCopyHost2DeviceLauncher(OpenClCopyBufferLauncher):
+    """Reduced interface for host to device copy kernels."""
+    def __init__(self, varname, src, dst, dst_device_offset=None):
+        check_instance(src, (np.ndarray,))
+        check_instance(dst, (cl.MemoryObjectHolder,))
+        check_instance(dst_device_offset, (int, np.integer), allow_none=True)
+        super(OpenClCopyHost2DeviceLauncher, self).__init__(varname=varname, src=src,
+                dst=dst, dst_device_offset=dst_device_offset)
+
+class OpenClCopyDevice2HostLauncher(OpenClCopyBufferLauncher):
+    """Reduced interface for device to host copy kernels."""
+    def __init__(self, varname, src, dst, src_device_offset=None):
+        check_instance(src, (cl.MemoryObjectHolder,))
+        check_instance(dst, (np.ndarray,))
+        check_instance(src_device_offset, (int, np.integer), allow_none=True)
+        super(OpenClCopyDevice2HostLauncher, self).__init__(varname=varname, src=src,
+                dst=dst, src_device_offset=src_device_offset)
 
+class OpenClCopyDevice2DeviceLauncher(OpenClCopyBufferLauncher):
+    """Reduced interface for device to device copy kernels."""
+    def __init__(self, varname, src, dst, 
+            src_device_offset=None, dst_device_offset=None, byte_count=None):
+        src, src_device_offset, src_nbytes = self._format_device_arg(src, src_device_offset)
+        dst, dst_device_offset, dst_nbytes = self._format_device_arg(dst, dst_device_offset)
+        byte_count = first_not_None(byte_count, min(src_nbytes, dst_nbytes))
+        check_instance(src, (cl.MemoryObjectHolder,))
+        check_instance(dst, (cl.MemoryObjectHolder,))
+        check_instance(src_device_offset, (int, np.integer), allow_none=True)
+        check_instance(dst_device_offset, (int, np.integer), allow_none=True)
+        check_instance(byte_count, (int, np.integer), allow_none=True)
+        super(OpenClCopyDevice2DeviceLauncher, self).__init__(varname=varname, src=src, dst=dst, 
+                src_device_offset=src_device_offset, dst_device_offset=dst_device_offset,
+                byte_count=byte_count)
diff --git a/hysop/backend/device/opencl/opencl_kernel.py b/hysop/backend/device/opencl/opencl_kernel.py
index c809f7327..1af0e1915 100644
--- a/hysop/backend/device/opencl/opencl_kernel.py
+++ b/hysop/backend/device/opencl/opencl_kernel.py
@@ -100,6 +100,14 @@ class OpenClKernel(object):
     default_global_work_size = property(_get_default_global_work_size)
     default_local_work_size = property(_get_default_local_work_size)
 
+    def build_list_launcher(self, launcher_name=None, *args, **kwds):
+        """
+        Build an OpenClKernelLauncher and return it as an OpenClKernelListLauncher.
+        See self.build_launcher() and OpenClKernelLauncher.as_list_launcher()
+        """
+        launcher_name = first_not_None(launcher_name, self.name)
+        return self.build_launcher(*args, **kwds).as_list_launcher(name=launcher_name)
+
     def build_launcher(self, name=None, name_prefix=None, name_postfix=None,
             queue=None, local_work_size=None, global_work_size=None, **kwds):
         """
diff --git a/hysop/backend/device/opencl/opencl_kernel_autotuner_config.py b/hysop/backend/device/opencl/opencl_kernel_autotuner_config.py
index bd5002aa0..301920627 100644
--- a/hysop/backend/device/opencl/opencl_kernel_autotuner_config.py
+++ b/hysop/backend/device/opencl/opencl_kernel_autotuner_config.py
@@ -1,4 +1,5 @@
 
+from hysop.tools.io_utils import IO
 from hysop.backend.device.kernel_autotuner_config import KernelAutotunerConfig
 from hysop.backend.device.opencl import OPENCL_KERNEL_DUMP_FOLDER
 
@@ -8,4 +9,5 @@ class OpenClKernelAutotunerConfig(KernelAutotunerConfig):
         super(OpenClKernelAutotunerConfig, self).__init__(*args, **kwds) 
     
     def default_dump_folder(self):
-        return OPENCL_KERNEL_DUMP_FOLDER
+        default_path = IO.default_path()
+        return '{}/{}'.format(default_path, OPENCL_KERNEL_DUMP_FOLDER)
diff --git a/hysop/backend/device/opencl/opencl_kernel_launcher.py b/hysop/backend/device/opencl/opencl_kernel_launcher.py
index e509a1b59..291486827 100644
--- a/hysop/backend/device/opencl/opencl_kernel_launcher.py
+++ b/hysop/backend/device/opencl/opencl_kernel_launcher.py
@@ -29,25 +29,66 @@ class OpenClKernelListLauncher(object):
         check_instance(name, str)
         self._name = name
         self._kernels = ()
-        self._apply_msg = 'OpenClKernelListLauncher {}.__apply__()'.format(name)
+        self._apply_msg = '>OpenClKernelListLauncher {}'.format(name)
+
+    def push_copy_host_device(self, varname, src, dst,
+            src_device_offset=None, dst_device_offset=None, byte_count=None):
+        """Create an OpenClCopyBufferLauncher from the arguments, push it, return self."""
+        from hysop.backend.device.opencl.opencl_copy import OpenClCopyBufferLauncher
+        kernel = OpenClCopyBufferLauncher(varname=varname, 
+                src=src, dst=dst, byte_count=byte_count,
+                src_device_offset=src_device_offset, dst_device_offset=dst_device_offset)
+        self.push_kernels(kernel)
+        return self
+
+    def push_copy_host_to_device(self, varname, src, dst, dst_device_offset=None):
+        """Create an OpenClCopyHost2DeviceLauncher from the arguments, push it, return self."""
+        from hysop.backend.device.opencl.opencl_copy import OpenClCopyHost2DeviceLauncher
+        kernel = OpenClCopyHost2DeviceLauncher(varname=varname, src=src, dst=dst, 
+                                            dst_device_offset=dst_device_offset)
+        self.push_kernels(kernel)
+        return self
+
+    def push_copy_device_to_host(self, varname, src, dst, src_device_offset=None):
+        """Create an OpenClCopyDevice2HostLauncher from the arguments, push it, return self."""
+        from hysop.backend.device.opencl.opencl_copy import OpenClCopyDevice2HostLauncher
+        kernel = OpenClCopyDevice2HostLauncher(varname=varname, 
+                src=src, dst=dst, 
+                src_device_offset=src_device_offset)
+        self.push_kernels(kernel)
+        return self
+
+    def push_copy_device_to_device(self, varname, src, dst, 
+            src_device_offset=None, dst_device_offset=None, byte_count=None): 
+        """Create an OpenClCopyDevice2DeviceLauncher from the arguments, push it, return self."""
+        from hysop.backend.device.opencl.opencl_copy import OpenClCopyDevice2DeviceLauncher
+        kernel = OpenClCopyDevice2DeviceLauncher(varname=varname, 
+                src=src, dst=dst, byte_count=byte_count,
+                src_device_offset=src_device_offset, dst_device_offset=dst_device_offset)
+        self.push_kernels(kernel)
+        return self
 
     def push_kernels(self, *kernel_launchers):
         """
         Push OpenClKernelLaunchers into the list.
         None values are ignored for convenience.
         """
-        for kernel in kernels:
+        for kernel in kernel_launchers:
             if (kernel is None):
                 continue
-            if not isinstance(kernel, OpenClKernelLauncher):
-                msg='Expected an OpenClKernelLauncher but got a {}.'
+            if isinstance(kernel, OpenClKernelLauncher):
+                if not kernel.global_size_configured():
+                    msg='OpenClKernelLauncher {} global_work_size has not been configured.'
+                    msg=msg.format(kernel.name)
+                    raise RuntimeError(msg)
+                self._kernels += (kernel,)
+            elif isinstance(kernel, OpenClKernelListLauncher):
+                self._kernels += kernel._kernels
+            else:
+                msg='Expected an OpenClKernelLauncher or a OpenClKernelListLauncher but got a {}.'
                 msg=msg.format(type(kernel))
                 raise TypeError(msg)
-            if not kernel.global_size_configured():
-                msg='OpenClKernelLauncher {} global_work_size has not been configured.'
-                msg=msg.format(kernel.name)
-                raise RuntimeError(msg)
-            self._kernels += (kernel,)
+        return self
 
     def __call__(self, queue, wait_for=None):
         """
@@ -56,10 +97,10 @@ class OpenClKernelListLauncher(object):
         If this OpenClKernelListLauncher is empty, cl.wait_for_events 
         will be called instead.
         """
-        dprint(self._apply_msg.format())
+        dprint(self._apply_msg)
         kernels = self._kernels
         if kernels:
-            evt = kernels[0).__call__(queue=queue, wait_for=wait_for)
+            evt = kernels[0].__call__(queue=queue, wait_for=wait_for)
             for kernel in kernels[1:]:
                 evt = kernel.__call__(queue=queue)
         else:
@@ -139,7 +180,7 @@ class OpenClKernelLauncher(object):
         self._events = ()
         self._kernel_is_shared = kernel_is_shared
         self._kernel_statistics = OpenClKernelStatistics()
-        self._apply_msg = '  {}<<<{}, {}>>>'
+        self._apply_msg = '  {}<<<{}, {}>>>'.format(name, '{}', '{}')
         
     def queue_configured(self):
         """
@@ -155,7 +196,7 @@ class OpenClKernelLauncher(object):
         """
         return (self._default_global_work_size is not None)
 
-    def as_list_launcher(self, name)
+    def as_list_launcher(self, name):
         """Convert a OpenClKernelLauncher to a OpenClKernelListLauncher."""
         llauncher = OpenClKernelListLauncher(name=name)
         llauncher.push_kernels(self)
@@ -216,7 +257,7 @@ class OpenClKernelLauncher(object):
         assert isinstance(queue, cl.CommandQueue)
         assert isinstance(global_work_size, tuple)
         assert isinstance(local_work_size, (tuple, type(None)))
-        
+       
         dprint(self._apply_msg.format(global_work_size, local_work_size))
 
         kernel = self._kernel
@@ -227,7 +268,7 @@ class OpenClKernelLauncher(object):
                 global_work_size=global_work_size, 
                 local_work_size=local_work_size, wait_for=wait_for)
 
-        if (cl.command_queue_properties.PROFILING_ENABLE in queue.properties):
+        if (cl.command_queue_properties.PROFILING_ENABLE & queue.properties):
             self._events.append(evt)
 
         return evt
diff --git a/hysop/backend/device/opencl/operator/transpose.py b/hysop/backend/device/opencl/operator/transpose.py
index e39aac913..ccb2782a8 100644
--- a/hysop/backend/device/opencl/operator/transpose.py
+++ b/hysop/backend/device/opencl/operator/transpose.py
@@ -39,18 +39,20 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator):
                 is_inplace=is_inplace, input_field=input_field, output_field=output_field)
         
         kernel_launchers=()
-        for i in xrange(input_field.nb_components):
+        for i in xrange(self.nb_components):
             if compute_inplace:
                 launcher = transpose.build_launcher(inout=input_field[i].data)
             elif is_inplace:
-                launcher = transpose.build_launcher(input=input_field[i].data, output=self.dtmp.data)
-                launcher = launcher.as_list_launcher(name='transpose_copy_{}{}'.format(input_field.name, i))
-                launcher.enqueue_copy(dst='output', src='input')
+                launcher = transpose.build_list_launcher(input=input_field[i].data, 
+                                                        output=self.dtmp.data)
+                launcher.push_copy_device_to_device(varname='tmp', src=self.dtmp, 
+                                                                   dst=input_field[i])
             else:
-                launcher = transpose.build_launcher(input=input_field[i].data, output=output_field[i].data)
+                launcher = transpose.build_launcher(input=input_field[i].data, 
+                                                    output=output_field[i].data)
             kernel_launchers += (launcher,)
 
-        self._transpose_kernel_launchers = kernel_launchers
+        self._kernel_launchers = kernel_launchers
     
     def enqueue_copy_kernel(self, _dst, _src, queue):
         pass
@@ -60,11 +62,9 @@ class OpenClTranspose(TransposeOperatorBase, OpenClOperator):
         super(OpenClTranspose,self).apply(**kwds)
 
         queue = self.cl_env.default_queue
-        compute_inplace = self.compute_inplace
-        is_inplace = self.is_inplace
-
-        kernel_launchers = self.transpose_kernel_launchers
-        for i in range(din.nb_components): 
+        
+        kernel_launchers = self._kernel_launchers
+        for i in range(self.nb_components): 
             kernel = kernel_launchers[i]
             evt = kernel(queue=queue)
         
diff --git a/hysop/core/arrays/array_backend.py b/hysop/core/arrays/array_backend.py
index 587943df2..eabbac65f 100644
--- a/hysop/core/arrays/array_backend.py
+++ b/hysop/core/arrays/array_backend.py
@@ -486,6 +486,10 @@ Exception was:
                 if isinstance(dst.backend, backend_cls):
                     src = dst.backend.wrap(src)
                     dst.backend.copyto(dst, src, **kargs)
+                elif backend_cls is HostArrayBackend:
+                    host_array_backend = dst.backend.host_array_backend
+                    src = host_array_backend.wrap(src)
+                    host_array_backend.copyto(dst, src,**kargs)
                 else:
                     msg='dst does not match registered backend for type {}.'
                     msg=msg.format(cls)
diff --git a/hysop/core/graph/computational_operator.py b/hysop/core/graph/computational_operator.py
index ed912f73f..9cd6354bf 100644
--- a/hysop/core/graph/computational_operator.py
+++ b/hysop/core/graph/computational_operator.py
@@ -535,6 +535,7 @@ class ComputationalGraphOperator(ComputationalGraphNode):
         from hysop.core.graph.computational_graph import ComputationalGraph
         name = name or '{}_graph'.format(self.name)
         graph = ComputationalGraph(name=name)
+        print self.operators
         graph.push_nodes(self.operators)
         return graph
     
diff --git a/hysop/core/graph/node_generator.py b/hysop/core/graph/node_generator.py
index db6f2d6d5..79b7a06c0 100644
--- a/hysop/core/graph/node_generator.py
+++ b/hysop/core/graph/node_generator.py
@@ -79,6 +79,6 @@ class ComputationalGraphNodeGenerator(object):
         graph.push_nodes(*self.nodes)
         return graph
     
-    def build(self, name=None, **kwds):
+    def build(self, name=None, outputs_are_inputs=False, **kwds):
         """Convert a computational node generator to a graph and prepares it for apply."""
-        return self.to_graph(name=name).build(**kwds)
+        return self.to_graph(name=name).build(outputs_are_inputs=outputs_are_inputs, **kwds)
diff --git a/hysop/fields/cartesian_discrete_field.py b/hysop/fields/cartesian_discrete_field.py
index 9164cebc6..cce095d68 100644
--- a/hysop/fields/cartesian_discrete_field.py
+++ b/hysop/fields/cartesian_discrete_field.py
@@ -291,7 +291,7 @@ CartesianDiscreteFieldView (id={}, tag={})
     def randomize(self, **kwds):
         """Initialize a the with random values."""
         for d in xrange(self.nb_components):
-            self.array_backend.rand(out=self.data[d], **kwds)
+            self.backend.rand(out=self.data[d], **kwds)
     
 
     def copy(self, field_in, **kwds):
@@ -303,8 +303,8 @@ CartesianDiscreteFieldView (id={}, tag={})
             field to be copied
         """
         for d in xrange(self.nb_components):
-            self.array_backend.memcpy(dst=self.data[d], src=field_in[d], **kwds)
-    
+            self.backend.memcpy(dst=self.data[d], src=field_in[d], **kwds)
+   
     def initialize(self, formula, vectorize=False, **kwds):
         """
         Initialize the field components
diff --git a/hysop/operator/base/transpose_operator.py b/hysop/operator/base/transpose_operator.py
index f5ff5ee6a..dde1cae34 100644
--- a/hysop/operator/base/transpose_operator.py
+++ b/hysop/operator/base/transpose_operator.py
@@ -46,6 +46,7 @@ class TransposeOperatorBase(object):
         assert input_field.domain is output_field.domain
 
         dim = input_field.domain.dim
+        nb_components = input_field.nb_components
         assert dim>=2
         assert set(axes)==set(range(dim))
         assert tuple(axes)!=tuple(range(dim))
@@ -58,6 +59,7 @@ class TransposeOperatorBase(object):
 
         self.input_field = input_field
         self.output_field = output_field
+        self.nb_components = nb_components
         self.dim = dim
         self.axes = axes
      
diff --git a/hysop/operator/tests/test_transpose.py b/hysop/operator/tests/test_transpose.py
index 9ab872f6a..1d0c690e9 100644
--- a/hysop/operator/tests/test_transpose.py
+++ b/hysop/operator/tests/test_transpose.py
@@ -6,6 +6,7 @@ from hysop.testsenv import opencl_failed, iter_clenv
 from hysop.tools.contexts import printoptions
 from hysop.tools.numerics import is_fp, is_integer
 from hysop.tools.types import check_instance
+from hysop.tools.io_utils import IO
 from hysop.operator.transpose import Transpose, Implementation
 
 from hysop import Field, Box
@@ -16,17 +17,18 @@ class TestTransposeOperator(object):
     def setup_class(cls, 
             enable_extra_tests=__ENABLE_LONG_TESTS__,
             enable_debug_mode=False):
+
+        IO.set_default_path('/tmp/hysop_tests/test_transpose')
         
         if enable_debug_mode:
-            cls.size_min = 4
+            cls.size_min = 3
             cls.size_max = 5
         else:
             cls.size_min = 2
-            cls.size_max = 32
+            cls.size_max = 16
         
         cls.enable_extra_tests = enable_extra_tests
         cls.enable_debug_mode  = enable_debug_mode
-        
 
     @classmethod
     def teardown_class(cls):
@@ -36,17 +38,13 @@ class TestTransposeOperator(object):
     def _test(self, dim, dtype, is_inplace):
         enable_extra_tests = self.enable_extra_tests
         assert dim > 1
-        if is_inplace:
-            msg='is_inplace transposition has not been implmented yet.'
-            raise ValueError(msg)
 
         nshapes = 9 if enable_extra_tests else 3
 
-        shapes = ((np.random.randint(low=self.size_min, high=self.size_max),)*dim,)
-        shapes += tuple( tuple( np.random.randint(low=self.size_min, 
-                                                  high=self.size_max, size=dim).tolist() )
-                    for i in xrange(nshapes-1) )
-
+        shapes = ((np.random.randint(low=self.size_min, high=self.size_max+1),)*dim,)
+        shapes += tuple(set( tuple( np.random.randint(low=self.size_min, 
+                                                  high=self.size_max+1, size=dim).tolist() )
+                    for i in xrange(nshapes-1) ))
         all_axes = set(it.permutations(range(dim)))
         all_axes.remove(tuple(range(dim)))
         
@@ -78,7 +76,7 @@ class TestTransposeOperator(object):
     def _test_one(self, shape, axes,
             dim, dtype, is_inplace,
             domain, Fin, Fout):
-        
+
         print 'Testing inplace={} dtype={} shape={} axes={}'.format(
                 is_inplace, dtype.__name__, shape, axes)
         if is_inplace:
@@ -95,11 +93,10 @@ class TestTransposeOperator(object):
         # Compute reference solution
         transpose = Transpose(fields=fin, output_fields=fout,
                               variables=variables, axes=axes,
-                              implementation=ref_impl,
-                              name='test_transpose_{}'.format(str(ref_impl))).build()
+                              implementation=ref_impl).build()
         dfin, dfout = transpose.input_discrete_fields[fin], transpose.output_discrete_fields[fout]
         dfin.initialize(self.__field_init, dtype=dtype)
-
+        
         if is_inplace:
             refin = tuple(df.copy() for df in dfin.buffers)
         else:
@@ -108,13 +105,14 @@ class TestTransposeOperator(object):
         transpose.apply()
         
         refout = tuple(df.copy() for df in dfout.buffers)
+
         for in_,out_ in zip(refin, refout):
             assert np.all(out_ == np.transpose(in_, axes=axes))
         
         def iter_impl(impl):
             base_kwds = dict(fields=fin, output_fields=fout, variables=variables,
                              axes=axes, implementation=impl, 
-                             name='test_transpose_{}'.format(str(impl)))
+                             name='test_transpose_{}'.format(str(impl).lower()))
             if impl is ref_impl:
                 return 
             elif impl is Implementation.OPENCL_CODEGEN:
@@ -129,8 +127,8 @@ class TestTransposeOperator(object):
             for op in iter_impl(impl):
                 op = op.build()
                 dfin, dfout = op.input_discrete_fields[fin], op.output_discrete_fields[fout]
-                dfin.initialize(self.__field_init, dtype=dtype)
-                transpose.apply()
+                dfin.copy(refin)
+                op.apply()
                 out = tuple( data.get().handle for data in dfout.data )
                 self._check_output(impl, op, refin, refout, out)
     
@@ -171,29 +169,44 @@ class TestTransposeOperator(object):
             raise RuntimeError(msg) 
 
 
-    
     def test_2d_int_out_of_place(self):
         self._test(dim=2, dtype=np.int32, is_inplace=False)
-    def test_2d_uint_out_of_place(self):
-        self._test(dim=2, dtype=np.uint32, is_inplace=False)
     def test_2d_float_out_of_place(self):
         self._test(dim=2, dtype=np.float32, is_inplace=False)
-
     def test_3d_int_out_of_place(self):
         self._test(dim=3, dtype=np.int32, is_inplace=False)
-    def test_3d_uint_out_of_place(self):
-        self._test(dim=3, dtype=np.uint32, is_inplace=False)
     def test_3d_float_out_of_place(self):
         self._test(dim=3, dtype=np.float32, is_inplace=False)
+    def test_4d_int_out_of_place(self):
+        self._test(dim=4, dtype=np.int32, is_inplace=False)
+    
+    def test_2d_int_inplace(self):
+        self._test(dim=2, dtype=np.int32, is_inplace=True)
+    def test_2d_float_inplace(self):
+        self._test(dim=2, dtype=np.float32, is_inplace=True)
+    def test_3d_int_inplace(self):
+        self._test(dim=3, dtype=np.int32, is_inplace=True)
+    def test_3d_float_inplace(self):
+        self._test(dim=3, dtype=np.float32, is_inplace=True)
+    def test_4d_int_inplace(self):  # 4D int32 transpose through the in-place path
+        self._test(dim=4, dtype=np.int32, is_inplace=True)
 
     def perform_tests(self):
-        self.test_2d_int_out_of_place()
-        self.test_2d_uint_out_of_place()
-        self.test_2d_float_out_of_place()
+        # self.test_2d_int_out_of_place()
+        # self.test_2d_float_out_of_place()
+
+        # self.test_3d_int_out_of_place()
+        # self.test_3d_float_out_of_place()
+
+        # self.test_4d_int_out_of_place()
+        
+        self.test_2d_int_inplace()
+        self.test_2d_float_inplace()
+
+        self.test_3d_int_inplace()
+        self.test_3d_float_inplace()
 
-        self.test_3d_int_out_of_place()
-        self.test_3d_uint_out_of_place()
-        self.test_3d_float_out_of_place()
+        self.test_4d_int_inplace()
     
 if __name__ == '__main__':
     TestTransposeOperator.setup_class(enable_extra_tests=False, 
diff --git a/hysop/operator/transpose.py b/hysop/operator/transpose.py
index badcaa4e2..74c98b93b 100644
--- a/hysop/operator/transpose.py
+++ b/hysop/operator/transpose.py
@@ -77,7 +77,7 @@ class Transpose(ComputationalGraphNodeGenerator):
             Input and output are matched by order int list/tuple.
         variables: dict
             Dictionary of fields as keys and CartesianTopologyDescriptors as values.
-        axes: tuple of ints, of array like of tuple of ints, or dictionnary of (tuple of ints -> TranspositionState).
+        axes: tuple of ints, or array like of tuples, or dict of (tuple, TranspositionState).
             Permutation of axes in numpy notations (as a tuple of ints).
             Axe dim-1 is the contiguous axe, axe 0 has the greatest stride in memory.
             
@@ -114,7 +114,8 @@ class Transpose(ComputationalGraphNodeGenerator):
         Out of place transpose will always be faster to process.
         The only exception to this rule may be 2D square matrices.
 
-        Component-wise transpose is *not* yet supported in Fields and will raise directly in frontend.
+        Component-wise transpose is *not* yet supported in Fields and will 
+        raise directly in frontend.
 
         Inplace transposition may request a temporary buffer because not all implementations
         may support inplace transposition.
diff --git a/hysop/tools/io_utils.py b/hysop/tools/io_utils.py
index 3211a174b..a74aaafa7 100755
--- a/hysop/tools/io_utils.py
+++ b/hysop/tools/io_utils.py
@@ -25,7 +25,6 @@ class IO(object):
     """
 
     _default_path = None
-    
     _default_cache_path = os.path.expanduser('~') + '/.cache/hysop'
     _cache_path = None
 
@@ -69,37 +68,6 @@ class IO(object):
         #ind = -1
         interactive_path = './interactive/p' + str(mpi.main_size)
         interactive_path = os.path.abspath(interactive_path)
-        # --- ipython ---
-        #from hysop.tools.sys_utils import SysUtils
-        # if SysUtils.in_ipython():
-        #     # Note FP: because of set_default_path call
-        #     # in __init__.py, this condition must never happen.
-        #     # But we keep the code below, just in case ...
-
-        #     # list of files (fullpath) which contain the callers
-        #     sublist = [i[1] for i in a]
-        #     # look for ipython in callers ...
-        #     # If found, keep index of the file just before
-        #     # first occurence of ipython, i.e. the name
-        #     # of the 'main' file
-        #     for val in sublist:
-        #         ll = findall('ipython', val, IGNORECASE)
-        #         if len(ll) > 0:
-        #             ind = sublist.index(val) - 1
-        #             break
-
-        #     if ind > -1:
-        #         # -- interactive ipython but call with execfile--
-        #         if len(findall('io_utils', a[ind][1])) > 0:
-        #             return interactive_path
-        #         a = a[ind]
-        #     else:
-        #         # -- interactive ipython without execfile call --
-        #         return interactive_path
-
-        # else:
-        # -- python --
-        # if test session, set default path to interactive_path
         for fname in a:
             cond1 = len(findall('py.test', fname[1])) > 0
             cond2 = len(findall('pytest', fname[1])) > 0
@@ -166,12 +134,10 @@ class IO(object):
         used for the simulation.
 
         """
-        IO._default_path = pathdir
-        IO._default_path = os.path.join(IO._default_path,
+        IO._default_path = os.path.join(pathdir,
                                         'p' + str(mpi.main_size))
         IO.check_dir(IO._default_path)
 
-
     @staticmethod
     def default_cache_path():
         return IO._default_cache_path
-- 
GitLab