diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100755
new mode 100644
index c56fe833eda8e7606a270231bfd898d8a3fabbf1..766202db64021d550b87f1bd93d8d1bd4575a89a
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,7 +175,8 @@ endif()
 if(WITH_FFTW)
     #set(FIND_FFTW_VERBOSE VERBOSE_MODE)
     set(FIND_FFTW_VERBOSE OFF)
-    set(FIND_FFTW_DEBUG OFF)
+    set(FIND_FFTW_DEBUG ON)
+    set(FIND_FFTW_SHARED_ONLY ON)
     compile_with(FFTW 
       REQUIRED COMPONENTS Fftw3f Fftw3d
                           Fftw3f-mpi Fftw3d-mpi
diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake
index 15bca50d3bd1ed1c893974f91bc28dd0da96065f..c6a42e4fb70b5e575bf5f002576a51f1d0f74fcc 100644
--- a/cmake/FindFFTW.cmake
+++ b/cmake/FindFFTW.cmake
@@ -16,6 +16,7 @@
 # == Full example == 
 #  set(FIND_FFTW_VERBOSE ON) (optional)
 #  set(FIND_FFTW_DEBUG OFF)  (optional)
+#  set(FIND_FFTW_SHARED_ONLY ON) (optional)
 #  find_package(FFTW 
 #      REQUIRED COMPONENTS Fftw3f Fftw3d 
 #      OPTIONAL_COMPONENTS Fftw3l Fftw3q 
@@ -55,6 +56,12 @@
 # == Checking against a specific version or the library ==
 #   Not supported yet.
 #
+# == Looking for shared or static libraries only ==
+# Set ${FIND_FFTW_SHARED_ONLY} to ON before find_package call to search for shared libraries only.
+# Set ${FIND_FFTW_STATIC_ONLY} to ON before find_package call to search for static libraries only.
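+# Only one of the two should be set; filtering works by matching full Unix
+# library file names (lib<name>.so / lib<name>.a).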
+#
 # == Debug and verbose mode ==
 # Set ${FIND_FFTW_VERBOSE} to ON before find_package call to enable verbose mode
 # Set ${FIND_FFTW_DEBUG}   to ON before find_package call to enable debug mode
@@ -98,6 +103,13 @@ foreach(fftw_comp ${FFTW_FIND_COMPONENTS})
     # -- find library name given the component name
     string(REPLACE "fftw3d" "fftw3" library "${component}")
 
+    # -- rename library to match shared or static constraints
+    if(FIND_FFTW_SHARED_ONLY)
+        set(library "lib${library}.so")
+    elseif(FIND_FFTW_STATIC_ONLY)
+        set(library "lib${library}.a")
+    endif()
+
     if(FIND_FFTW_DEBUG)
         message("\tFFTW::${fftw_comp}:${COMPONENT}:${component}, LIB=${library} HEADER=${header}")
     endif()
diff --git a/examples/taylor_green/taylor_green_monoscale.py b/examples/taylor_green/taylor_green_monoscale.py
index d599b58deef39647775e230e82295e4627731985..d3ec5259ecb2084d6a69f99b9208cc0ad3933ed1 100644
--- a/examples/taylor_green/taylor_green_monoscale.py
+++ b/examples/taylor_green/taylor_green_monoscale.py
@@ -30,11 +30,11 @@ def init_vorticity(res, x, y, z, t):
     return res
 
 def do_solve(npts, viscosity=1./1600., lcfl=0.125, cfl=0.50):
-    autotuner_config = AutotunerConfig(autotuner_flag=AutotunerFlags.EXHAUSTIVE, 
-       prune_threshold=1.2, override_cache=False, verbose=2)
+    autotuner_config = AutotunerConfig(autotuner_flag=AutotunerFlags.ESTIMATE, 
+       prune_threshold=1.2, override_cache=True, verbose=1)
     kernel_config = KernelConfig(autotuner_config=autotuner_config)
 
-    dim    = 1
+    dim    = 3
 
     box = Box(length=(2*pi,)*dim) 
 
@@ -66,10 +66,10 @@ def do_solve(npts, viscosity=1./1600., lcfl=0.125, cfl=0.50):
             # method = {SpaceDiscretization: SpaceDiscretization.FDC4}
         # )
 
-    # poisson = Poisson(name='poisson', velocity=velo, vorticity=vorti, 
-            # variables={velo:d3d, vorti: d3d})
-    # diffusion = Diffusion(name='diffusion', input_field=vorti, viscosity=viscosity,
-                        # variables={vorti: d3d})
+    poisson = Poisson(name='poisson', velocity=velo, vorticity=vorti, 
+            variables={velo:d3d, vorti: d3d})
+    diffusion = Diffusion(name='diffusion', input_field=vorti, viscosity=viscosity,
+                        variables={vorti: d3d})
         
     splitting = StrangSplitting(splitting_dim=dim, order=StrangOrder.STRANG_SECOND_ORDER,
             method={KernelConfig: kernel_config})
@@ -78,16 +78,16 @@ def do_solve(npts, viscosity=1./1600., lcfl=0.125, cfl=0.50):
         
     problem = Problem()
     problem.insert(splitting)
-    # problem.insert(diffusion)
-    # problem.insert(poisson)
+    problem.insert(diffusion)
+    problem.insert(poisson)
     
     problem.build()
-    # problem.display()
+    problem.display()
 
     problem.solve(simu)
     problem.finalize()
 
 if __name__=='__main__':
-    N = 128*128 + 1
+    N = 128 + 1
     do_solve(N)
 
diff --git a/hysop/backend/device/codegen/base/function_codegen.py b/hysop/backend/device/codegen/base/function_codegen.py
index d9ec1f53718e25747b6059ca0ef7d5b94ca81f6e..cb8c0654dcc7baa32f51856d1e95e619f9d52aaa 100644
--- a/hysop/backend/device/codegen/base/function_codegen.py
+++ b/hysop/backend/device/codegen/base/function_codegen.py
@@ -29,13 +29,16 @@ class FunctionBase(object):
                         pass
                     else:
                         varval.const = True
-                        if symbolic_mode is not None:
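+                        # only force symbolic mode when it is explicitly
+                        # enabled (None or False leaves the variable as-is)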
+                        if symbolic_mode:
                             varval.force_symbolic(symbolic_mode)
                         fargs[varname]=varval 
                 else:
                     fargs[varname].set_value(varval)
                     fargs[varname].const = True
-                    fargs[varname].force_symbolic(symbolic_mode)
+                    if symbolic_mode:
+                        fargs[varname].force_symbolic(symbolic_mode)
         fargs.lock()
 
         self.fname  = fname
diff --git a/hysop/backend/device/codegen/base/utils.py b/hysop/backend/device/codegen/base/utils.py
index 50c08aa80daa0407294a8ea24d42a27d5fae2276..b69bbc9722196a0434475964c8293cd1fad23059 100644
--- a/hysop/backend/device/codegen/base/utils.py
+++ b/hysop/backend/device/codegen/base/utils.py
@@ -71,6 +71,7 @@ class ArgDict(WriteOnceDict):
         i=0
         for varname in self.arg_order:
             var = self[varname]
+            #print var.name, var.symbolic_mode, var.is_symbolic() # debug trace, disabled
             if var.symbolic_mode and var.known():
                 constant_args.append(var)
             elif var.is_symbolic():
diff --git a/hysop/backend/device/codegen/base/variables.py b/hysop/backend/device/codegen/base/variables.py
index 943a08fab9253ea63499512c5dd2c13d46a567af..b8bd145562f9d795fbe9159dade51d465893bd58 100644
--- a/hysop/backend/device/codegen/base/variables.py
+++ b/hysop/backend/device/codegen/base/variables.py
@@ -50,8 +50,10 @@ def dtype_to_ctype(dtype):
         return _dtype_to_ctype[dtype]
 def ctype_to_dtype(ctype):
     if ctype not in _ctype_to_dtype.keys():
-        msg='Unknown ctype {}.'.format(ctype)
-        raise ValueError(msg)
+        # fall back to the OpenCL type mapping for ctypes that are not
+        # registered in _ctype_to_dtype
+        from hysop.backend.device.opencl.opencl_types import np_dtype
+        return np_dtype(ctype)
     else:
         return _ctype_to_dtype[ctype]
 def register_ctype_dtype(ctype,dtype):
@@ -89,6 +91,7 @@ class CodegenVariable(object):
         self.volatile = volatile
         self.add_impl_const = add_impl_const
         self.storage = storage
+        self.declared = False
 
         self.nl = nl if (nl is not None) else (storage is not None)
         self.struct_var = struct_var
@@ -134,7 +137,8 @@
                 dtype = self.typegen.floatn(1)
             else:
                 dtype = ctype_to_dtype(ctype)
-            value = dtype(value)
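+            # cast through a one-element array so 'value' becomes a numpy scalar of 'dtype'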
+            value = np.asarray([value],dtype=dtype)[0]
         elif value.__class__ in [list,tuple]:
             dtype = ctype_to_dtype(ctype)
             try:
@@ -152,7 +155,7 @@ class CodegenVariable(object):
         
         self.value   = value
         self.svalue  = svalue
-        self.init=init
+        self.init    = init
         
         # check
         if add_impl_const:
@@ -382,6 +385,7 @@ class CodegenVariable(object):
             code+=';'
         
         self.force_symbolic() 
+        self.declared = True
         
         if not align:
             code = code.replace('$','')
@@ -561,9 +565,9 @@ class CodegenArray(CodegenVariable):
         self.sshape = sshape
         
     def decl_name(self):
-        if self.shape:
+        if self.shape is not None:
             static_array = ['[{}]'.format(val) for val in self.shape]
-        elif self.shape:
+        elif self.sshape is not None:
             static_array = ['[{}]'.format(val) for val in self.sshape]
         else:
             static_array = []
@@ -673,7 +677,7 @@ class CodegenVectorClBuiltin(CodegenVector):
         if (value is not None):
             dtype = ctype_to_dtype(btype)
             value = np.asarray(value,dtype)
-            svalue = [typegen.dump(dtype(f)) for f in value]
+            svalue = [typegen.dump(np.asarray([f],dtype=dtype)[0]) for f in value]
         
         super(CodegenVectorClBuiltin,self).__init__(name=name,ctype=ctype,dim=dim,typegen=typegen,
                 value=value,svalue=svalue, const=const, add_impl_const=add_impl_const,
@@ -741,16 +745,19 @@ class CodegenVectorClBuiltin(CodegenVector):
         dim = self.dim
         if isinstance(key,slice) :
             ids = range(*key.indices(dim))
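+            # a full slice of an already declared vector is the vector itself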
+            if self.declared and key.indices(dim)==(0,dim,1):
+                return self.name
+            else:
+                return self.__getitem__(ids)
+        elif isinstance(key, (tuple,list)):
             if self.is_symbolic():
-                if key.indices(dim)==(0,dim,1):
-                    return self.name
-                else:
-                    mode   = self.access_mode
-                    access = self.access_prefix(True) + ('s' if mode.lower()=='hex' else '')
-                    access += ''.join([self.typegen.vtype_component_adressing(i,mode) for i in ids])
+                mode   = self.access_mode
+                access = self.access_prefix(True) + ('s' if mode.lower()=='hex' else '')
+                access += ''.join([self.typegen.vtype_component_adressing(i,mode) for i in key])
             else:
-                ctype = self.btype + (str(len(ids)) if len(ids)!=1 else '')
-                value = [self.svalue[i] for i in ids]
+                ctype = self.btype + (str(len(key)) if len(key)!=1 else '')
+                value = [self.svalue[i] for i in key]
                 return '({})({})'.format(ctype, ','.join(value))
             return access
         elif isinstance(key, int) :
@@ -763,6 +769,7 @@ class CodegenVectorClBuiltin(CodegenVector):
             raise TypeError, 'Invalid key type!'
     
     def declare(self, codegen=None, init=None, **kargs):
+        init = init or self.init
         if isinstance(init,int):
             init = ','.join([self.typegen.dump(init) for _ in xrange(self.dim)])
             init = '({})({})'.format(self.ctype,init)
diff --git a/hysop/backend/device/codegen/kernels/tests/test_directional_transpose.py b/hysop/backend/device/codegen/kernels/tests/test_directional_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..643e4a16fb610b4172b42b41c902a1ddb3e8d158
--- /dev/null
+++ b/hysop/backend/device/codegen/kernels/tests/test_directional_transpose.py
@@ -0,0 +1,359 @@
+
+import copy, math
+
+from hysop import __ENABLE_LONG_TESTS__
+from hysop.deps import np
+from hysop.backend.device.opencl import cl, clTools
+from hysop.tools.types import check_instance
+from hysop.backend.device.codegen.base.test import _test_typegen
+# from hysop.backend.device.codegen.kernels.transpose import TransposeKernel
+
+class TestTranspose(object):
+
+    @classmethod
+    def setup_class(cls,
+            enable_extra_tests=__ENABLE_LONG_TESTS__,
+            enable_error_plots=False):
+        
+        typegen = _test_typegen('double','dec')
+        queue = cl.CommandQueue(typegen.context)
+        ctx   = typegen.context
+        
+        # Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size)
+        # Lx = min(Lx, grid_size[0])
+        
+        cls.enable_extra_tests = enable_extra_tests
+        cls.enable_error_plots = enable_error_plots
+
+        cls.size_min = 8
+        cls.size_max = 64
+
+        cls.queue = queue
+        cls.ctx   = ctx
+
+        # cls.local_work_size = np.asarray([Lx,1,1])
+        # cls.work_load       = np.asarray([1,1,1])
+        
+
+    def _alloc_cpu_gpu(self, dtype, dim):
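+        # Allocate matching host and device buffers for a random grid size in
+        # [size_min, size_max], both with and without per-axis ghost layers.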
+
+        ctx   = self.ctx
+        queue = self.queue
+
+        grid_size = np.rint(self.size_min + np.random.rand(dim) * (self.size_max - self.size_min))\
+                      .astype(np.int32)
+        
+        igrid_ghosts = 1 + np.rint(dim*np.random.rand(dim)).astype(np.int32)
+        ogrid_ghosts = 1 + np.rint(dim*np.random.rand(dim)).astype(np.int32)
+
+        igrid_size = grid_size + 2*igrid_ghosts
+        ogrid_size = grid_size + 2*ogrid_ghosts
+        
+        grid_shape  = grid_size[::-1]
+        igrid_shape = igrid_size[::-1]
+        ogrid_shape = ogrid_size[::-1]
+        
+        grid_bytes  = np.prod(grid_size)  * np.dtype(dtype).itemsize
+        igrid_bytes = np.prod(igrid_size) * np.dtype(dtype).itemsize
+        ogrid_bytes = np.prod(ogrid_size) * np.dtype(dtype).itemsize
+        
+        grid_view  = [ slice(0,          grid_size[i] + 0         ) for i in xrange(dim-1,-1,-1) ]
+        igrid_view = [ slice(igrid_ghosts[i], grid_size[i] + igrid_ghosts[i]) for i in xrange(dim-1,-1,-1) ]
+        ogrid_view = [ slice(ogrid_ghosts[i], grid_size[i] + ogrid_ghosts[i]) for i in xrange(dim-1,-1,-1) ]
+        
+        print '::Alloc:: dtype={}  dim={}'.format(dtype, dim)
+        print '  *INPUT:  base={}  ghosts={}  size={}'.format(grid_size, igrid_ghosts, igrid_size)
+        print '  *OUTPUT: base={}  ghosts={}  size={}'.format(grid_size, ogrid_ghosts, ogrid_size)
+        print '  allocating...'
+
+        mf = cl.mem_flags
+
+        host_buffers_init = {
+                'no_ghosts': {
+                    'Tin':         np.random.rand(*grid_shape).astype(dtype),
+                    'Tout': -1   * np.ones(shape=grid_shape, dtype=dtype),
+                    'dbg0': -1   * np.ones(shape=grid_shape, dtype=np.int32),
+                    'dbg1': -1   * np.ones(shape=grid_shape, dtype=np.int32),
+                },
+                'with_ghosts': {
+                    'Tin':         np.random.rand(*igrid_shape).astype(dtype),
+                    'Tout': -1   * np.ones(shape=ogrid_shape, dtype=dtype),
+                    'dbg0': -1   * np.ones(shape=grid_shape, dtype=np.int32),
+                    'dbg1': -1   * np.ones(shape=grid_shape, dtype=np.int32),
+                }
+        }
+
+        device_buffers = {
+                'no_ghosts': {
+                    'Tin':  cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['Tin']),
+                    'Tout': cl.Buffer(ctx, flags=mf.WRITE_ONLY | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['Tout']),
+                    'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['dbg0']),
+                    'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['dbg1']),
+                },
+                'with_ghosts': {
+                    'Tin':  cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['Tin']),
+                    'Tout': cl.Buffer(ctx, flags=mf.WRITE_ONLY | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['Tout']),
+                    'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['dbg0']),
+                    'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['dbg1']),
+                }
+        }
+
+        host_buffers_reference = copy.deepcopy(host_buffers_init)
+        host_buffers_gpu       = copy.deepcopy(host_buffers_init)
+        
+        self.dtype = dtype
+        self.dim   = dim
+
+        self.host_buffers_init      = host_buffers_init
+        self.host_buffers_reference = host_buffers_reference
+        self.host_buffers_gpu       = host_buffers_gpu
+        self.device_buffers         = device_buffers
+
+        self.grid_size  = grid_size
+        self.igrid_size = igrid_size 
+        self.ogrid_size = ogrid_size
+        
+        self.grid_shape  = grid_shape
+        self.igrid_shape = igrid_shape 
+        self.ogrid_shape = ogrid_shape
+
+        self.grid_bytes  = grid_bytes
+        self.igrid_bytes = igrid_bytes 
+        self.ogrid_bytes = ogrid_bytes
+
+        self.grid_view  = grid_view
+        self.igrid_view = igrid_view 
+        self.ogrid_view = ogrid_view
+        
+        self.igrid_ghosts = igrid_ghosts 
+        self.ogrid_ghosts = ogrid_ghosts
+
+    @classmethod
+    def teardown_class(cls):
+        pass
+    
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        pass
+
+    def _do_transpose_cpu(self, is_inplace, axes):
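+        # Build the reference result on the host with numpy.
+        # FIXME (WIP): this requires 'axes' to be a full-rank permutation and
+        # the permuted axes to have equal sizes (buffers share one shape).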
+        
+        out_target = 'Tin' if is_inplace else 'Tout'
+
+        for target in ['no_ghosts', 'with_ghosts']:
+            host_init_buffers      = self.host_buffers_init[target] 
+            host_buffers_reference = self.host_buffers_reference[target]
+            
+            if target == 'no_ghosts':
+                in_view  = self.grid_view
+                out_view = self.grid_view
+            else:
+                in_view  = self.igrid_view 
+                out_view = self.igrid_view if is_inplace else self.ogrid_view
+
+            Tin  = host_init_buffers['Tin']
+            Tout = host_buffers_reference[out_target]
+        
+            print '  axes={}  Tin{} -> Tout{}'.format(axes, Tin[in_view].shape, Tout[out_view].shape)
+            Tout[out_view] = np.transpose(Tin[in_view].copy(), axes=axes)
+    
+    def _do_compute_gpu_and_check(self, dim, axes, is_inplace, vectorization):
+
+        msg = '\nTesting {}d {} transposition with {} axes and {} vectorization.'
+        msg = msg.format(dim, 
+                'inplace' if is_inplace else 'out of place',
+                axes, vectorization)
+        print msg
+
+        return # FIXME (WIP): the GPU path below is dead code adapted from the advection test
+
+        host_init_buffers      = self.host_buffers_init[target] 
+        host_buffers_reference = self.host_buffers_reference[target]
+        host_buffers_gpu       = self.host_buffers_gpu[target]
+        device_buffers         = self.device_buffers[target]
+
+        dak = DirectionalAdvectionKernel(
+            typegen=self.typegen, 
+            ftype=self.typegen.fbtype,
+            work_dim=work_dim, 
+            rk_scheme=rk_scheme,
+            is_cached=cached,
+            vboundary=(boundary, boundary),
+            nparticles = nparticles,
+            min_ghosts=min_ghosts,
+            symbolic_mode=False,
+            debug_mode=True,
+            known_vars=known_vars)
+        # dak.edit()
+        
+        global_work_size = dak.get_global_size(work_size,local_work_size,work_load)
+        (static_shared_bytes, dynamic_shared_bytes, total_bytes) = \
+                dak.required_workgroup_cache_size(local_work_size)
+        
+        variables = ['u','pos']
+        debug     = ['dbg0', 'dbg1']
+        for varname in variables:
+            kernel_args.append(device_buffers[varname])
+            kernel_args.append(np.uint64(0))  
+        for varname in debug:
+            kernel_args.append(device_buffers[varname])
+        if (dynamic_shared_bytes != 0):
+            shared_buffer = cl.LocalMemory(dynamic_shared_bytes)
+            kernel_args.append(shared_buffer)
+    
+        print '\tGenerating and compiling Kernel...'
+        source = dak.__str__()
+        prg = cl.Program(self.typegen.context, source)
+        prg.build(devices=[self.typegen.device])
+        kernel = prg.all_kernels()[0]
+        kernel.set_args(*kernel_args)
+        
+        print '\tCPU => GPU'
+        for buf in variables+debug:
+            src = host_init_buffers[buf]
+            dst = device_buffers[buf]
+            cl.enqueue_copy(queue,dst,src)
+        
+        print '\tKernel execution <<<{},{}>>>'.format(global_work_size,local_work_size)
+        evt = cl.enqueue_nd_range_kernel(queue, kernel, 
+                global_work_size.tolist(), local_work_size.tolist())
+        evt.wait()
+        
+        print '\tGPU => CPU'
+        for buf in variables+debug:
+            src = device_buffers[buf]
+            dst = host_buffers_gpu[buf]
+            cl.enqueue_copy(queue,dst,src)
+
+        print '\tSynchronize queue'
+        queue.flush()
+        queue.finish()
+
+        buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname]) 
+                        for varname in variables]
+        self._cmp_buffers(buffers,view,dak,work_dim)
+    
+    def _cmp_buffers(self,buffers,view,dak,work_dim):
+        good = True
+        err_buffers = []
+
+        for (name,host,dev) in buffers:
+            (l1,l2,linf) = self._distances(host,dev,view)
+            print '\t{} -> l1={}  l2={}  linf={}'.format(name,l1,l2,linf)
+            if linf>1e-12:
+                err_buffers.append(name)
+                good = False
+        if not good:
+            msg = '\n[FAIL] Buffer comparison failed for buffers {}.\n'.format(err_buffers)
+            print msg
+            dak.edit()
+            if self.enable_error_plots:
+                from matplotlib import pyplot as plt
+                for (name,host,dev) in buffers:
+                    if name in err_buffers:
+
+                        host = host[view]
+                        dev  = dev[view]
+
+                        d = (dev-host)*(dev-host)
+                        d -= np.mean(d)
+                    
+                        if work_dim==3:
+                            fig,axes = plt.subplots(2,2)
+                            axes[0][0].imshow(np.sum(d,axis=0),interpolation='nearest')
+                            axes[0][1].imshow(np.sum(d,axis=1),interpolation='nearest')
+                            axes[1][0].imshow(np.sum(d,axis=2),interpolation='nearest')
+                            axes[1][1].imshow(np.sum(d,axis=(0,1))[np.newaxis,:],interpolation='nearest')
+                            plt.title(name)
+                            fig.show()
+                            raw_input('Press enter to close all windows.')
+                            break
+                        elif work_dim==2:
+                            fig,axe = plt.subplots()
+                            axe.imshow(d,interpolation='nearest')
+                            plt.title(name)
+                            fig.show()
+                            raw_input('Press enter to close all windows.')
+                            break
+                        else:
+                            pass
+            raise RuntimeError(msg)
+
+    def _distances(self,lhs,rhs,view):
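+        # normalized discrete distances between two buffers on a view:
+        # l1 = mean(|d|), l2 = sqrt(sum(d^2))/size, linf = max(|d|)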
+        d    = rhs[view]-lhs[view]
+        da   = np.abs(d)
+
+        l1   = np.sum(da)/d.size
+        l2   = np.sqrt(np.sum(d*d))/d.size
+        linf = np.max(da)
+        return (l1,l2,linf)
+    
+    
+    def _test_transpose(self, dim, axes, is_inplace):
+        check_instance(axes, tuple, values=int)
+        assert dim > 1
+        assert 2 <= len(axes) <= dim
+
+        if self.enable_extra_tests:
+            vectorizations=[1,2,4,8,16]
+            dtypes = [np.float32, np.float64, 
+                      np.int8, np.int16, np.int32, np.int64,
+                      np.uint8, np.uint16, np.uint32, np.uint64]
+        else:
+            vectorizations=[1,2]
+            dtypes = [np.float32, np.int32, np.uint32]
+        
+        for dtype in dtypes:
+            self._alloc_cpu_gpu(dtype=dtype, dim=dim)
+            self._do_transpose_cpu(axes=axes, is_inplace=is_inplace)
+            for vectorization in vectorizations:
+                self._do_compute_gpu_and_check(dim=dim, axes=axes, 
+                        vectorization=vectorization, is_inplace=is_inplace)
+            
+    
+    def test_transpose_XY_out_of_place(self):
+        self._test_transpose(dim=2, axes=(1,0), is_inplace=False)
+        self._test_transpose(dim=3, axes=(1,0), is_inplace=False)
+    
+    def test_transpose_XZ_out_of_place(self):
+        self._test_transpose(dim=3, axes=(2,1,0), is_inplace=False)
+    
+    def test_transpose_YZ_out_of_place(self):
+        self._test_transpose(dim=3, axes=(0,2,1), is_inplace=False)
+
+
+    def test_transpose_XY_inplace(self):
+        self._test_transpose(dim=2, axes=(1,0), is_inplace=True)
+        self._test_transpose(dim=3, axes=(1,0), is_inplace=True)
+    
+    def test_transpose_XZ_inplace(self):
+        self._test_transpose(dim=3, axes=(2,1,0), is_inplace=True)
+    
+    def test_transpose_YZ_inplace(self):
+        self._test_transpose(dim=3, axes=(0,2,1), is_inplace=True)
+
+if __name__ == '__main__':
+    TestTranspose.setup_class(enable_extra_tests=False, enable_error_plots=True)
+    
+    test = TestTranspose()
+    test.test_transpose_XY_out_of_place()
+
+    TestTranspose.teardown_class()
+
diff --git a/hysop/backend/device/codegen/kernels/transpose.py b/hysop/backend/device/codegen/kernels/transpose.py
index 4d974f35cc616bfac4347e599656c53a3c865ae3..06e3d63b73f0abc4241458caee6fc16204259965 100644
--- a/hysop/backend/device/codegen/kernels/transpose.py
+++ b/hysop/backend/device/codegen/kernels/transpose.py
@@ -1,9 +1,12 @@
 import operator
 import numpy as np
+from contextlib import contextmanager, nested
 
+from hysop.tools.misc import upper_pow2
 from hysop.backend.device.codegen.base.opencl_codegen import OpenClCodeGenerator
 from hysop.backend.device.codegen.base.kernel_codegen import KernelCodeGenerator
-from hysop.backend.device.codegen.base.variables      import CodegenVariable, CodegenVectorClBuiltin, CodegenArray
+from hysop.backend.device.codegen.base.variables      import CodegenVariable, \
+        CodegenVectorClBuiltin, CodegenArray
 from hysop.backend.device.opencl.opencl_types          import OpenClTypeGen
 from hysop.backend.device.codegen.base.utils          import WriteOnceDict, ArgDict
 
@@ -12,34 +15,80 @@ from hysop.backend.device.codegen.functions.compute_index import ComputeIndexFun
 class TransposeKernel(KernelCodeGenerator):
 
     @staticmethod
-    def codegen_name(components,dtype,work_dim):
-        return 'transpose_{}_{}_{}d'.format(components,dtype,work_dim)
+    def codegen_name(is_inplace, components, ctype, 
+            tile_dim, tile_size, tile_padding, vectorization):
+        components = [ str(j) if i!=j else 'X' for i,j in enumerate(components) ]
+        return 'transpose_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
+                'inplace' if is_inplace else 'out_of_place',
+                ctype, tile_dim, vectorization, tile_size, tile_padding, 
+                '_'.join(components))
     
-    def __init__(self, typegen, work_dim, dtype,
-            tile_size=(8,8), tile_padding=1,
+    def __init__(self, typegen, ctype, vectorization,
+            components, tile_size, tile_padding,
+            is_inplace = False,
             known_vars = None,
             **kargs):
-
-        if len(tile_size)==1:
-            tile_size = (tile_size,tile_size,)
-        elif len(tile_size)!=2:
-            raise ValueError('Tile size should be a 2d tuple.')
         
-        name = TransposeKernel.codegen_name('xy',dtype, work_dim)
-
-        kernel_args = self.gen_kernel_arguments(typegen, dtype, work_dim)
+        components = np.asarray(components)
+        dim        = components.size
+        pdim       = upper_pow2(dim)
+        assert dim <= 16, 'Maximal permutation dimension is 16.'
+        assert pdim in [1,2,4,8,16]
+        assert vectorization in [1,2,4,8,16]
+        
+        # check permutation components
+        msg='Invalid permutation {} for dimension {}.'
+        msg=msg.format(components, dim)
+        assert components.size == dim, msg
+        assert (components<dim).all(), msg
+        axes = set(components.tolist())
+        if len(axes) != dim:
+            raise ValueError(msg)
+        
+        permutation = (components != range(dim))
+        naxes = sum(permutation)
+
+        if (naxes == 0):
+            msg='There is nothing to transpose with given permutation {}.'
+            msg=msg.format(components)
+            raise ValueError(msg)
+        assert naxes >= 2, msg
+        
+        # we need the first axis for contiguous reads and writes, even if it is
+        # not part of the permutation scheme.
+        tile_dim = naxes + int(not permutation[0])
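+        # the last tile dimension is padded, classically so that transposed
+        # accesses to the local memory tile avoid bank conflicts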
+        tile_shape = [tile_size,]*tile_dim
+        tile_shape[-1] += tile_padding
+        tile_shape = tuple(tile_shape)
+        
+        name = TransposeKernel.codegen_name(is_inplace, components, ctype, tile_dim, 
+                tile_size, tile_padding, vectorization)
+        
+        work_dim = min(3,tile_dim)
+        kernel_args = self.gen_kernel_arguments(typegen, ctype, pdim)
 
         super(self.__class__,self).__init__(
                 name=name,
                 typegen=typegen,
                 work_dim=work_dim, 
-                kernel_args=kernel_args,
-                vec_type_hint=dtype,
+                known_vars = known_vars,
+                kernel_args = kernel_args,
                 **kargs)
         
-        self.dtype        = dtype
-        self.tile_size    = tile_size
-        self.tile_padding = tile_padding
+        self.ctype         = ctype
+        self.permutation   = permutation
+        self.components    = components
+        self.dim           = dim
+        self.pdim          = pdim
+        self.tile_dim      = tile_dim
+        self.tile_size     = tile_size
+        self.tile_shape    = tile_shape
+        self.tile_padding  = tile_padding
+        self.is_inplace    = is_inplace
+        self.vectorization = vectorization
+        self.naxes         = naxes
 
         self.gencode()
 
@@ -58,14 +105,17 @@ class TransposeKernel(KernelCodeGenerator):
         return reqs
 
     
-    def gen_kernel_arguments(self, typegen, dtype, work_dim):
+    def gen_kernel_arguments(self, typegen, ctype, pdim):
         _global = OpenClCodeGenerator.default_keywords['global']
         tg = typegen
         
         kargs  = ArgDict()
-        kargs['in']    = CodegenVariable(dtype=dtype,name='in' ,typegen=tg,storage=_global,ptr=True,const=True,restrict=True,nl=True)
-        kargs['out']   = CodegenVariable(dtype=dtype,name='out',typegen=tg,storage=_global,ptr=True,restrict=True,nl=True)
-        kargs['dsize'] = CodegenVectorClBuiltin(btype='int', dim=work_dim, name='S', typegen=tg, add_impl_const=True)
+        kargs['in']    = CodegenVariable(ctype=ctype, name='in',
+                typegen=tg, storage=_global, const=True, ptr=True, add_impl_const=True, ptr_restrict=True, nl=True)
+        kargs['out']   = CodegenVariable(ctype=ctype, name='out',
+                typegen=tg, storage=_global, ptr=True, ptr_const=True, ptr_restrict=True, nl=True)
+        kargs['shape'] = CodegenVectorClBuiltin(btype='int', dim=pdim,
+                name='shape', typegen=tg, add_impl_const=True, symbolic_mode=True)
         return kargs
 
 
@@ -75,46 +125,105 @@
 
         _local  = OpenClCodeGenerator.default_keywords['local']
 
-        s        = self
-        tg       = s.typegen
-        work_dim = s.work_dim
-        dtype    = s.dtype
-       
-        global_id     = s.vars['global_id']
-        local_id      = s.vars['local_id']
-
-        _in  = s.vars['in']
-        _out = s.vars['out']
-        S    = s.vars['dsize']
-
-        tile_size = CodegenVectorClBuiltin(typegen=tg,name='TILE_SIZE',btype='uint', dim=2, 
-                const=True, value=self.tile_size)
-        tile_padding = CodegenVariable(typegen=tg,name='TILE_PADDING',dtype='uint', 
-                const=True, value=self.tile_padding)
-        tile = CodegenArray(typegen=tg, name='tile', dtype=dtype, storage='__constant',
-                dim=2,
-                size=(tile_size[0](), '{}+{}'.format(tile_size[1](), tile_padding()),)
-                )
-
+        s             = self
+        tg            = s.typegen
+        work_dim      = s.work_dim
+        symbolic_mode = s.symbolic_mode
+
+        dim           = s.dim
+        pdim          = s.pdim
+
+        ctype         = s.ctype
+        permutation   = s.permutation
+        components    = s.components
+        tile_dim      = s.tile_dim
+        tile_size     = s.tile_size
+        tile_shape    = s.tile_shape
+        tile_padding  = s.tile_padding
+        is_inplace    = s.is_inplace
+        vectorization = s.vectorization
+        naxes         = s.naxes
+
+        local_id   = s.vars['local_id']
+        local_size = s.vars['local_size']
+        group_id   = s.vars['group_id']
+        group_size = s.vars['num_groups']
+
+        _in       = s.vars['in']
+        _out      = s.vars['out']
+        S         = s.vars['shape']
+
+        tile_size = CodegenVariable(typegen=tg, name='tile_size', ctype='int', 
+                const=True, value=self.tile_size, symbolic_mode=symbolic_mode)
+        tile_padding = CodegenVariable(typegen=tg, name='tile_padding', ctype='int', 
+                const=True, value=self.tile_padding, symbolic_mode=symbolic_mode)
+        tile_sshape = [tile_size]*tile_dim
+        tile_sshape[-1] = '{}+{}'.format(tile_size.name, tile_padding.name)
+        tile = CodegenArray(typegen=tg, name='tile', ctype=ctype, storage='__local',
+                dim=tile_dim, shape=tile_sshape)
+        
+        is_tile_index  = tuple( (i==0 or permutation[i]) for i in xrange(dim) ) 
+        is_tile_index += (False,)*(pdim-dim)
+        is_tile_index_var = CodegenVectorClBuiltin('is_tile_index', 'int', pdim, tg, const=True, 
+                init=np.asarray(is_tile_index, dtype=np.int32))
+
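+        # ntiles[i] = ceil(shape[i]/tile_size) on tile axes and 0 elsewhere,
+        # with is_tile_index acting as a component-wise mask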
+        ntiles = CodegenVectorClBuiltin('ntiles', 'int', pdim, tg, const=True, 
+                init='{} * (({}+{}-1)/{})'.format(is_tile_index_var, S, tile_size, tile_size)) 
+
+        idx = CodegenVectorClBuiltin('idx', 'int', pdim, tg, const=True)
+
+        @contextmanager
+        def _work_iterate_(i):
+            # FIXME (WIP): tile axes should eventually be iterated tile by
+            # tile; for now both branches emit the same elementwise loop,
+            # so the is_tile_index test is a placeholder.
+            if is_tile_index[i]:
+                loop = '{i}=0; {i}<{N}; {i}++'.format(i=idx[i], N=S[i])
+            else:
+                loop = '{i}=0; {i}<{N}; {i}++'.format(i=idx[i], N=S[i])
+            with s._for_(loop) as ctx:
+                yield ctx
+        nested_loops = [ _work_iterate_(i) for i,k in enumerate(permutation) if (i!=0 and not k) ][::-1]
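+        # one loop per axis that is neither axis 0 nor part of the permutation,
+        # reversed so that the outermost loop runs over the slowest axis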
 
         with s._kernel_():
             with s._align_() as al:
-                tile_size.define(al,align=True)
-                tile_padding.define(al,align=True)
+                tile_size.declare(al,align=True)
+                tile_padding.declare(al,align=True)
+            s.jumpline()
+            is_tile_index_var.declare(s)
+            ntiles.declare(s)
+            s.jumpline()
             with s._align_() as al:
-                global_id.define(al,align=True,const=True)
-                local_id.define(al,align=True,const=True)
-            #s.check_workitem_bounds(grid_size, compact=False)
-            #s.jumpline()
+                local_id.declare(al,align=True,const=True)
+                group_id.declare(al,align=True,const=True)
+                local_size.declare(al,align=True,const=True)
+                group_size.declare(al,align=True,const=True)
+
+            s.jumpline()
+            tile.declare(s)
+            
+            s.jumpline()
+            s.comment('Iteration over all non active axes')
+            idx.declare(s)
+            with nested(*nested_loops):
+                s.append(';')
 
 
 if __name__ == '__main__':
     
-    from hysop.backend.device.codegen.base.test import test_typegen
-    typegen = test_typegen('float')
-    ek = TransposeKernel(typegen=tg, work_dim=3, dtype='float4',
-            tile_size=(8,8,), tile_padding=1,
-            known_vars=dict())
+    from hysop.backend.device.codegen.base.test import _test_typegen
+    tg = _test_typegen('float')
+    ek = TransposeKernel(typegen=tg, ctype='float', vectorization=2,
+            components = (0,2,1,3,4),
+            tile_size=8, tile_padding=1,
+            is_inplace=False,
+            symbolic_mode=True,
+            known_vars={
+                'shape' : (100,200,300,400,500,0,0,0)
+            })
     ek.edit()
-    #ek.test_compile()
-    print
+    ek.test_compile()
diff --git a/hysop/constants.py b/hysop/constants.py
index 1cca17bde937c3eacc9efb87a79bc85409a16323..2494bced9cc26549e1aa7570be250e1e8378ea7f 100644
--- a/hysop/constants.py
+++ b/hysop/constants.py
@@ -197,6 +197,12 @@ TranspositionState2D = transposition_states[2]
 TranspositionState3D = transposition_states[3]
 """3D memory layout (transposition state) enum"""
 
+TransposeConfiguration = EnumFactory.create('TransposeConfiguration',
+        ['TRANSPOSE_XY', 'TRANSPOSE_XZ', 'TRANSPOSE_YZ'])
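+"""
+Possible transposition configurations used to transpose fields in the Transposition operator.
+"""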
+
 default_order = None
 """
 Default HySoP memory ordering for array allocations.
diff --git a/hysop/constants.py.in b/hysop/constants.py.in
index b2a900d716f7eca19fa6f91955ca480868ea623c..dd00fa9b0677cb1efe659a1b930b2d60d2c0bc4d 100644
--- a/hysop/constants.py.in
+++ b/hysop/constants.py.in
@@ -197,6 +197,12 @@ TranspositionState2D = transposition_states[2]
 TranspositionState3D = transposition_states[3]
 """3D memory layout (transposition state) enum"""
 
+TransposeConfiguration = EnumFactory.create('TransposeConfiguration',
+        ['TRANSPOSE_XY', 'TRANSPOSE_XZ', 'TRANSPOSE_YZ'])
+"""
+Possible transposition configurations used to transpose fields in the Transposition operator.
+"""
+
 default_order = None
 """
 Default HySoP memory ordering for array allocations.
diff --git a/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl b/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl
index b2197fbb12f4643f1922ab8674a22af9a0583174..7d646eca97e3c18eecfe7b1f0f7ec25fc43b7ca1 100644
--- a/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl
+++ b/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl
@@ -70,7 +70,7 @@ __kernel void transpose_xz(__global const float* in,
 	    /* Fill the tile */
 	    temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
 	    tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
-	  }
+	}
 
 	}
 	/* Synchronize work-group */
diff --git a/hysop/tools/misc.py b/hysop/tools/misc.py
index 341f3d37440bba10fb36d10eeb6df6f60e274897..a66bd6fdc572445681b0e1cceebb48494c6d3c8d 100644
--- a/hysop/tools/misc.py
+++ b/hysop/tools/misc.py
@@ -41,6 +41,24 @@ def kargs2args(func, kargs, remove=[]):
     argnames,_,_,_ = inspect.getargspec(func)
     return tuple([kargs[a] for a in argnames if a not in remove])
 
+def upper_pow2(x):
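+    """Return the smallest power of two >= x, elementwise for array-likes.
+    e.g. upper_pow2(5) -> 8, upper_pow2([1,3,9]) -> array([1, 4, 16]).
+    """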
+    def _upper_pow2(x):
+        if x<0:
+            raise ValueError('upper_pow2 expected x>=0, got {}.'.format(x))
+        i=0
+        k=2**i
+        while k<x:
+            i+=1
+            k*=2
+        return k
+
+    if np.isscalar(x):
+        return _upper_pow2(x)
+    else:
+        return np.asarray([_upper_pow2(_x) for _x in x])
 
 class Utils(object):
     """tools to handle array and slices.
diff --git a/setup.py.in b/setup.py.in
index 8aeb7c90af78cda61ed8c9da32599d1d41c7caa0..cbcd22b59ebcd0b9a826229b7bcb9367cef47ebd 100644
--- a/setup.py.in
+++ b/setup.py.in
@@ -242,16 +242,15 @@ def create_swig_extension(name, inc_dirs, src_dirs=None, sources=None):
 # ------------ Set list of packages required to build the module -------------
 # List of modules (directories) to be included
 with_test = "@WITH_TESTS@" is "ON"
-if with_test:
-    packages = find_packages(exclude=["*opencl*"], where="@CMAKE_SOURCE_DIR@")
-else:
-    packages = find_packages(exclude=["*tests*", "*opencl*"],
-                             where="@CMAKE_SOURCE_DIR@")
-
-if "@WITH_GPU@" is "ON":
-    packages.append('hysop.backend.device.opencl')
-    if with_test:
-        packages.append('hysop.backend.device.opencl.tests')
+with_gpu = "@WITH_GPU@" is "ON"
+exclude=[]
+if not with_test:
+   exclude.append('*tests*')
+if not with_gpu:
+   exclude.append('*opencl*')
+packages = find_packages(exclude=exclude, where="@CMAKE_SOURCE_DIR@")
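+# find_packages 'exclude' takes fnmatch-style patterns: '*tests*' drops every
+# test subpackage and '*opencl*' drops the OpenCL backend when GPU is off.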
 
 # Enable this to get debug info
 DISTUTILS_DEBUG = 1