diff --git a/examples/example_utils.py b/examples/example_utils.py
index 4e5d7706ee3bb2aaea5feac36fa9ae753519821b..f7fee9b5dab904d0353ac2de2e4c27adab03758c 100644
--- a/examples/example_utils.py
+++ b/examples/example_utils.py
@@ -111,7 +111,7 @@ class HysopArgParser(argparse.ArgumentParser):
         if (domain is None):
             domain = 'box'
         if (default_dump_dir is None):
-            default_dump_dir = '/tmp/hysop/{}'.format(prog_name)
+            default_dump_dir = '{}/hysop/{}'.format(self.tmp_dir(), prog_name)
 
         self._domain = domain
         
@@ -605,10 +605,18 @@ class HysopArgParser(argparse.ArgumentParser):
                 dest='clean',
                 help=('Clean the dump_folder prior to launch '+
                         '(remove all *.txt, *.png, *.xmf and *.h5 files)'))
+        file_io.add_argument('--debug-dump-dir', type=str, 
+                default='{}/hysop/debug'.format(self.tmp_dir()), 
+                dest='debug_dump_dir',
+                help=('Target root directory for debug dumps. Debug dumps will appear into <dump dir>/<target>.'))
+        file_io.add_argument('--debug-dump-target', type=str, default=None, 
+                dest='debug_dump_target',
+                help=('Tag for field debug dumps. Debug dumps will appear into <dump dir>/<target>.'))
         return file_io
     
     def _check_file_io_args(self, args):
-        self._check_default(args, ('dump_dir', 'cache_dir'), str, allow_none=True)
+        self._check_default(args, ('dump_dir', 'cache_dir', 
+            'debug_dump_dir', 'debug_dump_target'), str, allow_none=True)
         self._check_default(args, ('dump_freq'), int, allow_none=True)
         self._check_default(args, ('dump_times'), tuple, allow_none=True)
         self._check_default(args, ('override_cache'), bool, allow_none=True)
diff --git a/examples/taylor_green/taylor_green.py b/examples/taylor_green/taylor_green.py
index 278fba68c440ca779571aee302b99228936cd6e9..bb06faebcde0965c488c585125c65c0eef9f3650 100644
--- a/examples/taylor_green/taylor_green.py
+++ b/examples/taylor_green/taylor_green.py
@@ -37,7 +37,6 @@ def compute(args):
 
     from hysop.methods import SpaceDiscretization, Remesh, TimeIntegrator, \
                               ComputeGranularity, Interpolation
-
     # Define the domain
     dim  = args.ndim
     npts = args.npts
@@ -226,12 +225,21 @@ def compute(args):
             min_max_U.Finf, min_max_W.Finf, adapt_dt.equivalent_CFL,
             filename='parameters.txt', precision=8)
     
+    # Attach a field debug dumper if requested
+    from hysop.tools.debug_dumper import DebugDumper
+    if args.debug_dump_target:
+        debug_dumper = DebugDumper(
+                path=args.debug_dump_dir,
+                name=args.debug_dump_target, 
+                force_overwrite=True, enable_on_op_apply=True)
+    else:
+        debug_dumper = None
+    
     # Initialize only the vorticity
-    dfields = problem.input_discrete_fields
-    dfields[vorti].initialize(formula=init_vorticity)
+    problem.initialize_field(vorti, formula=init_vorticity)
 
     # Finally solve the problem 
-    problem.solve(simu, dry_run=args.dry_run, debug_dumper=None)
+    problem.solve(simu, dry_run=args.dry_run, debug_dumper=debug_dumper)
     
     # Finalize
     problem.finalize()
diff --git a/hysop/backend/device/opencl/opencl_array.py b/hysop/backend/device/opencl/opencl_array.py
index 071081edcf0d7f87e9923f6deb833ba45ee7c76a..db424e001f581d1022ebf48744546a647a0a9121 100644
--- a/hysop/backend/device/opencl/opencl_array.py
+++ b/hysop/backend/device/opencl/opencl_array.py
@@ -36,8 +36,15 @@ class OpenClArray(Array):
             msg='{} unsupported yet for OpenCl arrays.'.format(handle.dtype)
             raise TypeError(msg)
         
-        backend.check_queue(handle.queue)
         super(OpenClArray,self).__init__(handle=handle, backend=backend, **kargs)
+        
+        # at this time the opencl backend works only with the default_queue 
+        # so we enforce it.
+        if (handle.queue is not None) and (handle.queue is not self.default_queue):
+            msg = 'pyopencl.Array has been created with a non-default queue.'
+            raise RuntimeError(msg)
+        backend.check_queue(handle.queue)
+        self.set_default_queue(self.default_queue)
     
     def as_symbolic_array(self, name, **kwds):
         """
@@ -191,6 +198,11 @@ class OpenClArray(Array):
         """
         Sets the default queue for this array.
         """
+        # at this time the opencl backend works only with the default_queue 
+        # so we enforce it.
+        if (queue is not self.default_queue):
+            msg='Default queue override has been disabled for non-default queues.'
+            raise RuntimeError(msg)
         queue = self.backend.check_queue(queue)
         self._handle.queue = queue
     def reset_default_queue(self):
@@ -367,7 +379,7 @@ class OpenClArray(Array):
             try:
                 self.handle.setitem(subscript=subscript, value=value, 
                         queue=queue)
-            except (ValueError, RuntimeError):
+            except:
                 from hysop.backend.device.opencl.opencl_copy_kernel_launchers import \
                         OpenClCopyBufferRectLauncher
                 kl = OpenClCopyBufferRectLauncher.from_slices(varname='buffer', src=value, 
diff --git a/hysop/backend/device/opencl/opencl_array_backend.py b/hysop/backend/device/opencl/opencl_array_backend.py
index a8e8972323a390ae6f48e667aaf9d487ddf60358..2b72adb314c799290bc375d6b55290251001d85e 100644
--- a/hysop/backend/device/opencl/opencl_array_backend.py
+++ b/hysop/backend/device/opencl/opencl_array_backend.py
@@ -343,12 +343,18 @@ class OpenClArrayBackend(ArrayBackend):
         check_instance(queue, cl.CommandQueue, allow_none=True)
         check_instance(allocator, OpenClAllocator, allow_none=True)
         
+        # disable multi queue support at this time
+        msg='Non-default queue support has been disabled. Please provide a cl_env only.'
+        if (cl_env is None):
+            raise RuntimeError(msg)
+        if (queue is not None) and (queue is not cl_env.default_queue):
+            raise RuntimeError(msg)
+        
         if (queue is None):
             cl_env    = cl_env or get_or_create_opencl_env()
             context   = cl_env.context
             queue     = cl_env.default_queue
             allocator = allocator or cl_env.allocator
-            assert allocator.context == context
         elif (cl_env is None):
             context = queue.context
             if (allocator is None):
@@ -1486,8 +1492,13 @@ class OpenClArrayBackend(ArrayBackend):
 
         shape = to_tuple(shape)
         
-        queue = queue or self.default_queue
-        self.check_queue(queue)
+        # at this time the opencl backend works only with the default_queue 
+        # so we enforce it by not binding any Array to any queue.
+        if (queue is not None) and (queue is not self.default_queue):
+            msg = 'pyopencl.Array has been created with non-default queue.'
+            raise RuntimeError(msg)
+        queue = self.check_queue(queue)
+        cq = queue #force default queue
 
         if buf is None:
             assert offset==0
@@ -1504,7 +1515,7 @@ class OpenClArrayBackend(ArrayBackend):
             buf = allocator.allocate_aligned(size, alignment=alignment)
         
     
-        handle = self._call(clArray.Array, cq=queue, shape=shape, dtype=dtype, order=order, 
+        handle = self._call(clArray.Array, cq=cq, shape=shape, dtype=dtype, order=order, 
                                 allocator=None, data=buf, offset=offset, 
                                 strides=strides, events=events)
         return self.wrap(handle)
@@ -1518,10 +1529,8 @@ class OpenClArrayBackend(ArrayBackend):
         from hysop.backend.device.opencl.opencl_array  import Array, OpenClArray
         from hysop.backend.host.host_array import HostArray
         acls = a.__class__.__name__
-        self.check_queue(queue)
+        queue = self.check_queue(queue)
 
-        queue = queue or self.default_queue
-        
         if (dtype is not None):
             pass
         if isinstance(a, Array):
@@ -1557,7 +1566,7 @@ class OpenClArrayBackend(ArrayBackend):
         Return an array copy of the given object.
         """
         self._unsupported_argument('copy','order',order,MemoryOrdering.SAME_ORDER)
-        self.check_queue(queue)
+        queue = queue = self.check_queue(queue)
         return self._call(a.handle.copy, queue=queue)
 
 # Filling
@@ -1565,7 +1574,7 @@ class OpenClArrayBackend(ArrayBackend):
         """
         Fill the array with given value
         """
-        self.check_queue(queue)
+        queue = queue = self.check_queue(queue)
         if a.flags.forc:
             # only c-contiguous arrays are handled by pyopencl
             a.handle.fill(value=value, queue=queue)
@@ -3067,7 +3076,7 @@ class OpenClArrayBackend(ArrayBackend):
         Default generator is clRandom.PhiloxGenerator.
         """
         self._check_argtype('rand', 'shape', shape, tuple)
-        self.check_queue(queue)
+        queue = self.check_queue(queue)
         if (generator is None):
             generator = clRandom.PhiloxGenerator(context=self.context)
         if (out is None):
@@ -3083,7 +3092,7 @@ class OpenClArrayBackend(ArrayBackend):
         Return samples from the standard normal distribution [mu,sigma].
         Default generator is clRandom.PhiloxGenerator.
         """
-        self.check_queue(queue)
+        queue = self.check_queue(queue)
         self._check_argtype('randn', 'shape', shape, tuple)
         if (generator is None):
             generator = clRandom.PhiloxGenerator(context=self.context)
diff --git a/hysop/backend/device/opencl/opencl_operator.py b/hysop/backend/device/opencl/opencl_operator.py
index a941e0b9788dd3161bd19f307e9a6f9cca1a61e6..18550a1ea682f55220ec27d3b3d14820afdeed1c 100644
--- a/hysop/backend/device/opencl/opencl_operator.py
+++ b/hysop/backend/device/opencl/opencl_operator.py
@@ -148,8 +148,10 @@ class OpenClOperator(ComputationalGraphOperator):
             msg=msg.format(precision)
             raise ValueError(msg)
    
-        self.typegen = self.cl_env.build_typegen(precision=precision, float_dump_mode=float_dump_mode,
-                use_short_circuit_ops=use_short_circuit_ops, unroll_loops=unroll_loops)
+        self.typegen = self.cl_env.build_typegen(precision=precision, 
+                float_dump_mode=float_dump_mode,
+                use_short_circuit_ops=use_short_circuit_ops, 
+                unroll_loops=unroll_loops)
         self.autotuner_config = autotuner_config
         self.precision = convert_precision(precision)
         
diff --git a/hysop/backend/device/opencl/operator/poisson_rotational.py b/hysop/backend/device/opencl/operator/poisson_rotational.py
index 52862ec7f7ab53addac8382eb0febf321b0f2fc6..068057da057879b4d9cbdf08aeb5132fe592197e 100644
--- a/hysop/backend/device/opencl/operator/poisson_rotational.py
+++ b/hysop/backend/device/opencl/operator/poisson_rotational.py
@@ -219,22 +219,16 @@ class OpenClPoissonRotational(PoissonRotationalOperatorBase, OpenClSymbolic):
             self._exchange_W_ghosts = self.dW.exchange_ghosts(build_launcher=True)
 
     @op_apply
-    def apply(self, simulation, debug_dumper=None, **kwds):
+    def apply(self, simulation, **kwds):
         '''Solve the PoissonRotational equation.'''
         # /!\ clFFT use the destination buffer as a scratch
         # so we reverse the order of forward transforms.
         
-        #it = simulation.current_iteration
-        #t = simulation.t()
-        
-        #debug_dumper(it, t, 'W', self.dW)
-        
         for fp in self.forward_W_plans:
             evt, = fp.enqueue(queue=self.queue)
         
         # project and recover vorticity if required
         if self._do_project(simulation):
-            assert False
             evt = self.projection_kernel(queue=self.queue)
             for bp in self.backward_W_plans:
                 evt, = bp.enqueue()
@@ -243,11 +237,9 @@ class OpenClPoissonRotational(PoissonRotationalOperatorBase, OpenClSymbolic):
         
         # recover velocity
         evt = self.compute_velocity_kernel(queue=self.queue)
-
+        
         for bp in self.backward_U_plans:
             evt, = bp.enqueue()
-        
+
         if (self._exchange_U_ghosts is not None):
             evt = self._exchange_U_ghosts(queue=self.queue)
-        
-        #debug_dumper(it, t, 'U', self.dU)
diff --git a/hysop/core/arrays/array.py b/hysop/core/arrays/array.py
index 3240192758c35853a9d64e75551deb5c6d7bb0f4..deab126139df360672e04ee89a0cc093c7841e11 100644
--- a/hysop/core/arrays/array.py
+++ b/hysop/core/arrays/array.py
@@ -70,7 +70,9 @@ class Array(object):
         Return underlying implementation of this buffer.
         """
         if not hasattr(self, '_handle'):
-            raise RuntimeError('{} has no _handle defined.', self.__class__)
+            msg='{} has no _handle defined.'
+            msg=msg.format(self.__class__)
+            raise RuntimeError(msg)
         return self._handle
 
     def get_backend(self):
@@ -78,7 +80,9 @@ class Array(object):
         Return the backend corresponding to this array.
         """
         if not hasattr(self, '_backend'):
-            raise RuntimeError('{} has no _backend defined.', self.__class__)
+            msg='{} has no _backend defined.'
+            msg=msg.format(self.__class__)
+            raise RuntimeError(msg)
         return self._backend
    
     handle  = property(get_handle)
diff --git a/hysop/core/graph/graph.py b/hysop/core/graph/graph.py
index 752f918d55b34d908f44ffed9f4ea200632c17eb..f781d5017746a2e08719cbcd696d32a326a51f9d 100644
--- a/hysop/core/graph/graph.py
+++ b/hysop/core/graph/graph.py
@@ -129,7 +129,26 @@ def op_apply(f):
         dbg = ('dbg' in kwds) 
         dbg = dbg and (kwds['dbg'] is not None)
         dbg = dbg and (kwds['dbg'].enable_on_op_apply)
-        if dbg:
+        debug_dump = ('debug_dumper' in kwds) 
+        debug_dump = debug_dump and (kwds['debug_dumper'] is not None)
+        debug_dump = debug_dump and (kwds['debug_dumper'].enable_on_op_apply)
+        if debug_dump:
+            assert 'simulation' in kwds
+            op = args[0]
+            simu = kwds['simulation']
+            it = simu.current_iteration
+            t = simu.t()
+            for dfield in op.input_discrete_fields.values():
+                tag = 'pre_{}_{}'.format(op.name, dfield.name)
+                kwds['debug_dumper'](it, t, tag, 
+                        tuple(df.get().handle for df in dfield.data))
+            ret = f(*args, **kwds)
+            for dfield in op.output_discrete_fields.values():
+                tag = 'post_{}_{}'.format(op.name, dfield.name)
+                kwds['debug_dumper'](it, t, tag, 
+                        tuple(df.get().handle for df in dfield.data))
+            return ret
+        elif dbg:
             import inspect
             msg=inspect.getsourcefile(f)
             kwds['dbg']('pre '+msg, nostack=True)
diff --git a/hysop/fields/cartesian_discrete_field.py b/hysop/fields/cartesian_discrete_field.py
index c391f174191ac0c79a5c7f835b53b6ad833c17f5..999e595d039892774f011c656d2bdedb831cea95 100644
--- a/hysop/fields/cartesian_discrete_field.py
+++ b/hysop/fields/cartesian_discrete_field.py
@@ -548,9 +548,9 @@ CartesianDiscreteFieldView (id={}, tag={})
             raise RuntimeError(msg)
 
         if __debug__:
-            assert all((d.size  == self.size)     for d in data), 'Array size was altered.'
-            assert all((d.dtype == self.dtype)    for d in data), 'Array dtype was altered.'
-            assert all(all(d.shape == self.shape) for d in data), 'Array shape was altered.'
+            assert all((d.size  == self.size)     for d in data), 'Array size has been altered.'
+            assert all((d.dtype == self.dtype)    for d in data), 'Array dtype has been altered.'
+            assert all(all(d.shape == self.shape) for d in data), 'Array shape has been altered.'
         
         if only_finite:
             for (i,d) in enumerate(data):
diff --git a/hysop/fields/ghost_exchangers.py b/hysop/fields/ghost_exchangers.py
index 914bf7727d07c0990cfa1bc8a62ec0207d86dc29..6cc1d8a935b0dafbd47c03965d10890a2d18b614 100644
--- a/hysop/fields/ghost_exchangers.py
+++ b/hysop/fields/ghost_exchangers.py
@@ -205,7 +205,7 @@ class CartesianDiscreteFieldGhostExchanger(GhostExchanger):
                     if all(ghosts[d]>0 for d in directions):
                         for displacements in all_outer_ghost_slices[dim][directions]:
                             ndisplacements = sum(d!=0 for d in displacements)
-                            if (ndisplacements != dim):
+                            if (ndisplacements < 2):
                                 continue
                             slc, shape = all_outer_ghost_slices[dim][directions][displacements]
                             value = default_invalid_value(dtype=base_dtype)
diff --git a/hysop/fields/tests/test_cartesian.py b/hysop/fields/tests/test_cartesian.py
index 5b133403bb987bcb849ccac7675d5600f5c0cde5..507b4bcea01d918b5f515e70d464f48ddd197f36 100644
--- a/hysop/fields/tests/test_cartesian.py
+++ b/hysop/fields/tests/test_cartesian.py
@@ -134,6 +134,10 @@ def test_serial_initialization_3d():
             assert (buf[:,:,nghosts[2]:2*nghosts[2]] == buf[:,:,nghosts[2]+npts[2]-1:]).all()
             assert (buf[:,:,:nghosts[2]] == buf[:,:,npts[2]-1:nghosts[2]+npts[2]-1]).all()
 
+def iter_backends():
+    yield (Backend.HOST, None)
+    for cl_env in iter_clenv():
+        yield (Backend.OPENCL, cl_env)
 
 def test_mpi_ghost_exchange(comm):
     rank = comm.Get_rank()
@@ -166,16 +170,18 @@ def test_mpi_ghost_exchange(comm):
             F0 = Field(domain=domain, name='F0', nb_components=1, dtype=dtype, _register=False) 
             F1 = Field(domain=domain, name='F1', nb_components=2, dtype=dtype, _register=False) 
             
-            for backend in (Backend.HOST, Backend.OPENCL,):
+            for (backend, cl_env) in iter_backends():
                 if rank==0:
-                    print '      >BACKEND.{}'.format(backend)
+                    print '      >BACKEND.{}{}'.format(backend, 
+                            '' if (cl_env is None) else '::{}.{}'.format(
+                                cl_env.platform.name.strip(), cl_env.device.name.strip()))
                 for shape in it.product(xrange(0,size+1), repeat=dim): 
                     if np.prod(shape, dtype=np.uint32)!=size:
                         continue
                     if rank==0:
                         print '         *cart_shape: {}'.format(shape)
                     topo = CartesianTopology(domain=domain, discretization=discretization, 
-                                               backend=backend, cart_shape=shape)
+                                               backend=backend, cart_shape=shape, cl_env=cl_env)
                     assert (topo.proc_shape==shape).all()
 
                     def ghost_base(i,d,rank,local_dir):
@@ -267,16 +273,18 @@ def test_mpi_ghost_accumulate(comm):
             F0 = Field(domain=domain, name='F0', nb_components=1, dtype=dtype, _register=False) 
             F1 = Field(domain=domain, name='F1', nb_components=2, dtype=dtype, _register=False) 
         
-            for backend in (Backend.HOST, Backend.OPENCL,):
+            for (backend, cl_env) in iter_backends():
                 if rank==0:
-                    print '      >BACKEND.{}'.format(backend)
+                    print '      >BACKEND.{}{}'.format(backend, 
+                            '' if (cl_env is None) else '::{}.{}'.format(
+                                cl_env.platform.name.strip(), cl_env.device.name.strip()))
                 for shape in it.product(xrange(0,size+1), repeat=dim): 
                     if np.prod(shape, dtype=np.uint32)!=size:
                         continue
                     if rank==0:
                         print '        *cart_shape: {}'.format(shape)
                     topo = CartesianTopology(domain=domain, discretization=discretization, 
-                                               backend=backend, cart_shape=shape)
+                                               backend=backend, cart_shape=shape, cl_env=cl_env)
                     assert (topo.proc_shape==shape).all()
 
                     def ghost_base(rank, directions, displacements, i):
diff --git a/hysop/numerics/fft/fft.py b/hysop/numerics/fft/fft.py
index ded2b830ac10f40a089d23dfb3b3d8fd17eb2fd9..5a2ff8b079492da2731e50669e888b66e63e6163 100644
--- a/hysop/numerics/fft/fft.py
+++ b/hysop/numerics/fft/fft.py
@@ -5,9 +5,18 @@ Base interface for fast Fourier Transforms.
 """
 
 from abc import ABCMeta, abstractmethod
+import warnings
 import numpy as np
 
+from hysop.tools.types import first_not_None
 from hysop.tools.numerics import float_to_complex_dtype, complex_to_float_dtype
+from hysop.tools.units import bytes2str
+from hysop.backend.host.host_array_backend import HostArrayBackend
+from hysop.backend.host.host_array import HostArray
+
+from hysop.tools.warning import HysopWarning
+class HysopFFTWarning(HysopWarning):
+    pass
 
 def mk_shape(base_shape, axis, N):
     """
@@ -44,19 +53,48 @@ class FFTPlanI(object):
     """
     __metaclass__ = ABCMeta
 
+    def __init__(self):
+        self._setup = False
+
     @abstractmethod
-    def execute(self):
+    def input_array(self):
         """
-        Execute the FFT plan on current input and output array.
+        Return currently planned input array.
+        """
+        pass
+    
+    @abstractmethod
+    def output_array(self):
+        """
+        Return currently planned output array.
         """
         pass
+    
+    def setup(self):
+        """
+        Method that has to be called before any call to execute.
+        """
+        if self._setup:
+            msg='Plan was already setup...'
+            raise RuntimeError(msg)
+        self._setup = True
+        return self
 
     @abstractmethod
-    def __call__(self, a=None, out=None):
+    def execute(self):
+        """
+        Execute the FFT plan on current input and output array.
+        """
+
+    def __call__(self, a=None, out=None, **kwds):
         """
         Apply the FFT plan to possibly new input or output arrays.
         """
-        pass
+        if (a is not None) or (out is not None):
+            msg='New array execute is not available for FFT backend {}.'
+            msg=msg.format(type(self).__name__)
+            raise RuntimeError(msg)
+        self.execute(**kwds)
 
 
 class FFTI(object):
@@ -110,13 +148,31 @@ class FFTI(object):
     __metaclass__ = ABCMeta
 
 
-    def __init__(self):
+    def __init__(self, backend=None, warn_on_allocation=True):
         """Initializes the interface and default supported real and complex types."""
         self.supported_ftypes = (np.float32, np.float64)
         self.supported_ctypes = (np.complex64, np.complex128)
         self.supported_cosine_transforms = (1,2,3)
         self.supported_sine_transforms   = (1,2,3)
+
+        if (backend is None):
+            backend = HostArrayBackend.get_or_create(allocator=None)
+        self.backend = backend
+        self.warn_on_allocation = warn_on_allocation
    
+    def allocate_output(self, out, shape, dtype):
+        """Alocate output if required and check shape and dtype."""
+        if (out is None):
+            if self.warn_on_allocation:
+                nbytes = np.prod(shape, dtype=np.int64)*dtype.itemsize
+                msg='FftwFFT: allocating aligned output array of size {}.'
+                msg=msg.format(bytes2str(nbytes))
+                warnings.warn(msg, HysopFFTWarning)
+            out = self.backend.empty(shape=shape, dtype=dtype)
+        else:
+            assert out.dtype == dtype
+            assert out.shape == shape
+        return out
 
     @abstractmethod
     def fft(self, a, out, axis=-1, **kwds):
@@ -140,7 +196,9 @@ class FFTI(object):
         Notes
         -----
         N = a.shape[axis]
-        out[0] will contain the sum of the signal (zero-frequency term always real for real inputs).
+        out[0] will contain the sum of the signal (zero-frequency term always real for 
+        real inputs).
+
         If N is even:
             out[1:N/2]   contains the positive frequency terms.
             out[N/2]     contains the Nyquist frequency (always real for real inputs).
@@ -185,7 +243,8 @@ class FFTI(object):
     @abstractmethod
     def rfft(self, a, out, axis=-1, **kwds):
         """
-        Compute the unscaled one-dimensional real to hermitian complex discrete Fourier Transform.
+        Compute the unscaled one-dimensional real to hermitian complex discrete Fourier 
+        Transform.
 
         Parameters
         ----------
@@ -206,8 +265,8 @@ class FFTI(object):
         Notes
         -----
         For real inputs there is no information in the negative frequency components that 
-        is not already  available from the positive frequency component because of the Hermitian 
-        symmetry.
+        is not already  available from the positive frequency component because of the 
+        Hermitian symmetry.
 
         N = out.shape[axis] = a.shape[axis]//2 + 1
         out[0] will contain the sum of the signal (zero-frequency term, always real).
@@ -334,19 +393,16 @@ class FFTI(object):
         -------
         (shape, dtype) of the output array determined from the input array.
         """
-        if type==2:
-            type=3
-        elif type==3:
-            type=2
+        itype = [1,3,2,4][type-1]
         n = a.shape[axis]
-        N = 2*(n - (type==1))
-        scaling = 1.0/N
+        N = 2*(n - (itype==1))
+        logical_size = N
         assert a.dtype in self.supported_ftypes, a.dtype
-        assert type in self.supported_cosine_transforms, self.supported_cosine_transforms
+        assert itype in self.supported_cosine_transforms, self.supported_cosine_transforms
         if (out is not None):
             assert a.dtype == out.dtype
             assert np.array_equal(a.shape, out.shape)
-        return (a.shape, a.dtype, type, scaling)
+        return (a.shape, a.dtype, itype, logical_size)
     
     @abstractmethod
     def dst(self, a, out=None, type=2, axis=-1, **kwds):
@@ -397,17 +453,14 @@ class FFTI(object):
         -------
         (shape, dtype) of the output array determined from the input array.
         """
-        if type==2:
-            type=3
-        elif type==3:
-            type=2
+        itype = [1,3,2,4][type-1]
         n = a.shape[axis]
-        N = 2*(n + (type==1))
-        scaling = 1.0/N
+        N = 2*(n + (itype==1))
+        logical_size = N
         assert a.dtype in self.supported_ftypes, a.dtype
         assert type in self.supported_sine_transforms, self.supported_sine_transforms
         if (out is not None):
             assert a.dtype == out.dtype
             assert np.array_equal(a.shape, out.shape)
-        return (a.shape, a.dtype, type, scaling)
+        return (a.shape, a.dtype, itype, logical_size)
 
diff --git a/hysop/numerics/fft/fftw_fft.py b/hysop/numerics/fft/fftw_fft.py
index f0b6b4cb6256b5ffe5f6863bcec107862bd381ff..4086472b4d333513ca499dc241d2325c51f1fdd9 100644
--- a/hysop/numerics/fft/fftw_fft.py
+++ b/hysop/numerics/fft/fftw_fft.py
@@ -9,14 +9,9 @@ import pyfftw
 import numpy as np
 
 from hysop.tools.types import first_not_None
-from hysop.tools.units import bytes2str
 from hysop.tools.misc import prod
-from hysop.tools.warning import HysopWarning
-from hysop.numerics.fft.fft import FFTPlanI, FFTI
-
-class HysopFFTWWarning(HysopWarning):
-    pass
-
+from hysop.numerics.fft.fft import FFTPlanI, FFTI, HostArray, \
+        HysopFFTWarning, bytes2str
 
 class FftwFFTPlan(FFTPlanI):
     """
@@ -25,15 +20,36 @@ class FftwFFTPlan(FFTPlanI):
     Emit warnings when changing input and output alignment.
     """
 
-    def __init__(self, scaling=None, **plan_kwds):
+    def __init__(self, a, out, scaling=None, **plan_kwds):
         super(FftwFFTPlan, self).__init__()
+
+        if isinstance(a, HostArray):
+            plan_kwds['input_array'] = a.handle
+        else:
+            plan_kwds['input_array'] = a
+
+        if isinstance(out, HostArray):
+            plan_kwds['output_array'] = out.handle
+        else:
+            plan_kwds['output_array'] = out
+
         plan = pyfftw.FFTW(**plan_kwds)
         if (not plan.simd_aligned):
             msg='Resulting plan is not SIMD aligned ({} bytes boundary).'
             msg=msg.format(pyfftw.simd_alignment)
-            warnings.warn(msg, HysopFFTWWarning)
+            warnings.warn(msg, HysopFFTWarning)
         self.plan = plan
         self.scaling = scaling
+        self.out = out
+        self.a = a
+    
+    @property
+    def input_array(self):
+        return self.a
+
+    @property
+    def output_array(self):
+        return self.out
 
     def check_new_inputs(self, a, out):
         plan = self.plan
@@ -41,22 +57,27 @@ class FftwFFTPlan(FFTPlanI):
             msg='New input array is not aligned on original planned input alignment of {} bytes.'
             msg+='\nA copy will be made.'
             msg=msg.format(plan.input_alignment)
-            warnings.warn(msg, HysopFFTWWarning)
+            warnings.warn(msg, HysopFFTWarning)
         if (out is not None) and (not pyfftw.is_byte_aligned(out, n=plan.output_alignment)):
             msg='New output array is not aligned on original planned output alignment of {} bytes.'
             msg+='\nA copy will be made.'
             msg=msg.format(plan.output_alignment)
-            warnings.warn(msg, HysopFFTWWarning)
+            warnings.warn(msg, HysopFFTWarning)
+        self.out = out
+        self.a = a
+        if isinstance(a, HostArray):
+            a = a.handle
+        if isinstance(out, HostArray):
+            out = out.handle
+        return (a, out)
 
     def execute(self):
         """
         Execute plan on current input and output array.
         """
-        out = self.plan.__call__()
+        self.plan.__call__()
         if (self.scaling is not None):
-            out[...] *= self.scaling
-        return out
-        
+            self.output_array[...] *= self.scaling
     
     def __call__(self, a=None, out=None):
         """
@@ -65,12 +86,10 @@ class FftwFFTPlan(FFTPlanI):
         will result in a copy being made.
         Return output array for convenience.
         """
-        self.check_new_inputs(a, out)
+        (a, out) = self.check_new_inputs(a, out)
         self.plan(input_array=a, output_array=out, normalize_idft=True)
-        out = self.plan.output_array
         if (self.scaling is not None):
-            out[...] *= self.scaling
-        return out
+            self.output_array[...] *= self.scaling
 
 
 class FftwFFT(FFTI):
@@ -91,8 +110,11 @@ class FftwFFT(FFTI):
                        planning_timelimit=None,
                        destroy_input=False,
                        warn_on_allocation=True,
-                       warn_on_misalignment=True):
-        super(FftwFFT, self).__init__()
+                       warn_on_misalignment=True,
+                       backend=None,
+                       **kwds):
+        super(FftwFFT, self).__init__(backend=backend, 
+                warn_on_allocation=warn_on_allocation, **kwds)
         self.supported_ftypes = (np.float32, np.float64, np.longdouble)
         self.supported_ctypes = (np.complex64, np.complex128, np.clongdouble)
         self.supported_cosine_transforms = (1,2,3,4)
@@ -103,7 +125,6 @@ class FftwFFT(FFTI):
         self.planning_timelimit = planning_timelimit
         self.destroy_input      = destroy_input
 
-        self.warn_on_allocation   = warn_on_allocation
         self.warn_on_misalignment = warn_on_misalignment
 
     @classmethod
@@ -113,29 +134,15 @@ class FftwFFT(FFTI):
         msg0=msg0.format('{}', pyfftw.simd_alignment)
         if (a is not None) and not pyfftw.is_byte_aligned(array=a):
             msg=msg0.format('Input')
-            warnings.warn(msg, HysopFFTWWarning)
+            warnings.warn(msg, HysopFFTWarning)
         elif (out is not None) and not pyfftw.is_byte_aligned(out): 
             msg=msg0.format('Output')
-            warnings.warn(msg, HysopFFTWWarning)
-
-    def allocate_output(self, out, shape, dtype):
-        """Alocate output if required and check shape and dtype."""
-        if (out is None):
-            if self.warn_on_allocation:
-                nbytes = prod(shape)*dtype.itemsize
-                msg='FftwFFT: allocating aligned output array of size {}.'
-                msg=msg.format(bytes2str(nbytes))
-                warnings.warn(msg, HysopFFTWWarning)
-            out = pyfftw.empty_aligned(shape=shape, dtype=dtype)
-        else:
-            assert out.dtype == dtype
-            assert out.shape == shape
-        return out
+            warnings.warn(msg, HysopFFTWarning)
 
     def bake_kwds(self, **kwds):
         plan_kwds = {}
-        plan_kwds['input_array']        = kwds.pop('a')
-        plan_kwds['output_array']       = kwds.pop('out')
+        plan_kwds['a']                  = kwds.pop('a')
+        plan_kwds['out']                = kwds.pop('out')
         plan_kwds['direction']          = kwds.pop('direction')
         plan_kwds['axes']               = kwds.pop('axes', (kwds.pop('axis'),))
         plan_kwds['threads']            = kwds.pop('threads', self.threads)
@@ -153,7 +160,7 @@ class FftwFFT(FFTI):
             msg='Unknown keyword arguments: {}'
             msg=msg.format(', '.join('\'{}\''.format(kwd) for kwd in kwds.keys()))
             raise RuntimeError(msg)
-
+        
         return plan_kwds
 
     
@@ -199,6 +206,7 @@ class FftwFFT(FFTI):
         return plan
 
     def dct(self, a, out=None, type=2, axis=-1, **kwds):
+        """Planning destroys initial arrays content."""
         (shape, dtype) = super(FftwFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
         if self.warn_on_misalignment:
@@ -210,9 +218,10 @@ class FftwFFT(FFTI):
         return plan
     
     def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
+        """Planning destroys initial arrays content."""
         (shape, dtype, type, s) = super(FftwFFT, self).idct(a=a, out=out, type=type, axis=axis, 
                                         **kwds)
-        scaling = first_not_None(scaling, s)
+        scaling = first_not_None(scaling, 1.0/s)
         out = self.allocate_output(out, shape, dtype) 
         if self.warn_on_misalignment:
             self.check_alignment(a, out)
@@ -223,6 +232,7 @@ class FftwFFT(FFTI):
         return plan
     
     def dst(self, a, out=None, type=2, axis=-1, **kwds):
+        """Planning destroys initial arrays content."""
         (shape, dtype) = super(FftwFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
         if self.warn_on_misalignment:
@@ -234,9 +244,10 @@ class FftwFFT(FFTI):
         return plan
 
     def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
+        """Planning destroys initial arrays content."""
         (shape, dtype, type, s) = super(FftwFFT, self).idst(a=a, out=out, type=type, 
                 axis=axis, **kwds)
-        scaling = first_not_None(scaling, s)
+        scaling = first_not_None(scaling, 1.0/s)
         out = self.allocate_output(out, shape, dtype) 
         if self.warn_on_misalignment:
             self.check_alignment(a, out)
diff --git a/hysop/numerics/fft/gpyfft_fft.py b/hysop/numerics/fft/gpyfft_fft.py
new file mode 100644
index 0000000000000000000000000000000000000000..57a35085c4f90ae7cef8022e0ea28bb7a2ae2d59
--- /dev/null
+++ b/hysop/numerics/fft/gpyfft_fft.py
@@ -0,0 +1,921 @@
+"""
+FFT iterface for fast Fourier Transforms using CLFFT backend.
+:class:`~hysop.numerics.GpyFFT`
+:class:`~hysop.numerics.GpyFFTPlan`
+"""
+
+import warnings, struct
+import numpy as np
+from abc import abstractmethod
+
+from gpyfft.fft import gfft, GFFT, FFT as FFTPlan
+from hysop.numerics.fft.fft import FFTPlanI, FFTI, \
+        mk_shape, float_to_complex_dtype, complex_to_float_dtype
+
+from hysop import vprint
+from hysop.constants import Precision
+from hysop.tools.types import first_not_None
+from hysop.tools.units import bytes2str
+from hysop.tools.misc import prod
+from hysop.tools.warning import HysopWarning
+from hysop.tools.types import first_not_None
+from hysop.tools.numerics import is_complex, is_fp
+
+from hysop.backend.device.opencl import cl, clArray
+from hysop.backend.device.codegen.base.variables import dtype_to_ctype
+from hysop.backend.device.opencl.opencl_array_backend import OpenClArrayBackend
+
+class HysopGpyFftWarning(HysopWarning):
+    pass
+
+
+class GpyFFTPlan(FFTPlanI, FFTPlan):
+    """
+    Build a clFFT plan using the gpyfft python interface.
+    Emit warnings when transform output has an unaligned buffer offset.
+    """
+
+    def __init__(self, cl_env, 
+            in_array, out_array, 
+            axes, direction_forward,
+            scaling=None,
+            scale_by_size=None,
+            warn_on_allocation=True,
+            warn_on_unaligned_output_offset=True):
+        """
+        Wrap gpyfft.FFT to allow more versatile callback settings
+        and buffer allocations.
+        """
+        # we do not want to call FFTPlan.__init__
+        FFTPlanI.__init__(self)
+
+        self.cl_env  = cl_env
+        self.warn_on_allocation = warn_on_allocation
+        self.warn_on_unaligned_output_offset = warn_on_unaligned_output_offset
+
+        self.temp_buffer = None
+        self._setup      = False
+        self._baked      = False
+        self._allocated  = False
+
+        self.in_array  = in_array
+        self.out_array = out_array
+
+        self._setup_kwds = {
+                'in_array': in_array,
+                'out_array': out_array,
+                'axes': axes,
+                'direction_forward': direction_forward,
+                'scaling': scaling,
+                'scale_by_size': scale_by_size
+            }
+
+    def setup(self):
+        super(GpyFFTPlan, self).setup()
+        self.setup_plan(**self._setup_kwds)
+        return self
+
+    def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size):
+        if (scale_by_size is not None):
+            if (scaling is None):
+                scaling = 1.0/scale_by_size
+            else:
+                scaling = scaling / scale_by_size
+
+        axes = np.asarray(axes)
+        assert 0 < axes.size <= 3, axes.size
+
+        # compute strides
+        t_strides_in, t_distance_in, t_batchsize_in, t_shape, axes_transform = \
+                self.calculate_transform_strides(axes, in_array)
+
+        t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \
+                self.calculate_transform_strides(axes, out_array)
+
+        # check axes
+        msg='Error finding transform axis (consider setting axes argument)'
+        assert np.all(axes_transform == axes_transform_out), msg
+
+        # enforce no input and output overlap (unless inplace)
+        t_inplace = False
+        if (in_array.base_data == out_array.base_data):
+            if (in_array.offset < out_array.offset):
+                assert (in_array.offset + in_array.nbytes) < out_array.offset
+            elif (in_array.offset > out_array.offset):
+                assert (out_array.offset + out_array.nbytes) < in_array.offset
+            else:
+                t_inplace = True
+
+        # check input data type
+        if in_array.dtype in (np.float32, np.complex64):
+            t_precision = gfft.CLFFT_SINGLE
+        elif in_array.dtype in (np.float64, np.complex128):
+            t_precision = gfft.CLFFT_DOUBLE
+        else:
+            msg='Unsupported precision {}.'
+            msg=msg.format(in_array.dtype)
+            raise RuntimeError(msg)
+
+        if out_array.dtype in (np.float32, np.complex64):
+            t_precision_out = gfft.CLFFT_SINGLE
+        elif out_array.dtype in (np.float64, np.complex128):
+            t_precision_out = gfft.CLFFT_DOUBLE
+        else:
+            msg='Unsupported precision {}.'
+            msg=msg.format(out_array.dtype)
+            raise RuntimeError(msg)
+
+        if (t_precision != t_precision_out):
+            msg='Incompatible input and output precisions: {} vs {}'
+            msg=msg.format(t_precision, t_precision_out)
+            raise RuntimeError(msg)
+
+        if in_array.dtype in (np.float32, np.float64):
+            layout_in  = gfft.CLFFT_REAL
+            layout_out = gfft.CLFFT_HERMITIAN_INTERLEAVED
+
+            expected_out_shape = list(in_array.shape)
+            expected_out_shape[axes_transform[0]] = \
+                    expected_out_shape[axes_transform[0]]//2 + 1
+            msg='output array shape {} does not match expected shape: {}'
+            msg=msg.format(out_array.shape, expected_out_shape)
+            assert out_array.shape == tuple(expected_out_shape), msg
+        elif in_array.dtype in (np.complex64, np.complex128):
+            if out_array.dtype in (np.complex64, np.complex128):
+                layout_in  = gfft.CLFFT_COMPLEX_INTERLEAVED
+                layout_out = gfft.CLFFT_COMPLEX_INTERLEAVED
+            else:
+                layout_in  = gfft.CLFFT_HERMITIAN_INTERLEAVED
+                layout_out = gfft.CLFFT_REAL
+                t_shape = t_shape_out
+
+        if t_inplace and ((layout_in is gfft.CLFFT_REAL) or
+                          (layout_out is gfft.CLFFT_REAL)):
+            assert ((in_array.strides[axes_transform[0]] == in_array.dtype.itemsize) and \
+                    (out_array.strides[axes_transform[0]] == out_array.dtype.itemsize)), \
+                    'inline real transforms need stride 1 for first transform axis'
+
+        plan = GFFT.create_plan(self.context, t_shape)
+        plan.inplace     = t_inplace
+        plan.strides_in  = t_strides_in
+        plan.strides_out = t_strides_out
+        plan.distances   = (t_distance_in, t_distance_out)
+        plan.batch_size  = t_batchsize_in
+        plan.precision   = t_precision
+        plan.layouts     = (layout_in, layout_out)
+        if (scaling is not None):
+            if direction_forward:
+                plan.scale_forward = scaling
+            else:
+                plan.scale_backward = scaling
+
+        (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, 
+                                                       layout_in, layout_out)
+
+        self.plan      = plan
+        self.in_data   = in_data
+        self.out_data  = out_data
+        self.is_inplace = t_inplace
+        self.direction_forward = direction_forward
+
+    def set_callbacks(self, plan, in_array, out_array, layout_in, layout_out, **kwds):
+        in_data = in_array.base_data
+        pre, user_data = self.pre_offset_callback(in_array, layout_in, **kwds)
+
+        out_data = out_array.base_data
+        post, user_data = self.post_offset_callback(out_array, layout_out, **kwds)
+        
+        # **********************************************************************************
+        # Keep a reference to callback source code to prevent dangling const char* pointers.
+        # Do not remove because clfft only get the pointer and gpyfft does not increase the 
+        # refcount of those strings.
+        self.pre_callback_src  = pre
+        self.post_callback_src = post
+        # **********************************************************************************
+
+        plan.set_callback(u'pre_callback',  pre,  'pre',  user_data=user_data)
+        plan.set_callback(u'post_callback', post, 'post', user_data=user_data)
+        return (in_data, out_data)
+
+    def bake(self, queue=None):
+        if self._baked:
+            msg='Plan was already baked.'
+            raise RuntimeError(msg)
+        queue = first_not_None(queue, self.queue)
+        self.plan.bake(queue)
+        self._baked = True
+        return self
+
+    def allocate(self, buf=None):
+        if self._allocated:
+            msg='Plan was already allocated.'
+            raise RuntimeError(msg)
+        size = self.plan.temp_array_size
+        if (size>0):
+            if (buf is None):
+                if self.warn_on_allocation:
+                    msg='Allocating temporary buffer of size {} for clFFT::{}.'
+                    msg=msg.format(bytes2str(size), id(self))
+                    warnings.warn(msg, HysopWarning)
+                buf = cl.Buffer(self.context, cl.mem_flags.READ_WRITE, size=size)
+                self.temp_buffer = buf
+            elif (buf.size != size):
+                msg='Buffer does not match required size: {} != {}'
+                msg=msg.format(buf.size, size)
+                raise ValueError(msg)
+            else:
+                self.temp_buffer = buf.data
+        else:
+            assert (buf is None)
+        self._allocated = True
+        return self
+
+    def enqueue(self, queue=None, wait_for_events=None):
+        """
+        Enqueue transform with array base_data.
+        """
+        if not self._baked:
+            self.bake(queue)
+        if not self._allocated:
+            self.allocate()
+
+        queue = first_not_None(queue, self.queue)
+        in_data, out_data = self.in_data, self.out_data
+        direction_forward = self.direction_forward
+
+        if self.is_inplace:
+            events = self.plan.enqueue_transform((queue,), 
+                                            (in_data,), 
+                                            direction_forward=direction_forward, 
+                                            temp_buffer=self.temp_buffer, 
+                                            wait_for_events=wait_for_events)
+        else:
+            events = self.plan.enqueue_transform((queue,), 
+                                            (in_data,), (out_data),
+                                            direction_forward=direction_forward, 
+                                            temp_buffer=self.temp_buffer, 
+                                            wait_for_events=wait_for_events)
+        evt, = events
+        return evt
+
+    def enqueue_arrays(self, *args, **kwds):
+        msg='Enqueue arrays is not supported yet.'
+        raise NotImplementedError(msg)
+
+    def execute(self, **kwds):
+        return self.enqueue(**kwds)
+
+    @property
+    def required_buffer_size(self):
+        assert self._baked, 'plan has not been baked yet.'
+        return self.plan.temp_array_size
+
+    @property
+    def ready(self):
+        return (self._baked and self._allocated)
+
+    @property
+    def queue(self):
+        return self.cl_env.default_queue
+
+    @property
+    def context(self):
+        return self.cl_env.context
+
+    @property
+    def input_array(self):
+        return self.in_array
+
+    @property
+    def output_array(self):
+        return self.out_array
+
+    @classmethod
+    def check_dtype(cls, dtype, layout):
+        if layout in (gfft.CLFFT_HERMITIAN_INTERLEAVED, gfft.CLFFT_COMPLEX_INTERLEAVED):
+            if not is_complex(dtype):
+                msg='Layout is {} but got array with dtype {}.'
+                msg=msg.format(layout, dtype)
+                raise RuntimeError(msg)
+        elif layout in (gfft.CLFFT_REAL,):
+            if not is_fp(dtype):
+                msg='Layout is CLFFT_REAL but got array with dtype {}.'
+                msg=msg.format(dtype)
+                raise RuntimeError(msg)
+        else:
+            msg='Unsupported data layout {}.'
+            msg=msg.format(layout)
+            raise NotImplementedError(msg)
+
+    def get_array_offset(self, array):
+        dtype = array.dtype
+        if (array.offset % dtype.itemsize) != 0:
+            msg='Unaligned array offset.'
+            raise RuntimeError(msg)
+        base_offset = (array.offset // dtype.itemsize)
+        return base_offset
+
+    def pre_offset_callback(self, in_array, layout_in, **kwds):
+        dtype = in_array.dtype
+        fp = dtype_to_ctype(dtype)
+        self.check_dtype(dtype, layout_in)
+        base_offset = self.get_array_offset(in_array)
+        callback = \
+        '''{fp} pre_callback(const __global void* input,
+                 const uint offset,
+                 __global void* userdata) {{
+            __global {fp}* in = (__global {fp}*) input; 
+            return in[{base_offset}uL+offset];
+        }}'''.format(fp=fp, base_offset=base_offset)
+
+        return callback, None
+
+    def post_offset_callback(self, out_array, layout_out, **kwds):
+        dtype = out_array.dtype
+        self.check_dtype(dtype, layout_out)
+        fp = dtype_to_ctype(dtype)
+        base_offset = self.get_array_offset(out_array)
+
+        callback = \
+        '''void post_callback(__global void* output,
+                  const uint offset,
+                  __global void* userdata,
+                  const {fp} fftoutput) {{
+            __global {fp}* out = (__global {fp}*) output; 
+            out[{base_offset}uL+offset] = fftoutput;
+        }}'''.format(fp=fp, base_offset=base_offset)
+
+        return callback, None
+
+    def allocate_plans(cls, backend, plans):
+        tmp_size = max(plan.required_buffer_size for plan in plans)
+
+        if (tmp_size>0):
+            msg='Allocating an additional {} temporary buffer for clFFT.'
+            msg=msg.format(bytes2str(tmp_size))
+            vprint(msg)
+            tmp_buffer = backend.empty(shape=(tmp_size), dtype=npw.uint8)
+            for plan in plans:
+                if (plan.required_buffer_size > tmp_buffer.nbytes):
+                    msg='\nFATAL ERROR: Failed to allocate temporary buffer for clFFT.'
+                    msg+='\n => clFFT expected {} bytes but only {} bytes have been allocated.\n'
+                    msg=msg.format(plan.required_buffer_size, tmp_buffer.nbytes)
+                    raise RuntimeError(msg)
+                else:
+                    buf = tmp_buffer[:plan.required_buffer_size]
+                    plan.allocate(buf=buf)
+        else:
+            for plan in plans:
+                assert plan.required_buffer_size == 0
+                plan.allocate()
+            tmp_buffer = None
+        return tmp_buffer
+
+
+class GpyR2RPlan(GpyFFTPlan):
+    """Specialization for real to real transforms built from r2c or c2r transforms."""
+
+    def __init__(self, in_array, out_array, scale_by_size, **kwds):
+        """
+        Handmade R2R transforms rely on a fake output that will
+        never really be written. This output is necessary because
+        clFFT use it as a temporary storage during the FFT computation.
+
+        The real output array will be passed as user data in a post callback.
+        """
+        msg='Incompatible shapes {} vs {}.'.format(in_array.shape, out_array.shape)
+        assert np.array_equal(in_array.shape, out_array.shape), msg
+        msg='Incompatible dtypes {} vs {}.'.format(in_array.dtype, out_array.dtype)
+        assert (in_array.dtype == out_array.dtype)
+        msg='Only single and double precision are supported, got {}.'.format(in_array.dtype)
+        assert in_array.dtype in (np.float32, np.float64), msg
+        
+        scale_by_size = first_not_None(scale_by_size, 1)
+        super(GpyR2RPlan, self).__init__(in_array=in_array, out_array=out_array, 
+                scale_by_size=scale_by_size, **kwds)
+
+    @classmethod
+    def fake_array(cls, shape, dtype, strides=None):
+        class DummyArray(object):
+            def __init__(self, shape, strides, dtype):
+                assert (shape is not None)
+                assert (dtype is not None)
+                dtype = np.dtype(dtype)
+                if (strides is None):
+                    strides = self.compute_strides(shape, dtype)
+                assert len(shape) == len(strides)
+                self.shape = shape
+                self.strides = strides
+                self.dtype = dtype
+
+            @classmethod
+            def compute_strides(cls, shape, dtype):
+                strides = list(shape)[::-1]
+                strides[1:] = np.cumprod(strides[:-1])
+                strides[0] = 1
+                strides = tuple(x*dtype.itemsize for x in strides)
+                return strides
+
+        array = DummyArray(shape=shape, strides=strides, dtype=dtype)
+        return array
+
+    def generate_twiddles(self, name, base, count, typegen, dtype):
+        ctype = float_to_complex_dtype(dtype)
+        fp = dtype_to_ctype(dtype)
+        K = np.arange(count)
+
+        E = np.exp(base*K, dtype=np.complex128)
+        base = '\t\t({fp}2)({}, {})'
+        vals = ',\n'.join(base.format(
+            typegen.dump(x.real), typegen.dump(x.imag), fp=fp) for x in E)
+        twiddles = \
+        '''
+        __constant const {fp}2 {name}[{N}] = {{
+{vals}
+        }};
+        '''.format(fp=fp, name=name, vals=vals, N=count)
+        return twiddles;
+
+    @abstractmethod
+    def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size):
+        """Redefine plan creation behaviour."""
+        pass
+
+
+class GpyDCTIPlan(GpyR2RPlan):
+    def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size):
+        assert in_array.data != out_array.data, 'inplace R2R transforms are not supported.'
+        axis = in_array.ndim - 1
+        assert len(axes)==1
+        assert axes[0] in (-1, axis)
+        axes = np.asarray([axis])
+
+        assert scale_by_size > 0
+
+        # build a fake R2C plan
+        layout_in  = gfft.CLFFT_REAL
+        layout_out = gfft.CLFFT_HERMITIAN_INTERLEAVED
+
+        shape  = in_array.shape
+        dtype  = in_array.dtype
+        fp = dtype_to_ctype(dtype)
+        N  = shape[axis]
+        rshape = mk_shape(shape, axis, 2*N-2)
+        cshape = mk_shape(shape, axis, N)
+        ctype  = float_to_complex_dtype(dtype)
+
+        fake_input  = self.fake_array(shape=rshape, dtype=dtype)
+        fake_output = self.fake_array(shape=cshape, dtype=ctype)
+
+        t_strides_in, t_distance_in, t_batchsize_in, t_shape_in, axes_transform_in = \
+                self.calculate_transform_strides(axes, fake_input)
+        t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \
+                self.calculate_transform_strides(axes, fake_output)
+
+        assert np.array_equal(t_batchsize_in, t_batchsize_out)
+        assert np.array_equal(axes_transform_in, axes_transform_out)
+        t_shape      = t_shape_in
+        t_batch_size = t_batchsize_in
+        t_inplace    = False
+
+        plan = GFFT.create_plan(self.context, t_shape)
+        plan.inplace     = t_inplace
+        plan.batch_size  = t_batch_size
+        plan.strides_in  = t_strides_in
+        plan.strides_out = t_strides_out
+        plan.distances   = (t_distance_in, t_distance_out)
+        plan.layouts     = (layout_in, layout_out)
+        if (dtype == np.float32):
+            plan.precision = gfft.CLFFT_SINGLE
+        else:
+            plan.precision = gfft.CLFFT_DOUBLE
+        assert scaling is None
+        if (scaling is not None):
+            if direction_forward:
+                plan.scale_forward = scaling
+            else:
+                plan.scale_backward = scaling
+
+        (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, 
+                                                       layout_in, layout_out,
+                                                       N=N, fp=fp, S=scale_by_size)
+
+        self.in_data    = in_data
+        self.out_data   = out_data
+        self.is_inplace = t_inplace
+        self.direction_forward = direction_forward
+        self.plan = plan
+
+    def pre_offset_callback(self, in_array, layout_in, N, fp, S):
+        base_offset = self.get_array_offset(in_array)
+        pre = \
+        '''{fp} pre_callback(const __global void* input, uint k,
+                 __global void* userdata) {{
+            __global {fp}* in = (__global {fp}*)(input) + {base_offset}uL;
+            in += k/(2*{N}-2) * {N};
+            {fp} ret;
+            k = k%(2*{N}-2);
+            if (k<{N}) {{
+                ret = in[k];
+            }}
+            else {{
+                ret = in[2*{N}-k-2];
+            }}
+            return ret;
+        }}'''.format(N=N, fp=fp, base_offset=base_offset)
+        return pre, None
+
+    def post_offset_callback(self, out_array, layout_out, N, fp, S):
+        base_offset = self.get_array_offset(out_array)
+        post = \
+        '''void post_callback(__global void* output, uint k,
+                  __global void* userdata, {fp}2 fftoutput) {{
+            __global {fp}* out = (__global {fp}*)(output) + {base_offset}uL;
+            out[k] = fftoutput.x/{S};
+        }}'''.format(N=N, S=S, fp=fp, base_offset=base_offset)
+        return post, None
+
+
+class GpyDCTIIPlan(GpyR2RPlan):
+    def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size):
+        assert in_array.data != out_array.data, 'inplace R2R transforms are not supported.'
+        axis = in_array.ndim - 1
+        assert len(axes)==1
+        assert axes[0] in (-1, axis)
+        axes = np.asarray([axis])
+        
+        assert scale_by_size > 0
+
+        # build a fake R2C plan
+        layout_in  = gfft.CLFFT_REAL
+        layout_out = gfft.CLFFT_HERMITIAN_INTERLEAVED
+
+        shape  = in_array.shape
+        dtype  = in_array.dtype
+        fp = dtype_to_ctype(dtype)
+        N  = shape[axis]
+        rshape = mk_shape(shape, axis, N)
+        cshape = mk_shape(shape, axis, N//2+1)
+        ctype  = float_to_complex_dtype(dtype)
+
+        fake_input  = self.fake_array(shape=rshape, dtype=dtype)
+        fake_output = self.fake_array(shape=cshape, dtype=ctype)
+
+        t_strides_in, t_distance_in, t_batchsize_in, t_shape_in, axes_transform_in = \
+                self.calculate_transform_strides(axes, fake_input)
+        t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \
+                self.calculate_transform_strides(axes, fake_output)
+
+        assert np.array_equal(t_batchsize_in, t_batchsize_out)
+        assert np.array_equal(axes_transform_in, axes_transform_out)
+        t_shape      = t_shape_in
+        t_batch_size = t_batchsize_in
+        t_inplace    = False
+
+        plan = GFFT.create_plan(self.context, t_shape)
+        plan.inplace     = t_inplace
+        plan.batch_size  = t_batch_size
+        plan.strides_in  = t_strides_in
+        plan.strides_out = t_strides_out
+        plan.distances   = (t_distance_in, t_distance_out)
+        plan.layouts     = (layout_in, layout_out)
+        if (dtype == np.float32):
+            plan.precision = gfft.CLFFT_SINGLE
+            precision = Precision.FLOAT
+        else:
+            plan.precision = gfft.CLFFT_DOUBLE
+            precision = Precision.DOUBLE
+        if (scaling is not None):
+            if direction_forward:
+                plan.scale_forward  = scaling
+            else:
+                plan.scale_backward = scaling
+
+        typegen = self.cl_env.build_typegen(precision=precision,
+                float_dump_mode='hex', use_short_circuit_ops=False, unroll_loops=False) 
+
+        (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, 
+                                                       layout_in, layout_out,
+                                                       N=N, S=scale_by_size,
+                                                       fp=fp, typegen=typegen)
+
+        self.in_data    = in_data
+        self.out_data   = out_data
+        self.is_inplace = t_inplace
+        self.direction_forward = direction_forward
+        self.plan = plan
+
+    def pre_offset_callback(self, in_array, layout_in, N, S, fp, typegen):
+        base_offset = self.get_array_offset(in_array)
+        n = (N-1)//2 + 1
+        pre = \
+        '''
+        {fp} pre_callback(const __global void* input, uint k, __global void* userdata) {{
+            __global {fp}* in = (__global {fp}*)(input) + {base_offset}uL;
+            {fp} ret;
+            if (k<{n}) {{
+                ret = in[2*k];
+            }}
+            else {{
+                ret = in[2*({N}-k)-1];
+            }}
+            return ret;
+        }}'''.format(n=n, N=N, fp=fp, base_offset=base_offset)
+        return pre, None
+
+    def post_offset_callback(self, out_array, layout_out, N, S, fp, typegen):
+        base_offset = self.get_array_offset(out_array)
+        n = (N-1)//2 + 1
+        twiddles = self.generate_twiddles('dct2_twiddles', base=-1.0j*np.pi/(2*N), count=N//2+1,
+                                            dtype=out_array.dtype, typegen=typegen)
+        post = \
+        '''
+        {twiddles}
+        void post_callback(__global void* output, uint k, __global void* userdata, {fp}2 R) {{
+            __global {fp}* out = (__global {fp}*)(output) + {base_offset}uL;
+            //const {fp}2 T = ({fp}2)(cos(-pi*k/(2*N)), sin(-pi*k/(2*N)));
+            const {fp}2 T = dct2_twiddles[k];
+            if (k < {n}) {{
+                out[k] = +2*(R.x*T.x - R.y*T.y)/{S};
+            }}
+            if (k > 0) {{
+                out[{N}-k] = -2*(R.x*T.y + R.y*T.x)/{S};
+            }}
+        }}'''.format(N=N, S=S, n=n, fp=fp, base_offset=base_offset, 
+                twiddles=twiddles)
+        return post, None
+
+
+class GpyDCTIIIPlan(GpyR2RPlan):
+    def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size):
+        assert in_array.data != out_array.data, 'inplace R2R transforms are not supported.'
+        axis = in_array.ndim - 1
+        assert len(axes)==1
+        assert axes[0] in (-1, axis)
+        axes = np.asarray([axis])
+
+        assert scale_by_size>0
+
+        # build a fake R2C plan
+        layout_in  = gfft.CLFFT_HERMITIAN_INTERLEAVED
+        layout_out = gfft.CLFFT_REAL
+
+        shape  = in_array.shape
+        dtype  = in_array.dtype
+        fp = dtype_to_ctype(dtype)
+        N  = shape[axis]
+        rshape = mk_shape(shape, axis, N)
+        cshape = mk_shape(shape, axis, N//2+1)
+        ctype  = float_to_complex_dtype(dtype)
+
+        fake_input  = self.fake_array(shape=cshape, dtype=ctype)
+        fake_output = self.fake_array(shape=rshape, dtype=dtype)
+
+        t_strides_in, t_distance_in, t_batchsize_in, t_shape_in, axes_transform_in = \
+                self.calculate_transform_strides(axes, fake_input)
+        t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \
+                self.calculate_transform_strides(axes, fake_output)
+
+        assert np.array_equal(t_batchsize_in, t_batchsize_out)
+        assert np.array_equal(axes_transform_in, axes_transform_out)
+        t_shape      = t_shape_out
+        t_batch_size = t_batchsize_in
+        t_inplace    = False
+
+        plan = GFFT.create_plan(self.context, t_shape)
+        plan.inplace     = t_inplace
+        plan.batch_size  = t_batch_size
+        plan.strides_in  = t_strides_in
+        plan.strides_out = t_strides_out
+        plan.distances   = (t_distance_in, t_distance_out)
+        plan.layouts     = (layout_in, layout_out)
+        if (dtype == np.float32):
+            plan.precision = gfft.CLFFT_SINGLE
+            precision = Precision.FLOAT
+        else:
+            plan.precision = gfft.CLFFT_DOUBLE
+            precision = Precision.DOUBLE
+        if (scaling is not None):
+            plan.scale_forward = scaling/float(N)
+            plan.scale_backward = scaling/float(N)
+
+        typegen = self.cl_env.build_typegen(precision=precision,
+                float_dump_mode='hex', use_short_circuit_ops=False, unroll_loops=False) 
+
+        (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, 
+                                                       layout_in, layout_out,
+                                                       N=N, S=scale_by_size,
+                                                       fp=fp, typegen=typegen)
+
+        self.in_data    = in_data
+        self.out_data   = out_data
+        self.is_inplace = t_inplace
+        self.direction_forward = direction_forward
+        self.plan = plan
+
+    def pre_offset_callback(self, in_array, layout_in, N, S, fp, typegen):
+        base_offset = self.get_array_offset(in_array)
+        twiddles = self.generate_twiddles('dct3_twiddles', base=+1.0j*np.pi/(2*N), count=N//2+1,
+                                            dtype=in_array.dtype, typegen=typegen)
+        pre = \
+        '''
+        {twiddles}
+        {fp}2 pre_callback(const __global void* input, uint k, __global void* userdata) {{
+            __global {fp}* in = (__global {fp}*)(input) + {base_offset}uL;
+            const {fp}2 T = ({fp}2)(cos({pi}*k/(2*{N})), sin({pi}*k/(2*{N})));
+            //const {fp}2 T = dct3_twiddles[k];
+            {fp}2 C, R;
+            R.x = in[k];
+            if ( k == 0 ) {{
+                R.y = 0;
+            }}
+            else {{
+                R.y = -in[{N}-k];
+            }}
+            C.x = R.x*T.x - R.y*T.y;
+            C.y = R.x*T.y + R.y*T.x;
+            return C;
+        }}'''.format(N=N, fp=fp, base_offset=base_offset,
+                pi=typegen.dump(np.pi), twiddles=twiddles)
+        return pre, None
+
+    def post_offset_callback(self, out_array, layout_out, N, S, fp, typegen):
+        base_offset = self.get_array_offset(out_array)
+        n = (N-1)//2 + 1
+        post = \
+        '''
+        void post_callback(__global void* output, uint k, __global void* userdata, {fp} R) {{
+            __global {fp}* out = (__global {fp}*)(output) + {base_offset}uL;
+            if (k < {n}) {{
+                out[2*k] = {N}*R/{S};
+            }}
+            else {{
+                out[2*({N}-k)-1] = {N}*R/{S};
+            }}
+        }}'''.format(N=N, S=S, n=n, fp=fp, base_offset=base_offset)
+        return post, None
+
+
+
+class GpyDSTIPlan(GpyR2RPlan):
+    pass
+class GpyDSTIIPlan(GpyR2RPlan):
+    pass
+class GpyDSTIIIPlan(GpyR2RPlan):
+    pass
+
+
+class GpyFFT(FFTI):
+    """
+    Interface to compute local to process FFT-like transforms using the clFFT backend
+    trough the gpyfft python interface.
+
+    clFFT backend has many advantages: 
+        - single and double precision supported
+        - no intermediate temporary buffers created at each call.
+        - all required temporary buffers can be supplied or are auto-allocated only once.
+        - planning capability but no caching capabilities
+        - injection of custom opencl code for pre and post processing.
+
+    Planning may destroy initial arrays content.
+    Executing a plan may result in unwanted writes to output data, see notes.
+
+    Notes
+    -----
+    Output array is used during transform and if out.data is not aligned 
+    on device.MEM_BASE_ADDR_ALIGN the begining of the buffer may be overwritten by 
+    intermediate transform results.
+
+    out.data = out.base_data + out.offset 
+    if (offset%alignment > 0)
+        out.base_data[0:out.size] 
+        will be trashed during computation and the result of the transform will go to 
+        out.base_data[out.offset:out.offset+out.size]
+
+    Thus for every transforms out.base_data[0:min(out.offset,out.size)] may be overwritten with trash data.
+    The default behaviour is to warn when output data is not aligned on device memory boundary.
+    """
+
+    def __init__(self, cl_env, backend=None,
+                       warn_on_allocation=True,
+                       warn_on_unaligned_output_offset=True):
+
+        if (backend is None):
+            backend = OpenClArrayBackend.get_or_create(cl_env=cl_env, 
+                            queue=cl_env.default_queue, allocator=None)
+
+        super(GpyFFT, self).__init__(backend=backend)
+        self.supported_ftypes = (np.float32, np.float64)
+        self.supported_ctypes = (np.complex64, np.complex128)
+        self.supported_cosine_transforms = (1,2,3)
+        self.supported_sine_transforms   = ()
+
+        self.cl_env  = cl_env
+        self.warn_on_allocation = warn_on_allocation
+        self.warn_on_unaligned_output_offset = warn_on_unaligned_output_offset
+
+    def allocate_output(self, out, shape, dtype):
+        """Alocate output if required and check shape and dtype."""
+        if (out is None):
+            if self.warn_on_allocation:
+                nbytes = prod(shape)*dtype.itemsize
+                msg='GpyFFT: allocating output array of size {}.'
+                msg=msg.format(bytes2str(nbytes))
+                warnings.warn(msg, HysopGpyFftWarning)
+            out = self.backend.empty(shape=shape, dtype=dtype)
+        else:
+            assert out.dtype == dtype
+            assert out.shape == shape
+        return out
+
+    def bake_kwds(self, **kwds):
+        plan_kwds = {}
+        plan_kwds['in_array']           = kwds.pop('a')
+        plan_kwds['out_array']          = kwds.pop('out')
+        plan_kwds['scaling']            = kwds.pop('scaling', None)
+        plan_kwds['scale_by_size']      = kwds.pop('scale_by_size', None)
+        plan_kwds['direction_forward']  = kwds.pop('direction_forward', True)
+        plan_kwds['axes']               = kwds.pop('axes', (kwds.pop('axis'),))
+        plan_kwds['cl_env']             = kwds.pop('cl_env',  self.cl_env)
+        plan_kwds['warn_on_allocation'] = kwds.pop('warn_on_allocation', self.warn_on_allocation)
+        plan_kwds['warn_on_unaligned_output_offset'] = \
+                kwds.pop('warn_on_unaligned_output_offset', 
+                        self.warn_on_unaligned_output_offset)
+
+        if kwds:
+            msg='Unknown keyword arguments: {}'
+            msg=msg.format(', '.join('\'{}\''.format(kwd) for kwd in kwds.keys()))
+            raise RuntimeError(msg)
+
+        return plan_kwds
+
+    def fft(self, a, out=None, axis=-1, **kwds):
+        (shape, dtype) = super(GpyFFT, self).fft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds)
+        plan = GpyFFTPlan(**kwds)
+        return plan
+
+    def ifft(self, a, out=None, axis=-1, **kwds):
+        (shape, dtype) = super(GpyFFT, self).ifft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, direction_forward=False, **kwds)
+        plan = GpyFFTPlan(**kwds)
+        return plan
+
+    def rfft(self, a, out=None, axis=-1, **kwds):
+        (shape, dtype) = super(GpyFFT, self).rfft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds)
+        plan = GpyFFTPlan(**kwds)
+        return plan
+
+    def irfft(self, a, out=None, n=None, axis=-1, **kwds):
+        (shape, dtype) = super(GpyFFT, self).irfft(a=a, out=out, axis=axis, 
+                                                        n=n, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds)
+        plan = GpyFFTPlan(**kwds)
+        return plan
+
+    def dct(self, a, out=None, type=2, axis=-1, **kwds):
+        (shape, dtype) = super(GpyFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds)
+        if type==1:
+            plan = GpyDCTIPlan(**kwds)
+        elif type==2:
+            plan = GpyDCTIIPlan(**kwds)
+        elif type==3:
+            plan = GpyDCTIIIPlan(**kwds)
+        else:
+            msg='Unimplemented cosine transform type {}'.format(itype)
+            raise RuntimeError(msg)
+        return plan
+
+    def idct(self, a, out=None, type=2, axis=-1, **kwds):
+        (shape, dtype, itype, logical_size) = super(GpyFFT, self).idct(a=a, out=out, type=type,
+                axis=axis, **kwds)
+        return self.dct(a=a, out=out, type=itype, axis=axis, scale_by_size=logical_size, **kwds)
+
+    def dst(self, a, out=None, type=2, axis=-1, **kwds):
+        (shape, dtype) = super(GpyFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds)
+        if type==1:
+            plan = GpyDSTIPlan(**kwds)
+        elif type==2:
+            plan = GpyDSTIIPlan(**kwds)
+        elif type==3:
+            plan = GpyDSTIIIPlan(**kwds)
+        else:
+            msg='Unimplemented sine transform type {}'.format(itype)
+            raise RuntimeError(msg)
+        return plan
+
+    def idst(self, a, out=None, type=2, axis=-1, **kwds):
+        (shape, dtype, itype, logical_size) = super(GpyFFT, self).idst(a=a, out=out, type=type, 
+                axis=axis, **kwds)
+        kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds)
+        return self.dst(a=a, out=out, type=itype, axis=axis, scale_by_size=logical_size, **kwds)
+
diff --git a/hysop/numerics/fft/numpy_fft.py b/hysop/numerics/fft/numpy_fft.py
index 964b04ba246726377f6d0bcaee9d250a54e84643..e49713d66cb72804c1ca17f27abf48ee3b1f6132 100644
--- a/hysop/numerics/fft/numpy_fft.py
+++ b/hysop/numerics/fft/numpy_fft.py
@@ -9,10 +9,10 @@ from numpy import fft as _FFT
 
 from hysop.tools.types import first_not_None
 from hysop.numerics.fft.fft import FFTPlanI, FFTI, \
-            mk_view, mk_shape,                     \
-            complex_to_float_dtype, float_to_complex_dtype
+            complex_to_float_dtype, float_to_complex_dtype, \
+            mk_view, mk_shape, HostArray
 
-def dct(a, type=2, axis=-1):
+def dct(a, out=None, type=2, axis=-1):
     ndim  = a.ndim
     shape = a.shape
     N     = a.shape[axis]
@@ -24,8 +24,13 @@ def dct(a, type=2, axis=-1):
         X = np.empty(shape=s0, dtype=a.dtype)
         np.concatenate((a, a[slc0][slc1]), axis=axis, out=X)
         res = _FFT.rfft(X, axis=axis).real
+        if (out is None):
+            out = res
+        else:
+            assert out.shape == res.shape
+            out[...] = res
     elif (type==2):
-        # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory
+        # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
         n0 = N//2 + 1
         n1 = (N-1)//2 + 1
         slc0  = mk_view(ndim, axis, 1,    None, None)
@@ -41,11 +46,14 @@ def dct(a, type=2, axis=-1):
         X = _FFT.rfft(X, axis=axis)
         X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4]
         
-        res = np.empty_like(a)
-        res[slc5] = +X.real[slc5]
-        res[slc6] = -X.imag[slc0][slc3]
+        if (out is None):
+            out = np.empty_like(a)
+        else:
+            assert out.shape == a.shape
+        out[slc5] = +X.real[slc5]
+        out[slc6] = -X.imag[slc0][slc3]
     elif (type==3):
-        # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory
+        # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
         ctype = float_to_complex_dtype(a.dtype)
         n0 = N//2 + 1
         n1 = (N-1)//2 + 1
@@ -66,17 +74,20 @@ def dct(a, type=2, axis=-1):
         X = _FFT.irfft(X, axis=axis, n=N)
         X *= N
 
-        res = np.empty_like(a)
-        res[slc4] = X[slc5]
-        res[slc6] = X[slc2][slc3]
+        if (out is None):
+            out = np.empty_like(a)
+        else:
+            assert out.shape == a.shape
+        out[slc4] = X[slc5]
+        out[slc6] = X[slc2][slc3]
     else:
         stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']
         msg='DCT-{} has not been implemented yet.'
         msg=msg.format(stypes[type-1])
         raise NotImplementedError(msg)
-    return res
+    return out
 
-def dst(a, type=2, axis=-1):
+def dst(a, out=None, type=2, axis=-1):
     ndim  = a.ndim
     shape = a.shape
     N     = a.shape[axis]
@@ -90,9 +101,13 @@ def dst(a, type=2, axis=-1):
         Z = np.zeros(shape=s1, dtype=a.dtype)
         np.concatenate((Z, -a, Z, a[slc0]), axis=axis, out=X)
         res = _FFT.rfft(X, axis=axis).imag
-        res = res[slc1]
+        if (out is None):
+            out = np.empty_like(a)
+        else:
+            assert out.shape == a.shape
+        out[...] = res[slc1]
     elif (type==2):
-        # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory
+        # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
         n0 = N//2 + 1
         n1 = (N-1)//2 + 1
         slc0  = mk_view(ndim, axis, 1,    None, None)
@@ -109,54 +124,55 @@ def dst(a, type=2, axis=-1):
         X = _FFT.rfft(X, axis=axis)
         X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4]
         
-        res = np.empty_like(a)
-        res[slc5] = -X.imag[slc7]
-        res[slc6] = +X.real[slc3]
+        if (out is None):
+            out = np.empty_like(a)
+        else:
+            assert out.shape == a.shape
+        out[slc5] = -X.imag[slc7]
+        out[slc6] = +X.real[slc3]
     elif (type==3):
-        # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory
+        # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
         ctype = float_to_complex_dtype(a.dtype)
         n0 = N//2 + 1
         n1 = (N-1)//2 + 1
         slc0  = mk_view(ndim, axis, None, n0,   None)
-        slc1  = mk_view(ndim, axis, 1   , None, None)
-        slc2  = mk_view(ndim, axis, n1  , None, None)
-        slc3  = mk_view(ndim, axis, None, None, -1)
-        slc4  = mk_view(ndim, axis, None, None, +2)
-        slc5  = mk_view(ndim, axis, None, n1,   None)
-        slc6  = mk_view(ndim, axis, 1,    None, +2)
-        slc7  = mk_view(ndim, axis, None, None, None, default=None)
+        slc1  = mk_view(ndim, axis, None, None, -1)
+        slc2  = mk_view(ndim, axis, 1,    None, None)
+        slc3  = mk_view(ndim, axis, None, N-n1, None)
+        slc4  = mk_view(ndim, axis, None, None, None, default=None)
+        slc5  = mk_view(ndim, axis, None, None, 2)
+        slc6  = mk_view(ndim, axis, None, n1,   None)
+        slc7  = mk_view(ndim, axis, 1,    None, 2)
+        slc8  = mk_view(ndim, axis, n1,   None, None)
         s0    = mk_shape(shape, axis, n0)
 
         X = np.zeros(shape=s0, dtype=ctype)
-        X.real[slc0] = +a[slc0]
-        X.imag[slc1] = -a[slc2][slc3]
-        X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc7]
+        X.real[slc0] = +a[slc1][slc0]
+        X.imag[slc2] = -a[slc3]
+        X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc4]
         X = _FFT.irfft(X, axis=axis, n=N)
-        X *= N
+        X[...] *= N
 
-        res = np.empty_like(a)
-        res[slc4] = X[slc5]
-        res[slc6] = X[slc2][slc3]
+        if (out is None):
+            out = np.empty_like(a)
+        else:
+            assert out.shape == a.shape
+        out[slc5] = +X[slc6]
+        out[slc7] = -X[slc8][slc1]
     else:
         stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']
         msg='DCT-{} has not been implemented yet.'
         msg=msg.format(stypes[type-1])
         raise NotImplementedError(msg)
-    return res
+    return out
 
-def idct(a, type=2, axis=-1, **kwds):
+def idct(a, out=None, type=2, axis=-1, **kwds):
     itype = [1,3,2,4][type-1]
-    return dct(a=a, type=itype, axis=axis, **kwds)
+    return dct(a=a, out=out, type=itype, axis=axis, **kwds)
 
-def idst(a, type=2, axis=-1, **kwds):
+def idst(a, out=None, type=2, axis=-1, **kwds):
     itype = [1,3,2,4][type-1]
-    return dst(a=a, type=itype, axis=axis, **kwds)
-
-# extend numpy fft interface with R2R transforms
-for (fname,fn) in zip( ('dct', 'dst', 'idct', 'idst'),
-                       (dct, dst, idct, idst) ):
-    if not hasattr(_FFT, fname):
-        setattr(_FFT, fname, fn)
+    return dst(a=a, out=out, type=itype, axis=axis, **kwds)
 
 
 class NumpyFFTPlan(FFTPlanI):
@@ -164,58 +180,44 @@ class NumpyFFTPlan(FFTPlanI):
     Wrap a numpy fft call (numpy.fft does not offer real planning capabilities). 
     """
 
-    def __init__(self, fn, out, 
-                    scaling=None,
-                    output_dtype=lambda dtype: dtype, 
-                    **kwds):
+    def __init__(self, fn, a, out, scaling=None, **kwds):
         super(NumpyFFTPlan, self).__init__()
-        assert callable(output_dtype)
-        self.fn   = fn
-        self.out  = out
-        self.kwds = kwds.copy()
-        self.scaling = scaling
-        self.output_dtype = output_dtype
+        
+        self.fn           = fn
+        self.a            = a
+        self.out          = out
+        self.scaling      = scaling
+
+        if isinstance(a, HostArray):
+            a = a.handle
+        if isinstance(out, HostArray):
+            out = out.handle
+    
+        kwds = kwds.copy()
+        kwds['a'] = a
+        if fn in (dct, idct, dst, idst):
+            kwds['out'] = out
+        self.kwds         = kwds
+
+    @property
+    def input_array(self):
+        return self.a
+
+    @property
+    def output_array(self):
+        return self.out
 
     def execute(self):
         out     = self.out
         scaling = self.scaling
-
-        res = self.fn(**self.kwds)
-
-        if (scaling is not None):
-            res[...] *= scaling
-
-        if (out is None):
-            # explicit cast
-            cast = self.output_dtype(self.kwds['a'].dtype)
-            return res.astype(cast)
+        
+        if self.fn in (dct, idct, dst, idst):
+            self.fn(**self.kwds)
         else:
-            # implicit cast (during copy)
-            out[...] = res
-            return out
-
-    def __call__(self, out=None, **kwds):
-        fn_kwds = self.kwds.copy()
-        fn_kwds.update(kwds)
-        if (self.fn is _FFT.irfft):
-            assert 'n' in fn_kwds
+            out[...] = self.fn(**self.kwds)
 
-        out     = first_not_None(out, self.out)
-        scaling = self.scaling
-
-        res = self.fn(**fn_kwds)
-        
         if (scaling is not None):
-            res[...] *= scaling
-
-        if (out is None):
-            # explicit cast
-            cast = self.output_dtype(fn_kwds['a'].dtype)
-            return res.astype(cast)
-        else:
-            # implicit cast (during copy)
-            out[...] = res
-            return out
+            out[...] *= scaling
 
 
 class NumpyFFT(FFTI):
@@ -231,55 +233,63 @@ class NumpyFFT(FFTI):
     The only advantage is that planning won't destroy original inputs.
     """
 
-    def __init__(self):
-        super(NumpyFFT, self).__init__()
+    def __init__(self, backend=None, **kwds):
+        super(NumpyFFT, self).__init__(backend=backend, **kwds)
         self.supported_ftypes = (np.float32, np.float64,)
         self.supported_ctypes = (np.complex64, np.complex128,)
     
+    
     def fft(self, a, out=None, axis=-1, **kwds):
-        super(NumpyFFT, self).fft(a=a, out=out, axis=axis, **kwds)
+        (shape, dtype) = super(NumpyFFT, self).fft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = NumpyFFTPlan(fn=_FFT.fft, a=a, out=out, axis=axis, **kwds)
         return plan
 
     def ifft(self, a, out=None, axis=-1, **kwds):
-        super(NumpyFFT, self).ifft(a=a, out=out, axis=axis, **kwds)
+        (shape, dtype) = super(NumpyFFT, self).ifft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = NumpyFFTPlan(fn=_FFT.ifft, a=a, out=out, axis=axis, **kwds)
         return plan
 
     def rfft(self, a, out=None, axis=-1, **kwds):
-        super(NumpyFFT, self).rfft(a=a, out=out, axis=axis, **kwds)
+        (shape, dtype) = super(NumpyFFT, self).rfft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = NumpyFFTPlan(fn=_FFT.rfft, a=a, out=out, axis=axis, 
-                            output_dtype=lambda x: float_to_complex_dtype(x),
                             **kwds)
         return plan
 
     def irfft(self, a, out=None, n=None, axis=-1, **kwds):
-        (rshape, _) = super(NumpyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds)
+        (shape, dtype) = super(NumpyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = NumpyFFTPlan(fn=_FFT.irfft, a=a, out=out, axis=axis, 
-                            output_dtype=lambda x: complex_to_float_dtype(x),
-                            n=rshape[axis], **kwds)
+                            n=shape[axis], **kwds)
         return plan
     
     def dct(self, a, out=None, type=2, axis=-1, **kwds):
         (shape, dtype) = super(NumpyFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds)
-        plan = NumpyFFTPlan(fn=_FFT.dct, a=a, out=out, axis=axis, type=type, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        plan = NumpyFFTPlan(fn=dct, a=a, out=out, axis=axis, type=type, **kwds)
         return plan
     
     def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
         (shape, dtype, _, s) = super(NumpyFFT, self).idct(a=a, out=out, type=type, 
                                     axis=axis, **kwds)
-        plan = NumpyFFTPlan(fn=_FFT.idct, a=a, out=out, axis=axis, type=type, 
-                                scaling=first_not_None(scaling, s), **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        plan = NumpyFFTPlan(fn=idct, a=a, out=out, axis=axis, type=type, 
+                                scaling=first_not_None(scaling, 1.0/s), **kwds)
         return plan
     
     def dst(self, a, out=None, type=2, axis=-1, **kwds):
         (shape, dtype) = super(NumpyFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds)
-        plan = NumpyFFTPlan(fn=_FFT.dst, a=a, out=out, axis=axis, type=type, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        plan = NumpyFFTPlan(fn=dst, a=a, out=out, axis=axis, type=type, **kwds)
         return plan
 
     def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
-        (shape, dtype, _, s) = super(NumpyFFT, self).idst(a=a, out=out, type=type, axis=axis, **kwds)
-        plan = NumpyFFTPlan(fn=_FFT.idst, a=a, out=out, axis=axis, type=type,
-                                scaling=first_not_None(scaling, s), **kwds)
+        (shape, dtype, _, s) = super(NumpyFFT, self).idst(a=a, out=out, type=type, axis=axis, 
+                **kwds)
+        out = self.allocate_output(out, shape, dtype) 
+        plan = NumpyFFTPlan(fn=idst, a=a, out=out, axis=axis, type=type,
+                                scaling=first_not_None(scaling, 1.0/s), **kwds)
         return plan
 
diff --git a/hysop/numerics/fft/scipy_fft.py b/hysop/numerics/fft/scipy_fft.py
index c1d5fb68fe3f9c1e31b67156b95c1d7afbd45883..cf7c76042240e34f924f80f0b442836a1333114b 100644
--- a/hysop/numerics/fft/scipy_fft.py
+++ b/hysop/numerics/fft/scipy_fft.py
@@ -9,7 +9,9 @@ import scipy as sp
 from scipy import fftpack as _FFT
 
 from hysop.tools.types import first_not_None
-from hysop.numerics.fft.fft import FFTPlanI, FFTI, complex_to_float_dtype, float_to_complex_dtype
+from hysop.numerics.fft.fft import FFTPlanI, FFTI, \
+        complex_to_float_dtype, float_to_complex_dtype, \
+        mk_shape, mk_view, HostArray
 
 
 class ScipyFFTPlan(FFTPlanI):
@@ -17,74 +19,97 @@ class ScipyFFTPlan(FFTPlanI):
     Wrap a scipy fftpack call (scipy.fftpack does not offer real planning capabilities). 
     """
 
-    def __init__(self, fn, out, 
+    def __init__(self, fn, x, out, axis,
                     scaling=None,
                     **kwds):
         super(ScipyFFTPlan, self).__init__()
         self.fn      = fn
+        self.x       = x
         self.out     = out
+        self.axis    = axis
         self.scaling = scaling
-        self.kwds = kwds.copy()
 
-    def execute(self):
-        return self.__call__()
+        kwds = kwds.copy()
+        kwds['x']    = x
+        kwds['axis'] = axis
+        self.kwds = kwds
+
+    @property
+    def input_array(self):
+        return self.x
+
+    @property
+    def output_array(self):
+        return self.out
 
-    def __call__(self, out=None, **kwds):
+    def execute(self):
         fn = self.fn
         fn_kwds = self.kwds.copy()
-        fn_kwds.update(kwds)
         if (fn is _FFT.irfft):
             assert 'n' in fn_kwds
         
-        in_  = fn_kwds['x']
-        axis = fn_kwds['axis']
+        x   = self.x
+        out = self.out
+        if isinstance(x, HostArray):
+            x = x.handle
+        if isinstance(out, HostArray):
+            out = out.handle
+
+        axis = self.axis
+        mkv = lambda *args, **kwds: mk_view(x.ndim, axis, *args, **kwds)
 
         if fn is _FFT.irfft:
-            assert axis in (in_.ndim-1, -1)
-            in_ = in_.view(dtype=complex_to_float_dtype(in_.dtype))
+            x = x.view(dtype=complex_to_float_dtype(x.dtype))
             is_even = (fn_kwds['n']%2==0)
-            rshape = list(in_.shape)
-            rshape[-1] -= 1 + is_even
-            _in = np.empty(shape=rshape, dtype=in_.dtype)
-            _in[...,1:] = in_[...,2:in_.shape[-1]-is_even]
-            _in[...,0]  = in_[...,0]
+            rshape = list(x.shape)
+            rshape[axis] -= (1 + is_even)
+            _in = np.empty(shape=rshape, dtype=x.dtype)
+            _in[mkv(1,None)] = x[mkv(2,x.shape[axis]-is_even)]
+            _in[mkv(0)]  = x[mkv(0)]
             fn_kwds['x'] = _in
         
         # custom implementation of DCT-I and DST-I
         # default scipy FFTPACK implementation has optimal complexity
         # but O(sqrt(N)) error.
         if fn in (_FFT.dct, _FFT.idct) and fn_kwds['type']==1:
-            assert axis in (in_.ndim-1, -1)
-            N = in_.shape[axis]
-            X = np.hstack((in_, in_[1:-1][::-1])).astype(in_.dtype)
+            N     = x.shape[axis]
+            shape = x.shape
+            ndim  = x.ndim
+            slc0  = mk_view(ndim, axis, 1, -1)
+            slc1  = mk_view(ndim, axis, None, None, -1)
+            slc2  = mk_view(ndim, axis, 2, N, None)
+            slc3  = mk_view(ndim, axis, 3, None, 2)
+            slc4  = mk_view(ndim, axis, None, N, None)
+            s0    = mk_shape(shape, axis, 2*N-2)
+            X  = np.empty(shape=s0, dtype=x.dtype)
+            np.concatenate((x, x[slc0][slc1]), axis=axis, out=X)
             res = _FFT.rfft(X, axis=axis)
-            res[2:N] = res[3::2]
-            res = res[:N]
+            res[slc2] = res[slc3]
+            res = res[slc4]
         elif fn in (_FFT.dst, _FFT.idst) and fn_kwds['type']==1:
-            assert axis in (in_.ndim-1, -1)
-            N = in_.shape[axis]
-            X = np.hstack((0, -in_, 0, +in_[::-1])).astype(in_.dtype)
+            assert axis in (x.ndim-1, -1)
+            N = x.shape[axis]
+            X = np.hstack((0, -x, 0, +x[::-1])).astype(x.dtype)
             res = _FFT.rfft(X, axis=axis)[2::2]
         else:
             res = fn(**fn_kwds)
 
-        out = first_not_None(out, self.out)
         if fn is _FFT.rfft:
-            assert axis in (in_.ndim-1, -1)
-            is_even = (in_.shape[axis] % 2 == 0) 
+            assert axis in (x.ndim-1, -1)
+            is_even = (x.shape[axis] % 2 == 0) 
             if (out is None):
                 rshape = list(res.shape)
                 rshape[axis] += 1 + is_even
-                out = np.empty(dtype=in_.dtype, shape=rshape)
+                out = np.empty(dtype=x.dtype, shape=rshape)
             else:
                 rtype = complex_to_float_dtype(out.dtype)
                 out = out.view(dtype=rtype)
             ctype = float_to_complex_dtype(out.dtype)
-            out[..., 1:out.shape[-1]-is_even] = res
-            out[..., 0] = out[..., 1]
-            out[..., 1] = 0.0
+            out[mkv(1,out.shape[axis]-is_even)] = res
+            out[mkv(0)] = out[mkv(1)]
+            out[mkv(1)] = 0.0
             if is_even:
-                out[..., -1] = 0.0
+                out[mkv(-1)] = 0.0
             out = out.view(dtype=ctype)
         elif (out is not None):
             out[...] = res
@@ -111,53 +136,62 @@ class ScipyFFT(FFTI):
     Planning won't destroy original inputs.
     """
 
-    def __init__(self):
-        super(ScipyFFT, self).__init__()
+    def __init__(self, **kwds):
+        super(ScipyFFT, self).__init__(**kwds)
         self.supported_ftypes = (np.float32, np.float64,)
         self.supported_ctypes = (np.complex64, np.complex128,)
     
     def fft(self, a, out=None, axis=-1, **kwds):
-        super(ScipyFFT, self).fft(a=a, out=out, axis=axis, **kwds)
+        (shape, dtype) = super(ScipyFFT, self).fft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.fft, x=a, out=out, axis=axis, **kwds)
         return plan
 
     def ifft(self, a, out=None, axis=-1, **kwds):
-        super(ScipyFFT, self).ifft(a=a, out=out, axis=axis, **kwds)
+        (shape, dtype) = super(ScipyFFT, self).ifft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.ifft, x=a, out=out, axis=axis, **kwds)
         return plan
 
     def rfft(self, a, out=None, axis=-1, **kwds):
-        super(ScipyFFT, self).rfft(a=a, out=out, axis=axis, **kwds)
+        (shape, dtype) = super(ScipyFFT, self).rfft(a=a, out=out, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.rfft, x=a, out=out, axis=axis, 
                             **kwds)
         return plan
 
     def irfft(self, a, out=None, n=None, axis=-1, **kwds):
-        (rshape, _) = super(ScipyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds)
+        (shape, dtype) = super(ScipyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.irfft, x=a, out=out, axis=axis, 
-                            n=rshape[axis], **kwds)
+                            n=shape[axis], **kwds)
         return plan
     
     def dct(self, a, out=None, type=2, axis=-1, **kwds):
         (shape, dtype) = super(ScipyFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.dct, x=a, out=out, axis=axis, type=type, **kwds)
         return plan
     
     def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
         (shape, dtype, _, s) = super(ScipyFFT, self).idct(a=a, out=out, type=type, 
                                     axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.idct, x=a, out=out, axis=axis, type=type, 
-                                scaling=first_not_None(scaling, s), **kwds)
+                                scaling=first_not_None(scaling, 1.0/s), **kwds)
         return plan
     
     def dst(self, a, out=None, type=2, axis=-1, **kwds):
         (shape, dtype) = super(ScipyFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.dst, x=a, out=out, axis=axis, type=type, **kwds)
         return plan
 
     def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
-        (shape, dtype, _, s) = super(ScipyFFT, self).idst(a=a, out=out, type=type, axis=axis, **kwds)
+        (shape, dtype, _, s) = super(ScipyFFT, self).idst(a=a, out=out, type=type, axis=axis, 
+                **kwds)
+        out = self.allocate_output(out, shape, dtype) 
         plan = ScipyFFTPlan(fn=_FFT.idst, x=a, out=out, axis=axis, type=type, 
-                                scaling=first_not_None(scaling, s), **kwds)
+                                scaling=first_not_None(scaling, 1.0/s), **kwds)
         return plan
 
diff --git a/hysop/numerics/tests/test_fft.py b/hysop/numerics/tests/test_fft.py
index 60e630514c457054e175dcda7ccfa584ec266712..b3ce1078de7927c26a2bfbfb8d3e0acb24b77c56 100644
--- a/hysop/numerics/tests/test_fft.py
+++ b/hysop/numerics/tests/test_fft.py
@@ -8,25 +8,40 @@ import numpy as np
 import itertools as it
 
 from hysop.deps import it, sm, random
-from hysop.constants import HYSOP_REAL
+from hysop.constants import Implementation
 from hysop.testsenv import __ENABLE_LONG_TESTS__, __HAS_OPENCL_BACKEND__
 from hysop.testsenv import opencl_failed, iter_clenv
 from hysop.tools.contexts import printoptions
 from hysop.tools.numerics import float_to_complex_dtype
 from hysop.tools.types import check_instance, first_not_None
 
-from hysop.numerics.fft.numpy_fft import NumpyFFT
-from hysop.numerics.fft.scipy_fft import ScipyFFT
-from hysop.numerics.fft.fftw_fft  import FftwFFT
+from hysop.numerics.fft.fft  import mk_shape
+from hysop.numerics.fft.numpy_fft  import NumpyFFT
+from hysop.numerics.fft.scipy_fft  import ScipyFFT
+from hysop.numerics.fft.fftw_fft   import FftwFFT
+from hysop.numerics.fft.gpyfft_fft import GpyFFT
 
 
 class TestFFT(object):
 
     implementations = {
-            'numpy': NumpyFFT(), 
-            'scipy': ScipyFFT(), 
-            'fftw':  FftwFFT(warn_on_allocation=False)
+        Implementation.PYTHON: {
+            'numpy': NumpyFFT(warn_on_allocation=False), 
+            'scipy': ScipyFFT(warn_on_allocation=False), 
+            'fftw':  FftwFFT(warn_on_allocation=False,
+                             warn_on_misalignment=False)
+        },
+        Implementation.OPENCL: {}
     }
+    
+    for (i,cl_env) in enumerate(iter_clenv()):
+        print '> Registering opencl backend {} as:\n{}'.format(
+                i, cl_env)
+        name = 'clfft{}'.format(i)
+        implementations[Implementation.OPENCL][name] = \
+            GpyFFT(cl_env=cl_env,
+                   warn_on_allocation=False,
+                   warn_on_unaligned_output_offset=False)
 
     msg_shape = 'Expected output array shape to be {} but got {} for implementation {}.'
     msg_dtype = 'Expected output array dtype to be {} but got {} for implementation {}.'
@@ -41,241 +56,233 @@ class TestFFT(object):
         print '::Testing 1D transform, precision {}::'.format(dtype.__name__)
         eps = np.finfo(dtype).eps
         ctype = float_to_complex_dtype(dtype)
-
-        print '\n FORWARD C2C: complex to complex forward transform'
-        for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
-            print '   FFT shape={:9s} '.format(str(shape)+':'),
-            Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape)
-            H    = np.empty_like(Href)
-            H, Href = self.simd_align(H, Href)
-            results = {}
-            for (name, impl) in self.implementations.iteritems():
-                if dtype in impl.supported_ftypes:
-                    plan = impl.fft(a=H)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == ctype, self.msg_dtype.format(ctype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = refout / N # forward normalization
-            self.check_distances(results, eps, self.report_eps, 'forward C2C', failures)
-        
-        print '\n BACKWARD C2C: complex to complex backward transform'
-        for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
-            print '   IFFT shape={:9s} '.format(str(shape)+':'),
-            Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape)
-            H    = np.empty_like(Href)
-            H, Href = self.simd_align(H, Href)
-            results = {}
-            for (name, impl) in self.implementations.iteritems():
-                if dtype in impl.supported_ftypes:
-                    plan = impl.ifft(a=H)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == ctype, self.msg_dtype.format(ctype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = refout
-            self.check_distances(results, eps, self.report_eps, 'backward C2C', failures)
         
+        if False:
+            print '\n FORWARD C2C: complex to complex forward transform'
+            for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
+                print '   FFT shape={:9s} '.format(str(shape)+':'),
+                Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape)
+                results = {}
+                for (kind, implementations) in self.implementations.iteritems():
+                    for (name, impl) in implementations.iteritems():
+                        if dtype not in impl.supported_ftypes:
+                            continue
+                        Bin = impl.backend.empty(shape=shape, dtype=ctype)
+                        plan = impl.fft(a=Bin).setup()
+                        Bout = plan.output_array
+                        assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, 
+                                name)
+                        assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, 
+                                name)
+                        Bin[...] = Href
+                        plan.execute()
+                        H0 = Bin.get()
+                        H1 = Bout.get()
+                        assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                        Bout[...] = 0
+                        evt = plan.execute()
+                        assert plan.output_array is Bout
+                        H2 = Bout.get()
+                        assert np.array_equal(H1, H2), self.msg_output_modified.format(name)
+                        results[name] = H1 / N # forward normalization
+                self.check_distances(results, eps, self.report_eps, 'forward C2C', failures)
+            
+            print '\n BACKWARD C2C: complex to complex backward transform'
+            for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
+                print '   IFFT shape={:9s} '.format(str(shape)+':'),
+                Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape)
+                results = {}
+                for (kind, implementations) in self.implementations.iteritems():
+                    for (name, impl) in implementations.iteritems():
+                        if dtype not in impl.supported_ftypes:
+                            continue
+                        Bin = impl.backend.empty(shape=shape, dtype=ctype)
+                        plan = impl.ifft(a=Bin).setup()
+                        Bout = plan.output_array
+                        assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, 
+                                name)
+                        assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, 
+                                name)
+                        Bin[...] = Href
+                        plan.execute()
+                        H0 = Bin.get()
+                        H1 = Bout.get()
+                        assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                        Bout[...] = 0
+                        evt = plan.execute()
+                        assert plan.output_array is Bout
+                        H2 = Bout.get()
+                        assert np.array_equal(H1, H2), self.msg_output_modified.format(name)
+                        results[name] = H1 / N # forward normalization
+                self.check_distances(results, eps, self.report_eps, 'backward C2C', failures)
+            
         print '\n FORWARD R2C: real to hermitian complex transform'
         for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
             print '   RFFT shape={:9s} '.format(str(shape)+':'),
             Href = np.random.rand(N).astype(dtype).reshape(shape)
-            H    = np.empty_like(Href)
-            H, Href = self.simd_align(H, Href)
             results = {}
-            for (name, impl) in self.implementations.iteritems():
-                if dtype in impl.supported_ftypes:
-                    plan = impl.rfft(a=H)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == cshape, self.msg_shape.format(cshape, out.shape, name)
-                    assert out.dtype == ctype,  self.msg_dtype.format(ctype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = refout / N  #forward normalization
+            for (kind, implementations) in self.implementations.iteritems():
+                for (name, impl) in implementations.iteritems():
+                    if dtype not in impl.supported_ftypes:
+                        continue
+                    Bin = impl.backend.empty(shape=shape, dtype=dtype)
+                    plan = impl.rfft(a=Bin).setup()
+                    Bout = plan.output_array
+                    assert Bout.shape == cshape, self.msg_shape.format(cshape, Bout.shape, 
+                            name)
+                    assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, 
+                            name)
+                    Bin[...] = Href
+                    plan.execute()
+                    H0 = Bin.get()
+                    H1 = Bout.get()
+                    assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                    Bout[...] = 0
+                    evt = plan.execute()
+                    assert plan.output_array is Bout
+                    H2 = Bout.get()
+                    assert np.array_equal(H1, H2), self.msg_output_modified.format(name)
+                    results[name] = H1 / N # forward normalization
             self.check_distances(results, eps, self.report_eps, 'R2C', failures)
         
         print '\n BACKWARD C2R: real to hermitian complex transform'
         for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
-            print '   IRFFT shape={:9s} '.format(str(rshape)+':'),
+            print '   IRFFT shape={:9s} '.format(str(shape)+':'),
             Href = np.random.rand(2*Nc).astype(dtype).view(dtype=ctype).reshape(cshape)
-            H    = np.empty_like(Href)
-            H, Href = self.simd_align(H, Href)
             results = {}
-            for (name, impl) in self.implementations.iteritems():
-                if dtype in impl.supported_ftypes:
-                    plan = impl.irfft(a=H)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == rshape, self.msg_shape.format(rshape, out.shape, name)
-                    assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = refout
+            for (kind, implementations) in self.implementations.iteritems():
+                for (name, impl) in implementations.iteritems():
+                    if dtype not in impl.supported_ftypes:
+                        continue
+                    Bin = impl.backend.empty(shape=cshape, dtype=ctype)
+                    plan = impl.irfft(a=Bin).setup()
+                    Bout = plan.output_array
+                    assert Bout.shape == rshape, self.msg_shape.format(rshape, Bout.shape, 
+                            name)
+                    assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, 
+                            name)
+                    Bin[...] = Href
+                    plan.execute()
+                    H0 = Bin.get()
+                    H1 = Bout.get()
+                    assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                    Bout[...] = 0
+                    evt = plan.execute()
+                    assert plan.output_array is Bout
+                    H2 = Bout.get()
+                    assert np.array_equal(H1, H2), self.msg_output_modified.format(name)
+                    results[name] = H1
             self.check_distances(results, eps, self.report_eps, 'normal C2R', failures)
         
-        print '\n BACKWARD FORCED C2R: real to hermitian complex transform with specified shape'
+        print ('\n BACKWARD FORCED C2R: real to hermitian complex transform with specified '
+                +'shape')
         for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes():
             print '   IRFFT shape={:9s} '.format(str(shape)+':'),
             Href = np.random.rand(2*Nc).astype(dtype).view(dtype=ctype).reshape(cshape)
-            H    = np.empty_like(Href)
-            H, Href = self.simd_align(H, Href)
             results = {}
-            for (name, impl) in self.implementations.iteritems():
-                if dtype in impl.supported_ftypes:
-                    plan = impl.irfft(a=H, n=shape[-1])
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = refout
+            for (kind, implementations) in self.implementations.iteritems():
+                for (name, impl) in implementations.iteritems():
+                    if dtype not in impl.supported_ftypes:
+                        continue
+                    Bin = impl.backend.empty(shape=cshape, dtype=ctype)
+                    plan = impl.irfft(a=Bin, n=shape[-1]).setup()
+                    Bout = plan.output_array
+                    assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, 
+                            name)
+                    assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, 
+                            name)
+                    Bin[...] = Href
+                    plan.execute()
+                    H0 = Bin.get()
+                    H1 = Bout.get()
+                    assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                    Bout[...] = 0
+                    evt = plan.execute()
+                    assert plan.output_array is Bout
+                    H2 = Bout.get()
+                    assert np.array_equal(H1, H2), self.msg_output_modified.format(name)
+                    results[name] = H1
             self.check_distances(results, eps, self.report_eps, 'forced C2R', failures)
-        
+
         types = ['I  ','II ','III','IV ']
         for (itype,stype) in enumerate(types, 1):
             print '\n DCT-{}: real to real discrete cosine transform {}'.format(
                     stype.strip(), itype)
             for (shape, _, _, N, _, _) in self.iter_shapes():
                 print '   DCT-{} shape={:9s} '.format(stype, str(shape)+':'),
+                if (itype==1): # real size is 2*(N-1)
+                    N += 1
+                    shape = mk_shape(shape, -1, shape[-1] + 1)
                 Href = np.random.rand(N).astype(dtype).reshape(shape)
-                H    = np.empty_like(Href)
-                H, Href = self.simd_align(H, Href)
                 results = {}
-                for (name, impl) in self.implementations.iteritems():
-                    if dtype not in impl.supported_ftypes:
-                        continue
-                    if itype not in impl.supported_cosine_transforms:
-                        continue
-                    plan = impl.dct(a=H, type=itype)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = out / N # forward normalization
+                for (kind, implementations) in self.implementations.iteritems():
+                    for (name, impl) in implementations.iteritems():
+                        if dtype not in impl.supported_ftypes:
+                            continue
+                        if itype not in impl.supported_cosine_transforms:
+                            continue
+
+                        Bin = impl.backend.empty(shape=shape, dtype=dtype)
+                        plan = impl.dct(a=Bin, type=itype).setup()
+                        Bout = plan.output_array
+                        assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, 
+                                name)
+                        assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, 
+                                name)
+                        Bin[...] = Href
+                        plan.execute()
+                        H0 = Bin.get()
+                        H1 = Bout.get()
+                        assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                        Bout[...] = 0
+                        evt = plan.execute()
+                        assert plan.output_array is Bout
+                        H2 = Bout.get()
+                        assert np.allclose(H1, H2, atol=eps), \
+                                self.msg_output_modified.format(name)
+                        results[name] = H1 / N
                 self.check_distances(results, eps, self.report_eps, 
                         'DCT-{}'.format(stype), failures)
-            
-        for (itype,stype) in enumerate(types, 1):
-            print '\n DST-{}: real to real discrete sine transform {}'.format(
-                    stype.strip(), itype)
-            for (shape, _, _, N, _, _) in self.iter_shapes():
-                print '   DST-{} shape={:9s} '.format(stype, str(shape)+':'),
-                Href = np.random.rand(N).astype(dtype).reshape(shape)
-                H    = np.empty_like(Href)
-                H, Href = self.simd_align(H, Href)
-                results = {}
-                for (name, impl) in self.implementations.iteritems():
-                    if dtype not in impl.supported_ftypes:
-                        continue
-                    if itype not in impl.supported_sine_transforms:
-                        continue
-                    plan = impl.dst(a=H, type=itype)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = out / N # forward normalization
-                self.check_distances(results, eps, self.report_eps, 
-                        'DST-{}'.format(stype), failures)
         
         for (itype,stype) in enumerate(types, 1):
+            iitype = [1,3,2,4][itype-1]
             print '\n IDCT-{}: real to real discrete cosine transform {}'.format(
                     stype.strip(), itype)
             for (shape, _, _, N, _, _) in self.iter_shapes():
                 print '   IDCT-{} shape={:9s} '.format(stype, str(shape)+':'),
+                if (iitype==1): # real size is 2*(N-1)
+                    N += 1
+                    shape = mk_shape(shape, -1, shape[-1] + 1)
                 Href = np.random.rand(N).astype(dtype).reshape(shape)
-                H    = np.empty_like(Href)
-                H, Href = self.simd_align(H, Href)
                 results = {}
-                for (name, impl) in self.implementations.iteritems():
-                    if itype in (1,4):
-                        iitype = itype
-                    elif itype == 2:
-                        iitype = 3
-                    else:
-                        iitype = 2
-                    if dtype not in impl.supported_ftypes:
-                        continue
-                    if iitype not in impl.supported_cosine_transforms:
-                        continue
-                    plan = impl.idct(a=H, type=itype)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = out
+                for (kind, implementations) in self.implementations.iteritems():
+                    for (name, impl) in implementations.iteritems():
+                        if dtype not in impl.supported_ftypes:
+                            continue
+                        if iitype not in impl.supported_cosine_transforms:
+                            continue
+                        Bin = impl.backend.empty(shape=shape, dtype=dtype)
+                        plan = impl.idct(a=Bin, type=itype).setup()
+                        Bout = plan.output_array
+                        assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, 
+                                name)
+                        assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, 
+                                name)
+                        Bin[...] = Href
+                        plan.execute()
+                        H0 = Bin.get()
+                        H1 = Bout.get()
+                        assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
+                        Bout[...] = 0
+                        evt = plan.execute()
+                        assert plan.output_array is Bout
+                        H2 = Bout.get()
+                        assert np.allclose(H1, H2, atol=eps), \
+                                self.msg_output_modified.format(name)
+                        results[name] = H1
                 self.check_distances(results, eps, self.report_eps, 
                         'IDCT-{}'.format(stype), failures)
-        
-        for (itype,stype) in enumerate(types, 1):
-            print '\n IDST-{}: real to real discrete sine transform {}'.format(
-                    stype.strip(), itype)
-            for (shape, _, _, N, _, _) in self.iter_shapes():
-                print '   IDST-{} shape={:9s} '.format(stype, str(shape)+':'),
-                Href = np.random.rand(N).astype(dtype).reshape(shape)
-                H    = np.empty_like(Href)
-                H, Href = self.simd_align(H, Href)
-                results = {}
-                for (name, impl) in self.implementations.iteritems():
-                    if itype in (1,4):
-                        iitype = itype
-                    elif itype == 2:
-                        iitype = 3
-                    else:
-                        iitype = 2
-                    if dtype not in impl.supported_ftypes:
-                        continue
-                    if iitype not in impl.supported_sine_transforms:
-                        continue
-                    plan = impl.idst(a=H, type=itype)
-                    H[...] = Href
-                    out = plan.execute()
-                    assert out.shape == shape, self.msg_shape.format(shape, out.shape, name)
-                    assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name)
-                    assert np.array_equal(Href, H), self.msg_input_modified.format(name)
-                    refout = out.copy()
-                    out[...] = 0
-                    out = plan.execute()
-                    assert np.array_equal(out, refout), self.msg_output_modified.format(name)
-                    results[name] = out
-                self.check_distances(results, eps, self.report_eps, 
-                        'IDST-{}'.format(stype), failures)
+
     
     def check_distances(self, results, eps, report_eps, tag, failures):
         if len(results.keys())==0:
@@ -319,7 +326,7 @@ class TestFFT(object):
             ss=()
             for (name, Einf) in distances.iteritems():
                 Eeps = int(np.round(Einf/eps))
-                failed |= (Eeps > self.report_eps)
+                failed |= (Eeps >= self.fail_eps)
                 s='{}={}eps'.format(name,Eeps)
                 ss += (s,)
             print ', '.join(ss)
@@ -378,12 +385,7 @@ class TestFFT(object):
                 H0, H1, Href = self.simd_align(H0, H1, Href)
                 results = {}
                 for (name, impl) in self.implementations.iteritems():
-                    if itype in (1,4):
-                        iitype = itype
-                    elif itype == 2:
-                        iitype = 3
-                    else:
-                        iitype = 2
+                    iitype = [1,3,2,4][itype-1]
                     if dtype not in impl.supported_ftypes:
                         continue
                     if itype not in impl.supported_cosine_transforms:
@@ -410,12 +412,7 @@ class TestFFT(object):
                 H0, H1, Href = self.simd_align(H0, H1, Href)
                 results = {}
                 for (name, impl) in self.implementations.iteritems():
-                    if itype in (1,4):
-                        iitype = itype
-                    elif itype == 2:
-                        iitype = 3
-                    else:
-                        iitype = 2
+                    iitype = [1,3,2,4][itype-1]
                     if dtype not in impl.supported_ftypes:
                         continue
                     if itype not in impl.supported_sine_transforms:
@@ -431,15 +428,17 @@ class TestFFT(object):
                 check_distances(results)
     
     def iter_shapes(self):
-        # minj=(5,1)
-        # maxj=(15,11)
-        minj=(2,2)
-        maxj=(5,5)
+        minj=(1,1)
+        maxj=(6,6)
+        # maxj=(12,8)
+        blacklists = ((), (7,))
         msg = ('EVEN', 'ODD')
         for i in xrange(2):
             base = 2+i
             print '  '+msg[i]
             for j1 in xrange(minj[i],maxj[i]):
+                if j1 in blacklists[i]:
+                    continue
                 shape = (base**j1,)
                 cshape = list(shape)
                 cshape[-1] = cshape[-1]//2 + 1
@@ -530,11 +529,11 @@ class TestFFT(object):
 
     def perform_tests(self):
         # not testing np.longdouble because only one implementation supports it
-        # ie. we cannot compare results between different 
+        # ie. we cannot compare results between different implementations
         failures = {}
-        for dtype in (np.float32, np.float64,):
-            self._test_forward_backward_1d(dtype=dtype)
-            # self._test_1d(dtype=dtype, failures=failures.setdefault(dtype.__name__, {}))
+        for dtype in (np.float64,):
+            #self._test_forward_backward_1d(dtype=dtype)
+            self._test_1d(dtype=dtype, failures=failures.setdefault(dtype.__name__, {}))
         self.report_failures(failures)
     
 if __name__ == '__main__':
diff --git a/hysop/tools/debug_dumper.py b/hysop/tools/debug_dumper.py
index 8e5005fafc7b229fec30e681ff07078b9211801a..90dc4eb8878fd223f12fa1cbc08ee6a5c2669d86 100644
--- a/hysop/tools/debug_dumper.py
+++ b/hysop/tools/debug_dumper.py
@@ -5,9 +5,9 @@ import numpy as np
 from hysop.tools.types import check_instance
 from hysop.fields.discrete_field import DiscreteFieldView
 
-
 class DebugDumper(object):
-    def __init__(self, name, path='/tmp/hysop/debug', force_overwrite=False):
+    def __init__(self, name, path='/tmp/hysop/debug', force_overwrite=False,
+            enable_on_op_apply=False):
         directory = path+'/'+name
         blobs_directory = directory + '/data'
         
@@ -28,6 +28,7 @@ class DebugDumper(object):
         self.blobs_directory = blobs_directory
         self.runfile = runfile
         self.dump_id = 0
+        self.enable_on_op_apply = enable_on_op_apply
 
         self.print_header()
 
diff --git a/hysop/tools/numerics.py b/hysop/tools/numerics.py
index d0f9c9bb2893b3bde459a286a1f85a558efb6ca6..877c39232ab7256d200faf415b9e17f799b6a880 100644
--- a/hysop/tools/numerics.py
+++ b/hysop/tools/numerics.py
@@ -74,7 +74,7 @@ def default_invalid_value(dtype):
     elif is_signed(dtype):
         return 0
     else:
-        raise NotImplementedError()
+        raise NotImplementedError
 
 # promote_dtype
 def match_dtype(x, dtype):