diff --git a/examples/example_utils.py b/examples/example_utils.py
index 73e0c55b870c7665514ad7b14e7d3f71ceadf60f..638918a4606d34181d364591d7117ed35fd42d74 100644
--- a/examples/example_utils.py
+++ b/examples/example_utils.py
@@ -697,6 +697,12 @@ class HysopArgParser(argparse.ArgumentParser):
         threading.add_argument('--mkl-threads', type=str, default=None, 
                 dest='mkl_threads',
                 help='This parameter will set MKL_NUM_THREADS to a custom value (overrides --max-threads).')
+        threading.add_argument('--mkl-domain-threads', type=str, default=None, 
+                dest='mkl_domain_threads',
+                help='This parameter will set MKL_DOMAIN_NUM_THREADS to a custom value (overrides --max-threads).')
+        threading.add_argument('--mkl-threading-layer', type=str, default='TBB', 
+                dest='mkl_threading_layer',
+                help="This parameter will set MKL_THREADING_LAYER to a custom value ('TBB', 'GNU', 'INTEL', 'SEQUENTIAL').")
         threading.add_argument('--numba-threads', type=str, default=None, 
                 dest='numba_threads',
                 help='This parameter will set NUMBA_NUM_THREADS to a custom value (overrides --max-threads).')
@@ -753,9 +759,9 @@ class HysopArgParser(argparse.ArgumentParser):
         return opencl
     
     def _check_threading_args(self, args):
-        self._check_default(args, ('enable_threading', 'max_threads', 'numba_threading_layer',
+        self._check_default(args, ('enable_threading', 'max_threads', 'mkl_threading_layer', 'numba_threading_layer',
                                    'fftw_planner_effort', 'fftw_planner_timelimit'), str, allow_none=False)
-        self._check_default(args, ('openmp_threads', 'mkl_threads', 'numba_threads', 'fftw_threads'), 
+        self._check_default(args, ('openmp_threads', 'mkl_threads', 'mkl_domain_threads', 'numba_threads', 'fftw_threads'), 
                             str, allow_none=True)
 
         args.enable_threading = self._convert_bool('enable_threading', args.enable_threading)
@@ -766,6 +772,8 @@ class HysopArgParser(argparse.ArgumentParser):
         for argname in ('openmp_threads', 'mkl_threads', 'numba_threads', 'fftw_threads'):
             setattr(args, argname, self._convert_threads(argname, getattr(args, argname), 
                 default=args.max_threads))
+        args.mkl_threading_layer = self._convert_mkl_threading_layer('mkl_threading_layer', 
+                args.mkl_threading_layer)
         args.numba_threading_layer = self._convert_numba_threading_layer('numba_threading_layer', 
                 args.numba_threading_layer)
         args.fftw_planner_effort = self._convert_fftw_planner_effort('fftw_planner_effort',
@@ -1422,6 +1430,8 @@ class HysopArgParser(argparse.ArgumentParser):
         # those environment variables are not part of HySoP
         self.set_env('OMP_NUM_THREADS',        args.openmp_threads,            False)
         self.set_env('MKL_NUM_THREADS',        args.mkl_threads,               False)
+        self.set_env('MKL_DOMAIN_NUM_THREADS', args.mkl_domain_threads,        False)
+        self.set_env('MKL_THREADING_LAYER',    args.mkl_threading_layer,       False)
         self.set_env('NUMBA_NUM_THREADS',      args.numba_threads,             False)
         self.set_env('NUMBA_THREADING_LAYER',  args.numba_threading_layer,     False)
 
@@ -1531,6 +1541,15 @@ class HysopArgParser(argparse.ArgumentParser):
         }
         return self._check_convert(argname, val, values)
     
+    def _convert_mkl_threading_layer(self, argname, val):
+        values = {
+            'seq': 'SEQUENTIAL',
+            'omp': 'OMP',
+            'tbb': 'TBB',
+            'intel': 'INTEL'
+        }
+        return self._check_convert(argname, val, values)
+    
     def _convert_numba_threading_layer(self, argname, val):
         values = {
             'workqueue': 'workqueue',
diff --git a/examples/shear_layer/shear_layer.py b/examples/shear_layer/shear_layer.py
index 75d894648d74abb582af8ee22a6565e03f2adc26..520d42899942b7a3868b3961a7a9279df2dc4433 100644
--- a/examples/shear_layer/shear_layer.py
+++ b/examples/shear_layer/shear_layer.py
@@ -124,10 +124,16 @@ def compute(args):
                             variables={velo:npts, vorti: npts}, 
                             projection=args.reprojection_frequency,
                             diffusion=nu, dt=dt,
-                            implementation=impl, **extra_op_kwds)
+                            implementation=impl, 
+                            enforce_implementation=args.enforce_implementation,
+                            **extra_op_kwds)
     #> We ask to dump the inputs and the outputs of this operator
-    poisson.dump_outputs(fields=(vorti,), frequency=args.dump_freq, **extra_op_kwds)
-    poisson.dump_outputs(fields=(velo,),  frequency=args.dump_freq, **extra_op_kwds)
+    poisson.dump_outputs(fields=(vorti,),
+            io_params=args.io_params.clone(filename='vorti'),
+            **extra_op_kwds)
+    poisson.dump_outputs(fields=(velo,),  
+            io_params=args.io_params.clone(filename='velo'),
+            **extra_op_kwds)
     
     #> Operator to compute the infinite norm of the velocity
     if (impl is Implementation.FORTRAN):
@@ -217,7 +223,7 @@ if __name__=='__main__':
             description+='orientation of the shear layers is made.'
             description+='\n'
             description+='\nEach of the shear layers rolls up in a single vortex '
-            description+='as the flow evolves.'
+            description+='as the flow eolves.'
             description+='\n'
             description+='\nDefault example parameters depends on the chosen case:'
             description+='\n  CASE     0        1        2'
@@ -282,6 +288,7 @@ if __name__=='__main__':
             self._check_positive(args, 'plot_freq', strict=False, allow_none=False)
             
         def _setup_parameters(self, args):
+            super(ShearLayerArgParser, self)._setup_parameters(args)
             from hysop.tools.types import first_not_None
             case = args.case
 
diff --git a/examples/taylor_green/taylor_green.py b/examples/taylor_green/taylor_green.py
index 3192c0288b4aa7fd229431f4f101de05d3b6c7bc..a8d07d76fa12605817c88997ce6bb65daedb8239 100644
--- a/examples/taylor_green/taylor_green.py
+++ b/examples/taylor_green/taylor_green.py
@@ -34,8 +34,7 @@ def compute(args):
                                ViscosityParameter
     from hysop.constants import Implementation, AdvectionCriteria, StretchingCriteria, Backend
 
-    from hysop.operators import DirectionalAdvection, DirectionalStretchingDiffusion, \
-                                DirectionalDiffusion, DirectionalStretching,          \
+    from hysop.operators import DirectionalAdvection, DirectionalStretching,          \
                                 StaticDirectionalStretching, Diffusion,               \
                                 PoissonCurl, AdaptiveTimeStep, HDF_Writer,            \
                                 Enstrophy, MinMaxFieldStatistics, StrangSplitting,    \
@@ -138,6 +137,7 @@ def compute(args):
                             # plot_energy=IOParams(filepath=spectral_path, filename='E_{fname}_{ite}', frequency=args.dump_freq),
                             # plot_input_vorticity_energy=-1,  # <= disable a specific plot
                             # plot_output_vorticity_energy=-1, # <= disable a specific plot
+                            enforce_implementation=args.enforce_implementation,
                             implementation=impl, **extra_op_kwds)
     #> We ask to dump the outputs of this operator
     dump_fields = HDF_Writer(name='fields', 
diff --git a/hysop/__init__.py.in b/hysop/__init__.py.in
index aefefc7958f7ccd73decc85b39f9dc76f07e14a6..701856c217556b09b0468e89f46aa8d91b5651e2 100644
--- a/hysop/__init__.py.in
+++ b/hysop/__init__.py.in
@@ -2,9 +2,26 @@
 Python package dedicated to flow simulation using particular methods
 on hybrid architectures (MPI-GPU)
 """
-import psutil
+import psutil, signal, traceback, threading, sys, os, warnings
 from functools import wraps
-from hysop.deps import __builtin__, print_function, os, sys, warnings, traceback
+from hysop.deps import __builtin__, print_function
+
+# Register debug signals (SIGUSR1(10)=print the main stack, SIGUSR2(12)=print the stack of all threads)
+def dumpstack(signal, frame):
+    traceback.print_stack()
+def dumpstacks(signal, frame):
+    #https://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application 
+    id2name = dict([(th.ident, th.name) for th in threading.enumerate()])
+    code = []
+    for threadId, stack in sys._current_frames().items():
+        code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""), threadId))
+        for filename, lineno, name, line in traceback.extract_stack(stack):
+            code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
+            if line:
+                code.append("  %s" % (line.strip()))
+                print("\n".join(code))
+signal.signal(signal.SIGUSR1, dumpstack)
+signal.signal(signal.SIGUSR2, dumpstacks)
 
 def get_env(target, default_value):
     target = 'HYSOP_{}'.format(target)
@@ -55,7 +72,8 @@ __ENABLE_LONG_TESTS__ = get_env('ENABLE_LONG_TESTS', ("@ENABLE_LONG_TESTS@" is "
 __ENABLE_THREADING__ = get_env('ENABLE_THREADING', True)
 __MAX_THREADS__ = int(get_env('MAX_THREADS', psutil.cpu_count(logical=False)) if __ENABLE_THREADING__ else 1)
 set_env('OMP_NUM_THREADS',   __MAX_THREADS__)
-set_env('MKL_NUM_THREADS',   __MAX_THREADS__)
+set_env('MKL_NUM_THREADS',   1)
+set_env('MKL_THREADING_LAYER', 'TBB' if __ENABLE_THREADING__ else 'SEQUENTIAL')
 set_env('NUMBA_NUM_THREADS', __MAX_THREADS__)
 set_env('NUMBA_THREADING_LAYER', 'workqueue') # Use 'numba -s' to list support
 __DEFAULT_NUMBA_TARGET__ = ('parallel' if __ENABLE_THREADING__ else 'cpu')
@@ -75,6 +93,8 @@ if __MPI_ENABLED__:
                                shm_rank,  intershm_size
 else:
     main_rank, main_size = 0, 1
+    interhost_size, intershm_size = 1, 1
+    host_rank, shm_rank = 0, 0
 
 # define printing functions
 def print(*args, **kargs):
@@ -169,7 +189,14 @@ msg_threads = \
   HYSOP_MAX_THREADS:      {}
   --------------------------------
   OMP_NUM_THREADS:        {}
+  OMP_DYNAMIC:            {}
+  OMP_NESTED:             {}
+  OMP_MAX_ACTIVE_LEVELS:  {}
+  --------------------------------
+  MKL_THREADING_LAYER:    {}
   MKL_NUM_THREADS:        {}
+  MKL_DOMAIN_NUM_THREADS: {}
+  MKL_DYNAMIC:            {}
   --------------------------------
   DEFAULT_NUMBA_TARGET:   {}
   NUMBA_THREADING_LAYER:  {}
@@ -182,11 +209,17 @@ msg_threads = \
 '''.format(
     __ENABLE_THREADING__,
     __MAX_THREADS__,
-    os.environ['OMP_NUM_THREADS'],
-    os.environ['MKL_NUM_THREADS'],
+    os.environ.get('OMP_NUM_THREADS', None),
+    os.environ.get('OMP_DYNAMIC', None),
+    os.environ.get('OMP_NESTED', None),
+    os.environ.get('OMP_MAX_ACTIVE_LEVELS', None),
+    os.environ.get('MKL_THREADING_LAYER', None),
+    os.environ.get('MKL_NUM_THREADS', None),
+    os.environ.get('MKL_DOMAIN_NUM_THREADS', None),
+    os.environ.get('MKL_DYNAMIC', None),
     __DEFAULT_NUMBA_TARGET__,
-    os.environ['NUMBA_THREADING_LAYER'],
-    os.environ['NUMBA_NUM_THREADS'],
+    os.environ.get('NUMBA_THREADING_LAYER', None),
+    os.environ.get('NUMBA_NUM_THREADS', None),
     __FFTW_NUM_THREADS__,
     __FFTW_PLANNER_EFFORT__,
     __FFTW_PLANNER_TIMELIMIT__)
diff --git a/hysop/backend/device/opencl/opencl_array.py b/hysop/backend/device/opencl/opencl_array.py
index 39cf0e7a57a41d90c22917ccb5f7c88e89b5a1f5..42df5ec55a7474bf3d93c4a8cb4da618f547881c 100644
--- a/hysop/backend/device/opencl/opencl_array.py
+++ b/hysop/backend/device/opencl/opencl_array.py
@@ -78,9 +78,12 @@ class OpenClArray(Array):
              alignment = self.backend.device.mem_base_addr_align
              if (offset % alignment) == 0:
                  # try to return a subbuffer
-                 buf = self.base_data[offset:]
-                 buf.__parent = self.base_data
-                 return buf
+                 try:
+                     buf = self.base_data[offset:]
+                     buf.__parent = self.base_data
+                     return buf
+                 except:
+                     raise clArray.ArrayHasOffsetError
              else:
                  raise
 
diff --git a/hysop/backend/host/python/operator/diffusion.py b/hysop/backend/host/python/operator/diffusion.py
index dac0916f143323db0ff5af21ecbcf765a609ed5b..d40c8e1dead53e1fd219083bfc684305eac64940 100644
--- a/hysop/backend/host/python/operator/diffusion.py
+++ b/hysop/backend/host/python/operator/diffusion.py
@@ -7,7 +7,7 @@ from hysop.tools.types import check_instance, first_not_None
 from hysop.tools.decorators import debug
 from hysop.tools.numpywrappers import npw
 from hysop.tools.numerics import is_complex, complex_to_float_dtype
-from hysop.tools.numba_utils import make_numba_signature
+from hysop.tools.numba_utils import make_numba_signature, prange
 from hysop.backend.host.host_operator import HostOperator, OpenClMappable
 from hysop.core.graph.graph import op_apply
 from hysop.fields.continuous_field import Field
@@ -42,7 +42,7 @@ class PythonDiffusion(DiffusionOperatorBase, OpenClMappable, HostOperator):
                 '(n,m),(n),(m),()->(n,m)', target=target,
                 nopython=True, cache=True)
             def filter_diffusion_2d(Fin, K0, K1, nu_dt, Fout):
-                for i in range(Fin.shape[0]):
+                for i in prange(Fin.shape[0]):
                     for j in range(Fin.shape[1]):
                         Fout[i,j] /= (1 - nu_dt*(K0[i] + K1[j]))
             F = filter_diffusion_2d
@@ -51,8 +51,8 @@ class PythonDiffusion(DiffusionOperatorBase, OpenClMappable, HostOperator):
                 '(n,m,p),(n),(m),(p),()->(n,m,p)', target=target,
                 nopython=True, cache=True)
             def filter_diffusion_3d(Fin, K0, K1, K2, nu_dt, Fout):
-                for i in range(Fin.shape[0]):
-                    for j in range(Fin.shape[1]):
+                for i in prange(Fin.shape[0]):
+                    for j in prange(Fin.shape[1]):
                         for k in range(Fin.shape[2]):
                             Fout[i,j,k] /= (1 - nu_dt*(K0[i] + K1[j] + K2[k]))
             F = filter_diffusion_3d
@@ -61,9 +61,9 @@ class PythonDiffusion(DiffusionOperatorBase, OpenClMappable, HostOperator):
                 '(n,m,p,q),(n),(m),(p),(q),()->(n,m,p,q)', target=target,
                 nopython=True, cache=True)
             def filter_diffusion_4d(Fin, K0, K1, K2, K3, nu_dt, Fout):
-                for i in range(Fin.shape[0]):
-                    for j in range(Fin.shape[1]):
-                        for k in range(Fin.shape[2]):
+                for i in prange(Fin.shape[0]):
+                    for j in prange(Fin.shape[1]):
+                        for k in prange(Fin.shape[2]):
                             for l in range(Fin.shape[3]):
                                 Fout[i,j,k,l] /= (1 - nu_dt*(K0[i] + K1[j] + K2[k] + K3[l]))
         else:
diff --git a/hysop/backend/host/python/operator/poisson.py b/hysop/backend/host/python/operator/poisson.py
index 1472ac28ea954977ef633e5ec1eeecbb4c0bab99..ae0536e56548413eb9495e02d99e293c5051e675 100644
--- a/hysop/backend/host/python/operator/poisson.py
+++ b/hysop/backend/host/python/operator/poisson.py
@@ -6,7 +6,7 @@ from hysop import __DEFAULT_NUMBA_TARGET__
 from hysop.tools.types import check_instance, first_not_None
 from hysop.tools.decorators import debug
 from hysop.tools.numpywrappers import npw
-from hysop.tools.numba_utils import make_numba_signature
+from hysop.tools.numba_utils import make_numba_signature, prange
 from hysop.backend.host.host_operator import HostOperator, OpenClMappable
 from hysop.core.graph.graph import op_apply
 from hysop.fields.continuous_field import Field
@@ -38,7 +38,7 @@ class PythonPoisson(PoissonOperatorBase, OpenClMappable, HostOperator):
                 '(n,m),(n),(m)->(n,m)', target=target,
                 nopython=True, cache=True)
             def filter_poisson_2d(Fin, K0, K1, Fout):
-                for i in range(1, Fin.shape[0]):
+                for i in prange(1, Fin.shape[0]):
                     for j in range(0, Fin.shape[1]):
                         Fout[i,j] /= (K0[i] + K1[j])
                 for j in range(1, Fin.shape[1]):
@@ -50,11 +50,11 @@ class PythonPoisson(PoissonOperatorBase, OpenClMappable, HostOperator):
                 '(n,m,p),(n),(m),(p)->(n,m,p)', target=target,
                 nopython=True, cache=True)
             def filter_poisson_3d(Fin, K0, K1, K2, Fout):
-                for i in range(1, Fin.shape[0]):
-                    for j in range(0, Fin.shape[1]):
+                for i in prange(1, Fin.shape[0]):
+                    for j in prange(0, Fin.shape[1]):
                         for k in range(0, Fin.shape[2]):
                             Fout[i,j,k] /= (K0[i] + K1[j] + K2[k])
-                for j in range(1, Fin.shape[1]):
+                for j in prange(1, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[0,j,k] /= (K0[0] + K1[j] + K2[k])
                 for k in range(1, Fin.shape[2]):
@@ -66,16 +66,16 @@ class PythonPoisson(PoissonOperatorBase, OpenClMappable, HostOperator):
                 '(n,m,p,q),(n),(m),(p),(q)->(n,m,p,q)', target=target,
                 nopython=True, cache=True)
             def filter_poisson_4d(Fin, K0, K1, K2, K3, Fout):
-                for i in range(1, Fin.shape[0]):
-                    for j in range(0, Fin.shape[1]):
-                        for k in range(0, Fin.shape[2]):
+                for i in prange(1, Fin.shape[0]):
+                    for j in prange(0, Fin.shape[1]):
+                        for k in prange(0, Fin.shape[2]):
                             for l in range(0, Fin.shape[3]):
                                 Fout[i,j,k,l] /= (K0[i] + K1[j] + K2[k] + K3[l])
-                for j in range(1, Fin.shape[1]):
-                    for k in range(0, Fin.shape[2]):
+                for j in prange(1, Fin.shape[1]):
+                    for k in prange(0, Fin.shape[2]):
                         for l in range(0, Fin.shape[3]):
                             Fout[0,j,k,l] /= (K0[0] + K1[j] + K2[k] + K3[l])
-                for k in range(1, Fin.shape[2]):
+                for k in prange(1, Fin.shape[2]):
                     for l in range(0, Fin.shape[3]):
                         Fout[0,0,k,l] /= (K0[0] + K1[0] + K2[k] + K3[l])
                 for l in range(1, Fin.shape[3]):
diff --git a/hysop/backend/host/python/operator/poisson_curl.py b/hysop/backend/host/python/operator/poisson_curl.py
index 13e80586673b405bb3c7f797ba6042fffbd64ea9..5e724b48c35a283de97d93325cf4f7224397c2ba 100644
--- a/hysop/backend/host/python/operator/poisson_curl.py
+++ b/hysop/backend/host/python/operator/poisson_curl.py
@@ -5,7 +5,7 @@ from hysop import __DEFAULT_NUMBA_TARGET__
 from hysop.tools.types import check_instance, first_not_None
 from hysop.tools.decorators import debug
 from hysop.tools.numpywrappers import npw
-from hysop.tools.numba_utils import make_numba_signature
+from hysop.tools.numba_utils import make_numba_signature, prange
 from hysop.core.graph.graph import op_apply
 from hysop.backend.host.host_operator import HostOperator, OpenClMappable
 from hysop.operator.base.poisson_curl import SpectralPoissonCurlOperatorBase
@@ -28,7 +28,7 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_2d__0_m(Fin, K1, Fout):
-            for i in range(0, Fin.shape[0]):
+            for i in prange(0, Fin.shape[0]):
                 for j in range(0, Fin.shape[1]):
                     Fout[i,j] = -K1[j]*Fin[i,j]
         return functools.partial(filter_curl_2d__0_m, *args)
@@ -42,7 +42,7 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_2d__1_n(Fin, K0, Fout):
-            for i in range(0, Fin.shape[0]):
+            for i in prange(0, Fin.shape[0]):
                 for j in range(0, Fin.shape[1]):
                     Fout[i,j] = +K0[i]*Fin[i,j]
         return functools.partial(filter_curl_2d__1_n, *args)
@@ -55,8 +55,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_3d__0_n(Fin, K, Fout):
-            for i in range(0, Fin.shape[0]):
-                for j in range(0, Fin.shape[1]):
+            for i in prange(0, Fin.shape[0]):
+                for j in prange(0, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[i,j,k] = -K[i]*Fin[i,j,k]
         return functools.partial(filter_curl_3d__0_n, *args)
@@ -69,8 +69,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_3d__0_m(Fin, K, Fout):
-            for i in range(0, Fin.shape[0]):
-                for j in range(0, Fin.shape[1]):
+            for i in prange(0, Fin.shape[0]):
+                for j in prange(0, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[i,j,k] = -K[j]*Fin[i,j,k]
         return functools.partial(filter_curl_3d__0_m, *args)
@@ -83,8 +83,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_3d__0_p(Fin, K, Fout):
-            for i in range(0, Fin.shape[0]):
-                for j in range(0, Fin.shape[1]):
+            for i in prange(0, Fin.shape[0]):
+                for j in prange(0, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[i,j,k] = -K[k]*Fin[i,j,k]
         return functools.partial(filter_curl_3d__0_p, *args)
@@ -97,8 +97,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_3d__1_n(Fin, K, Fout):
-            for i in range(0, Fin.shape[0]):
-                for j in range(0, Fin.shape[1]):
+            for i in prange(0, Fin.shape[0]):
+                for j in prange(0, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[i,j,k] += K[i]*Fin[i,j,k]
         return functools.partial(filter_curl_3d__1_n, *args)
@@ -111,8 +111,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_3d__1_m(Fin, K, Fout):
-            for i in range(0, Fin.shape[0]):
-                for j in range(0, Fin.shape[1]):
+            for i in prange(0, Fin.shape[0]):
+                for j in prange(0, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[i,j,k] += K[j]*Fin[i,j,k]
         return functools.partial(filter_curl_3d__1_m, *args)
@@ -125,8 +125,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe
         @nb.guvectorize([signature], layout,
             target=target, nopython=True, cache=True)
         def filter_curl_3d__1_p(Fin, K, Fout):
-            for i in range(0, Fin.shape[0]):
-                for j in range(0, Fin.shape[1]):
+            for i in prange(0, Fin.shape[0]):
+                for j in prange(0, Fin.shape[1]):
                     for k in range(0, Fin.shape[2]):
                         Fout[i,j,k] += K[k]*Fin[i,j,k]
         return functools.partial(filter_curl_3d__1_p, *args)
diff --git a/hysop/backend/host/python/operator/solenoidal_projection.py b/hysop/backend/host/python/operator/solenoidal_projection.py
index c131813b96f2e2fa82378186b07fcd60fde84ff9..a73a02da477b7f7f5ca004e553a09a49167e6ea6 100644
--- a/hysop/backend/host/python/operator/solenoidal_projection.py
+++ b/hysop/backend/host/python/operator/solenoidal_projection.py
@@ -9,7 +9,7 @@ from hysop.tools.numpywrappers import npw
 from hysop.backend.host.host_operator import HostOperator, OpenClMappable
 from hysop.core.graph.graph import op_apply
 from hysop.operator.base.solenoidal_projection import SolenoidalProjectionOperatorBase
-from hysop.tools.numba_utils import make_numba_signature
+from hysop.tools.numba_utils import make_numba_signature, prange
 
 class PythonSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClMappable, HostOperator):
     """
@@ -39,8 +39,8 @@ class PythonSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClMappabl
                                  KK10, KK11, KK12, 
                                  KK20, KK21, KK22, 
                                  Fout0, Fout1, Fout2):
-            for i in range(0, Fin0.shape[0]):
-                for j in range(0, Fin0.shape[1]):
+            for i in prange(0, Fin0.shape[0]):
+                for j in prange(0, Fin0.shape[1]):
                     for k in range(0, Fin0.shape[2]):
                         F0 = Fin0[i,j,k]
                         F1 = Fin1[i,j,k]
@@ -73,8 +73,8 @@ class PythonSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClMappabl
             @nb.guvectorize([signature], layout,
                 target=target, nopython=True, cache=True)
             def compute_div_3d(Fin0, Fin1, Fin2, K0, K1, K2, Fout):
-                for i in range(0, Fin0.shape[0]):
-                    for j in range(0, Fin0.shape[1]):
+                for i in prange(0, Fin0.shape[0]):
+                    for j in prange(0, Fin0.shape[1]):
                         for k in range(0, Fin0.shape[2]):
                             Fout[i,j,k] = (K0[i]*Fin0[i,j,k] + K1[j]*Fin1[i,j,k] + K2[k]*Fin2[i,j,k])
 
diff --git a/hysop/numerics/fft/_mkl_fft.py b/hysop/numerics/fft/_mkl_fft.py
index 154fea9a7d847e749e702192572955a21414d967..6dadfa2191d6ae41b7edfa044823f6d60329885e 100644
--- a/hysop/numerics/fft/_mkl_fft.py
+++ b/hysop/numerics/fft/_mkl_fft.py
@@ -1,34 +1,130 @@
 """
-FFT interface for fast Fourier Transforms using Intel MKL (FFTW3 wrappers).
+FFT interface for fast Fourier Transforms using Intel MKL (numpy interface).
 :class:`~hysop.numerics.MklFFT`
 :class:`~hysop.numerics.MklFFTPlan`
+
+/!\ -- OPENMP CONFLICT WITH GRAPHTOOLS -- 
+/!\ Only works if MKL_THREADING_LAYER is set to OMP because graph_tools is compiled against GNU OpenMP.
+/!\ May also work with MKL_THREADING_LAYER=TBB and SEQUENCIAL but not INTEL.
+
+Required version of mkl_fft is: https://gitlab.com/keckj/mkl_fft 
+If MKL_THREADING_LAYER is not set, or is set to INTEL, FFT tests will fail.
 """
 
+import functools, warnings
 import numpy as np
-from mkl_fft import _numpy_fft as _FFT
+import numba as nb
+from mkl_fft import (ifft as mkl_ifft, 
+                     fft as mkl_fft, 
+                     rfft_numpy as mkl_rfft, 
+                     irfft_numpy as mkl_irfft)
 
-from hysop.tools.types import first_not_None
 from hysop.numerics.fft.host_fft import HostFFTPlanI, HostFFTI, HostArray
 from hysop.numerics.fft.fft import \
             complex_to_float_dtype, float_to_complex_dtype, \
-            mk_view, mk_shape
+            mk_view, mk_shape, simd_alignment
+
+from hysop import __DEFAULT_NUMBA_TARGET__
+from hysop.numerics.fft.fft import HysopFFTWarning, bytes2str
+from hysop.tools.numba_utils import make_numba_signature, prange
+from hysop.tools.warning import HysopWarning
+from hysop.tools.numerics import get_itemsize
+from hysop.tools.types import first_not_None
+
+class HysopMKLFftWarning(HysopWarning):
+    pass
+
+def setup_transform(x, axis, transform, inverse, type):
+    shape = x.shape
+    dtype = x.dtype
+    N = shape[axis]
+    if inverse:
+        type = [1,3,2,4][type-1]
+    if (transform == 'dct'):
+        ctype = float_to_complex_dtype(x.dtype)
+        if (type==1):
+            sin  = mk_shape(shape, axis, 2*N-2)
+            din  = dtype
+            sout = mk_shape(shape, axis, N)
+            dout = ctype
+        elif (type==2):
+            sin = mk_shape(shape, axis, N)
+            din = dtype
+            sout = mk_shape(shape, axis, N//2+1)
+            dout = ctype
+        elif (type==3):
+            sin = mk_shape(shape, axis, N//2+1)
+            din = ctype
+            sout = mk_shape(shape, axis, N)
+            dout = dtype
+        else:
+            raise NotImplementedError
+    elif (transform is 'dst'):
+        ctype = float_to_complex_dtype(x.dtype)
+        if (type==1):
+            sin = mk_shape(shape, axis, 2*N+2)
+            din = dtype
+            sout = mk_shape(shape, axis, N+2)
+            dout = ctype
+        elif (type==2):
+            sin = mk_shape(shape, axis, N)
+            din = dtype
+            sout = mk_shape(shape, axis, N//2+1)
+            dout = ctype
+        elif (type==3):
+            sin = mk_shape(shape, axis, N//2+1)
+            din = ctype
+            sout = mk_shape(shape, axis, N)
+            dout = dtype
+        else:
+            raise NotImplementedError
+    else:
+        sin  = None
+        din  = None
+        sout = None
+        dout = None
+
+    return (sin,din,sout,dout)
 
-def dct(a, out=None, type=2, axis=-1):
-    ndim  = a.ndim
-    shape = a.shape
-    N     = a.shape[axis]
+def fft(**kwds):
+    out = mkl_fft(**kwds)
+    return out
+def ifft(**kwds):
+    out = mkl_ifft(**kwds)
+    return out
+def rfft(**kwds):
+    out = mkl_rfft(**kwds)
+    return out
+def irfft(**kwds):
+    out = mkl_irfft(**kwds)
+    return out
+
+
+def dct(x, out=None, type=2, axis=-1, input_tmp=None, output_tmp=None):
+    ndim  = x.ndim
+    shape = x.shape
+    N     = x.shape[axis]
+    (sin, din, sout, dout) = setup_transform(x, axis, 'dct', False, type)
     if (type==1):
         # O(sqrt(log(N))) error, O(2N) complexity, O(4*N) memory
+        if (input_tmp is None):
+            input_tmp = np.empty(shape=sin, dtype=din)
+        if (output_tmp is None):
+            output_tmp = np.empty(shape=sout, dtype=dout)
+        assert input_tmp.shape == sin
+        assert input_tmp.dtype == din
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
         slc0  = mk_view(ndim, axis, 1, -1)
         slc1  = mk_view(ndim, axis, None, None, -1)
-        s0    = mk_shape(shape, axis, 2*N-2)
-        X = np.empty(shape=s0, dtype=a.dtype)
-        np.concatenate((a, a[slc0][slc1]), axis=axis, out=X)
-        res = _FFT.rfft(X, axis=axis).real
+        np.concatenate((x, x[slc0][slc1]), axis=axis, out=input_tmp)
+        rfft(x=input_tmp, out=output_tmp, axis=axis)
+        res = output_tmp.real
         if (out is None):
             out = res
         else:
             assert out.shape == res.shape
+            assert out.dtype == res.dtype
             out[...] = res
     elif (type==2):
         # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
@@ -42,20 +138,30 @@ def dct(a, out=None, type=2, axis=-1):
         slc5  = mk_view(ndim, axis, None, n1,   None)
         slc6  = mk_view(ndim, axis, n1,   None, None)
 
-        X = np.empty_like(a)
-        np.concatenate((a[slc1], a[slc2][slc3]), axis=axis, out=X)
-        X = _FFT.rfft(X, axis=axis)
-        X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4]
+        if (input_tmp is None):
+            input_tmp = np.empty(shape=sin, dtype=din)
+        if (output_tmp is None):
+            output_tmp = np.empty(shape=sout, dtype=dout)
+        assert input_tmp.shape == sin
+        assert input_tmp.dtype == din
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
+
+        np.concatenate((x[slc1], x[slc2][slc3]), axis=axis, out=input_tmp)
+        rfft(x=input_tmp, out=output_tmp, axis=axis)
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
+        output_tmp *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4]
         
         if (out is None):
-            out = np.empty_like(a)
+            out = np.empty_like(x)
         else:
-            assert out.shape == a.shape
-        out[slc5] = +X.real[slc5]
-        out[slc6] = -X.imag[slc0][slc3]
+            assert out.shape == x.shape
+            assert out.dtype == x.dtype
+        out[slc5] = +output_tmp.real[slc5]
+        out[slc6] = -output_tmp.imag[slc0][slc3]
     elif (type==3):
         # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
-        ctype = float_to_complex_dtype(a.dtype)
         n0 = N//2 + 1
         n1 = (N-1)//2 + 1
         slc0  = mk_view(ndim, axis, None, n0,   None)
@@ -66,21 +172,32 @@ def dct(a, out=None, type=2, axis=-1):
         slc5  = mk_view(ndim, axis, None, n1,   None)
         slc6  = mk_view(ndim, axis, 1,    None, +2)
         slc7  = mk_view(ndim, axis, None, None, None, default=None)
-        s0    = mk_shape(shape, axis, n0)
-
-        X = np.zeros(shape=s0, dtype=ctype)
-        X.real[slc0] = +a[slc0]
-        X.imag[slc1] = -a[slc2][slc3]
-        X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc7]
-        X = _FFT.irfft(X, axis=axis, n=N)
-        X *= N
+        slc8  = mk_view(ndim, axis, n0  , None, None)
+        slc9  = mk_view(ndim, axis, None, 1   , None)
+        
+        if (input_tmp is None):
+            input_tmp = np.empty(shape=sin, dtype=din)
+        if (output_tmp is None):
+            output_tmp = np.empty(shape=sout, dtype=dout)
+        assert input_tmp.shape == sin
+        assert input_tmp.dtype == din
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
+        input_tmp.real[slc0] = +x[slc0]
+        input_tmp.real[slc8] = 0.0
+        input_tmp.imag[slc1] = -x[slc2][slc3]
+        input_tmp.imag[slc9] = 0.0
+        input_tmp *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc7]
+        output_tmp = irfft(x=input_tmp, out=output_tmp, axis=axis, n=N)
+        output_tmp *= N
 
         if (out is None):
-            out = np.empty_like(a)
+            out = np.empty_like(x)
         else:
-            assert out.shape == a.shape
-        out[slc4] = X[slc5]
-        out[slc6] = X[slc2][slc3]
+            assert out.shape == x.shape
+            assert out.dtype == x.dtype
+        out[slc4] = output_tmp[slc5]
+        out[slc6] = output_tmp[slc2][slc3]
     else:
         stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']
         msg='DCT-{} has not been implemented yet.'
@@ -88,24 +205,34 @@ def dct(a, out=None, type=2, axis=-1):
         raise NotImplementedError(msg)
     return out
 
-def dst(a, out=None, type=2, axis=-1):
-    ndim  = a.ndim
-    shape = a.shape
-    N     = a.shape[axis]
+def dst(x, out=None, type=2, axis=-1, input_tmp=None, output_tmp=None):
+    (sin, din, sout, dout) = setup_transform(x, axis, 'dst', False, type)
+    ndim  = x.ndim
+    shape = x.shape
+    N     = x.shape[axis]
     if (type==1):
         # O(sqrt(log(N))) error, O(2N) complexity, O(4*N) memory
         slc0  = mk_view(ndim, axis, None, None, -1)
         slc1  = mk_view(ndim, axis, 1,    -1,   None)
-        s0 = mk_shape(shape, axis, 2*N+2)
         s1 = mk_shape(shape, axis, 1)
-        X = np.empty(shape=s0, dtype=a.dtype)
-        Z = np.zeros(shape=s1, dtype=a.dtype)
-        np.concatenate((Z, -a, Z, a[slc0]), axis=axis, out=X)
-        res = _FFT.rfft(X, axis=axis).imag
+        if (input_tmp is None):
+            input_tmp = np.empty(shape=sin, dtype=din)
+        if (output_tmp is None):
+            output_tmp = np.empty(shape=sout, dtype=dout)
+        assert input_tmp.shape == sin
+        assert input_tmp.dtype == din
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
+
+        Z = np.zeros(shape=s1, dtype=x.dtype)
+        np.concatenate((Z, -x, Z, x[slc0]), axis=axis, out=input_tmp)
+        rfft(x=input_tmp, out=output_tmp, axis=axis)
+        res = output_tmp.imag
         if (out is None):
-            out = np.empty_like(a)
+            out = np.empty_like(x)
         else:
-            assert out.shape == a.shape
+            assert out.shape == x.shape
+            assert out.dtype == x.dtype
         out[...] = res[slc1]
     elif (type==2):
         # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
@@ -120,20 +247,29 @@ def dst(a, out=None, type=2, axis=-1):
         slc6  = mk_view(ndim, axis, n1-1, None, None)
         slc7  = mk_view(ndim, axis, 1,    n1,   None)
 
-        X = np.empty_like(a)
-        np.concatenate((a[slc1], -a[slc2][slc3]), axis=axis, out=X)
-        X = _FFT.rfft(X, axis=axis)
-        X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4]
+        if (input_tmp is None):
+            input_tmp = np.empty(shape=sin, dtype=din)
+        if (output_tmp is None):
+            output_tmp = np.empty(shape=sout, dtype=dout)
+        assert input_tmp.shape == sin
+        assert input_tmp.dtype == din
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
+
+        np.concatenate((x[slc1], -x[slc2][slc3]), axis=axis, out=input_tmp)
+        rfft(x=input_tmp, out=output_tmp, axis=axis)
+        output_tmp *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4]
         
         if (out is None):
-            out = np.empty_like(a)
+            out = np.empty_like(x)
         else:
-            assert out.shape == a.shape
-        out[slc5] = -X.imag[slc7]
-        out[slc6] = +X.real[slc3]
+            assert out.shape == x.shape
+            assert out.dtype == x.dtype
+        out[slc5] = -output_tmp.imag[slc7]
+        out[slc6] = +output_tmp.real[slc3]
     elif (type==3):
         # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory
-        ctype = float_to_complex_dtype(a.dtype)
+        ctype = float_to_complex_dtype(x.dtype)
         n0 = N//2 + 1
         n1 = (N-1)//2 + 1
         slc0  = mk_view(ndim, axis, None, n0,   None)
@@ -145,21 +281,34 @@ def dst(a, out=None, type=2, axis=-1):
         slc6  = mk_view(ndim, axis, None, n1,   None)
         slc7  = mk_view(ndim, axis, 1,    None, 2)
         slc8  = mk_view(ndim, axis, n1,   None, None)
+        slc9  = mk_view(ndim, axis, n0,   None,   None)
+        slc10 = mk_view(ndim, axis, 0, 1, None)
         s0    = mk_shape(shape, axis, n0)
+        
+        if (input_tmp is None):
+            input_tmp = np.empty(shape=sin, dtype=din)
+        if (output_tmp is None):
+            output_tmp = np.empty(shape=sout, dtype=dout)
+        assert input_tmp.shape == sin
+        assert input_tmp.dtype == din
+        assert output_tmp.shape == sout
+        assert output_tmp.dtype == dout
 
-        X = np.zeros(shape=s0, dtype=ctype)
-        X.real[slc0] = +a[slc1][slc0]
-        X.imag[slc2] = -a[slc3]
-        X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc4]
-        X = _FFT.irfft(X, axis=axis, n=N)
-        X[...] *= N
+        input_tmp.real[slc0] = +x[slc1][slc0]
+        input_tmp.real[slc9] = 0.0
+        input_tmp.imag[slc2] = -x[slc3]
+        input_tmp.imag[slc10] = 0.0
+        input_tmp *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc4]
+        irfft(x=input_tmp, out=output_tmp, axis=axis, n=N)
+        output_tmp[...] *= N
 
         if (out is None):
-            out = np.empty_like(a)
+            out = np.empty_like(x)
         else:
-            assert out.shape == a.shape
-        out[slc5] = +X[slc6]
-        out[slc7] = -X[slc8][slc1]
+            assert out.shape == x.shape
+            assert out.dtype == x.dtype
+        out[slc5] = +output_tmp[slc6]
+        out[slc7] = -output_tmp[slc8][slc1]
     else:
         stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII']
         msg='DCT-{} has not been implemented yet.'
@@ -167,13 +316,14 @@ def dst(a, out=None, type=2, axis=-1):
         raise NotImplementedError(msg)
     return out
 
-def idct(a, out=None, type=2, axis=-1, **kwds):
+def idct(x, out=None, type=2, axis=-1, **kwds):
     itype = [1,3,2,4][type-1]
-    return dct(a=a, out=out, type=itype, axis=axis, **kwds)
+    return dct(x=x, out=out, type=itype, axis=axis, **kwds)
 
-def idst(a, out=None, type=2, axis=-1, **kwds):
+def idst(x, out=None, type=2, axis=-1, **kwds):
     itype = [1,3,2,4][type-1]
-    return dst(a=a, out=out, type=itype, axis=axis, **kwds)
+    return dst(x=x, out=out, type=itype, axis=axis, **kwds)
+
 
 
 class MklFFTPlan(HostFFTPlanI):
@@ -181,24 +331,88 @@ class MklFFTPlan(HostFFTPlanI):
     Wrap a mkl fft call (mkl.fft does not offer real planning capabilities). 
     """
 
-    def __init__(self, fn, a, out, scaling=None, **kwds):
+    def __init__(self, planner, fn, a, out, axis, scaling=None, **kwds):
         super(MklFFTPlan, self).__init__()
         
+        self.planner      = planner
         self.fn           = fn
         self.a            = a
         self.out          = out
         self.scaling      = scaling
 
+        (sin, din, sout, dout) = setup_transform(a, axis, 
+                'dct' if fn in (dct, idct) else 'dst' if fn in (dst, idst) else None,
+                fn in (idct, idst),
+                kwds.get('type', None))
+
+        if (sin is None):
+            self._required_input_tmp = None
+        else:
+            self._required_input_tmp = {'size': np.prod(sin, dtype=np.int64), 'shape':sin, 'dtype':din}
+
+        if (sout is None):
+            self._required_output_tmp = None
+        else:
+            self._required_output_tmp = {'size': np.prod(sout, dtype=np.int64), 'shape':sout, 'dtype':dout}
+        
+        self._allocated = False
+
         if isinstance(a, HostArray):
             a = a.handle
         if isinstance(out, HostArray):
             out = out.handle
+
+        self.rescale = self.bake_scaling_plan(out, scaling)
+
     
         kwds = kwds.copy()
-        kwds['a'] = a
-        if fn in (dct, idct, dst, idst):
-            kwds['out'] = out
-        self.kwds         = kwds
+        kwds['x']    = a
+        kwds['out']  = out
+        kwds['axis'] = axis
+        self.kwds   = kwds
+
+    def bake_scaling_plan(self, x, scaling):
+        if (scaling is None):
+            def _rescale():
+                pass
+            return _rescale
+        target =  __DEFAULT_NUMBA_TARGET__
+        signature, layout = make_numba_signature(x, scaling)
+        if (x.ndim == 1):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def scale(x, scaling):
+                for i in xrange(0, x.shape[0]):
+                    x[i] *= scaling
+        elif (x.ndim == 2):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def scale(x, scaling):
+                for i in prange(0, x.shape[0]):
+                    for j in xrange(0, x.shape[1]):
+                        x[i,j] *= scaling
+        elif (x.ndim == 3):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def scale(x, scaling):
+                for i in prange(0, x.shape[0]):
+                    for j in prange(0, x.shape[1]):
+                        for k in xrange(0, x.shape[2]):
+                            x[i,j,k] *= scaling
+        elif (x.ndim == 4):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def scale(x, scaling):
+                for i in prange(0, x.shape[0]):
+                    for j in prange(0, x.shape[1]):
+                        for k in prange(0, x.shape[2]):
+                            for l in xrange(0, x.shape[3]):
+                                x[i,j,k,l] *= scaling
+        else:
+            raise NotImplementedError(x.ndim)
+        def _rescale(scale=scale, x=x, scaling=scaling):
+            scale(x,scaling)
+        return _rescale
 
     @property
     def input_array(self):
@@ -209,90 +423,160 @@ class MklFFTPlan(HostFFTPlanI):
         return self.out
 
     def execute(self):
-        out     = self.out
-        scaling = self.scaling
+        if not self._allocated:
+            self.allocate()
+        self.fn(**self.kwds)
+        self.rescale()
+
+
+    @property
+    def required_buffer_size(self):
+        alignment = simd_alignment
+        if self._required_input_tmp:
+            sin, din   = self._required_input_tmp['size'], self._required_input_tmp['dtype']
+            sin *= get_itemsize(din)
+            Bin  = ((sin+alignment-1)//alignment)*alignment
+        else:
+            Bin = 0
+        if self._required_output_tmp:
+            sout, dout = self._required_output_tmp['size'], self._required_output_tmp['dtype']
+            sout *= get_itemsize(dout)
+            Bout = ((sout+alignment-1)//alignment)*alignment
+        else: 
+            Bout = 0
+        return Bin+Bout
+        
+    def allocate(self, buf=None):
+        """Allocate plan extra memory, possibly with a custom buffer."""
+        if self._allocated:
+            msg='Plan was already allocated.'
+            raise RuntimeError(msg)
         
-        if self.fn in (dct, idct, dst, idst):
-            self.fn(**self.kwds)
+        if (buf is not None):
+            alignment = simd_alignment
+            if self._required_input_tmp:
+                sin, din, ssin = self._required_input_tmp['size'], self._required_input_tmp['dtype'], self._required_input_tmp['shape']
+                sin *= get_itemsize(din)
+                Bin  = ((sin+alignment-1)//alignment)*alignment
+            else:
+                Bin = 0
+            if self._required_output_tmp:
+                sout, dout, ssout = self._required_output_tmp['size'], self._required_output_tmp['dtype'], self._required_output_tmp['shape']
+                sout *= get_itemsize(dout)
+                Bout = ((sout+alignment-1)//alignment)*alignment
+            else: 
+                Bout = 0
+            assert buf.dtype.itemsize == 1
+            assert buf.size == Bin+Bout
+            input_buf = buf[:sin].view(dtype=din).reshape(ssin) if Bin else None
+            output_buf = buf[Bin:Bin+sout].view(dtype=dout).reshape(ssout) if Bout else None
         else:
-            out[...] = self.fn(**self.kwds)
+            input_buf = None
+            output_buf = None
 
-        if (scaling is not None):
-            out[...] *= scaling
+        for (k, buf, required_tmp) in zip(('input', 'output'), 
+                                          (input_buf, output_buf),
+                                          (self._required_input_tmp, self._required_output_tmp)):
+            if (required_tmp is None):
+                assert buf is None
+                continue
+            size  = required_tmp['size']
+            shape = required_tmp['shape']
+            dtype = required_tmp['dtype']
+            if (size>0):
+                if (buf is None):
+                    if self.planner.warn_on_allocation or self.planner.error_on_allocation:
+                        msg='Allocating temporary buffer of size {} for clFFT::{}.'
+                        msg=msg.format(bytes2str(size), id(self))
+                        if self.planner.error_on_allocation:
+                            raise RuntimeError(msg)
+                        else:
+                            warnings.warn(msg, HysopMKLFftWarning)
+                    buf = self.planner.backend.empty(shape=shape, 
+                                                     dtype=dtype)
+                elif (buf.shape != shape) or (buf.dtype != dtype):
+                    msg='Buffer does not match required shape: {} != {}'
+                    msg=msg.format(buf.shape, shape)
+                    msg='Buffer does not match required dtype: {} != {}'
+                    msg=msg.format(buf.dtype, dtype)
+                    raise ValueError(msg)
+            if isinstance(buf, HostArray):
+                buf = buf.handle
+            setattr(self, '{}_tmp_buffer'.format(k), buf)
+            if (buf is not None):
+                self.kwds['{}_tmp'.format(k)] = buf
+        self._allocated = True
+        return self
 
 
 class MklFFT(HostFFTI):
     """
     Interface to compute local to process FFT-like transforms using the mkl fft backend.
 
-    Mkl fft backend has many disadvantages: 
-        - only double precision is really supported
-        - single precision is supported by casting to double precision
-        - creates intermediate temporary buffers at each call
-        - no planning capabilities (mkl.fft methods are just wrapped into fake plans)
-
-    The only advantage is that planning won't destroy original inputs.
+    Mkl fft backend has some disadvantages: 
+      - creates intermediate temporary buffers at each call (out and tmp for real-to-real transforms)
+      - no planning capabilities (mkl.fft methods are just wrapped into fake plans)
     """
 
     def __init__(self, backend=None, allocator=None, 
-                        warn_on_allocation=True, **kwds):
+                        warn_on_allocation=True, error_on_allocation=False, 
+                        destroy_input=None, **kwds):
         super(MklFFT, self).__init__(backend=backend, allocator=allocator, 
-                warn_on_allocation=warn_on_allocation, **kwds)
+                warn_on_allocation=warn_on_allocation, error_on_allocation=error_on_allocation, **kwds)
         self.supported_ftypes = (np.float32, np.float64,)
         self.supported_ctypes = (np.complex64, np.complex128,)
     
-    
     def fft(self, a, out=None, axis=-1, **kwds):
         (shape, dtype) = super(MklFFT, self).fft(a=a, out=out, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=_FFT.fft, a=a, out=out, axis=axis, **kwds)
+        plan = MklFFTPlan(self, fn=fft, a=a, out=out, axis=axis, **kwds)
         return plan
 
     def ifft(self, a, out=None, axis=-1, **kwds):
         (shape, dtype, s) = super(MklFFT, self).ifft(a=a, out=out, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=_FFT.ifft, a=a, out=out, axis=axis, **kwds)
+        plan = MklFFTPlan(self, fn=ifft, a=a, out=out, axis=axis, **kwds)
         return plan
 
     def rfft(self, a, out=None, axis=-1, **kwds):
         (shape, dtype) = super(MklFFT, self).rfft(a=a, out=out, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=_FFT.rfft, a=a, out=out, axis=axis, 
+        plan = MklFFTPlan(self, fn=rfft, a=a, out=out, axis=axis, 
                             **kwds)
         return plan
 
     def irfft(self, a, out=None, n=None, axis=-1, **kwds):
         (shape, dtype, s) = super(MklFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=_FFT.irfft, a=a, out=out, axis=axis, 
+        plan = MklFFTPlan(self, fn=irfft, a=a, out=out, axis=axis, 
                             n=shape[axis], **kwds)
         return plan
     
     def dct(self, a, out=None, type=2, axis=-1, **kwds):
         (shape, dtype) = super(MklFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=dct, a=a, out=out, axis=axis, type=type, **kwds)
+        plan = MklFFTPlan(self, fn=dct, a=a, out=out, axis=axis, type=type, **kwds)
         return plan
     
     def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
         (shape, dtype, _, s) = super(MklFFT, self).idct(a=a, out=out, type=type, 
                                     axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=idct, a=a, out=out, axis=axis, type=type, 
+        plan = MklFFTPlan(self, fn=idct, a=a, out=out, axis=axis, type=type, 
                                 scaling=first_not_None(scaling, 1.0/s), **kwds)
         return plan
     
     def dst(self, a, out=None, type=2, axis=-1, **kwds):
         (shape, dtype) = super(MklFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=dst, a=a, out=out, axis=axis, type=type, **kwds)
+        plan = MklFFTPlan(self, fn=dst, a=a, out=out, axis=axis, type=type, **kwds)
         return plan
 
     def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds):
         (shape, dtype, _, s) = super(MklFFT, self).idst(a=a, out=out, type=type, axis=axis, 
                 **kwds)
         out = self.allocate_output(out, shape, dtype) 
-        plan = MklFFTPlan(fn=idst, a=a, out=out, axis=axis, type=type,
+        plan = MklFFTPlan(self, fn=idst, a=a, out=out, axis=axis, type=type,
                                 scaling=first_not_None(scaling, 1.0/s), **kwds)
         return plan
 
diff --git a/hysop/numerics/fft/host_fft.py b/hysop/numerics/fft/host_fft.py
index 15e1cb03cb550e900f3a7aec20a1bae4f2b24c87..21b16859881c9ec9ae7840f7c07cd87b57b4a347 100644
--- a/hysop/numerics/fft/host_fft.py
+++ b/hysop/numerics/fft/host_fft.py
@@ -13,11 +13,38 @@ import numba as nb
 from hysop import __FFTW_NUM_THREADS__, __FFTW_PLANNER_EFFORT__, __FFTW_PLANNER_TIMELIMIT__, \
                     __DEFAULT_NUMBA_TARGET__
 from hysop.tools.types import first_not_None, check_instance
-from hysop.tools.numba_utils import make_numba_signature
+from hysop.tools.numba_utils import HAS_NUMBA, bake_numba_copy, bake_numba_accumulate, bake_numba_transpose
+from hysop.tools.decorators import static_vars
 from hysop.backend.host.host_array_backend import HostArrayBackend
 from hysop.backend.host.host_array import HostArray
 from hysop.numerics.fft.fft import FFTQueueI, FFTPlanI, FFTI
 
+def can_exec_hptt(src, dst):
+    if (src is dst):
+        return False
+    if (src.dtype != dst.dtype):
+        return False
+    if src.dtype not in (np.float32, np.float64, np.complex64, np.complex128):
+        return False
+    if src.flags['C_CONTIGUOUS'] != dst.flags['C_CONTIGUOUS']:
+        return False
+    if src.flags['F_CONTIGUOUS'] != dst.flags['F_CONTIGUOUS']:
+        return False
+    if not (src.flags['C_CONTIGUOUS'] or src.flags['F_CONTIGUOUS']):
+        return False
+    return (src.data is not dst.data)
+
+try:
+    import hptt
+    HAS_HPTT=True
+    # required version is: https://gitlab.com/keckj/hptt
+except ImportError:
+    HAS_HPTT=False
+    import warnings
+    from hysop.tools.warning import HysopPerformanceWarning
+    msg='Failed to import HPTT module, falling back to slow numpy transpose. Required version is available at https://gitlab.com/keckj/hptt.'
+    warnings.warn(msg, HysopPerformanceWarning)
+
 class DummyEvent(object):
     @classmethod
     def wait(cls):
@@ -74,22 +101,32 @@ class HostFFTI(FFTI):
                        error_on_allocation=False,
                        **kwds):
         """
-        Get the default host FFT interface which is a multithreaded FFTW interface with 
-        ESTIMATE planning effort.
+        Get the default host FFT interface.
+        Preferred interface is multithreaded MKL FFT with the TBB threading layer (does not work with Intel threading layer).
+        On import error the interface falls back to a multithreaded FFTW interface with ESTIMATE planning effort.
         """
-        threads            = first_not_None(threads,            __FFTW_NUM_THREADS__)
-        planner_effort     = first_not_None(planner_effort,     __FFTW_PLANNER_EFFORT__)
-        planning_timelimit = first_not_None(planning_timelimit, __FFTW_PLANNER_TIMELIMIT__)
-        from hysop.numerics.fft.fftw_fft import FftwFFT
-        return FftwFFT(threads=threads,  
-                       planner_effort=planner_effort, 
-                       planning_timelimit=planning_timelimit,
-                       backend=backend, allocator=allocator,
-                       destroy_input=destroy_input,
-                       warn_on_allocation=warn_on_allocation,
-                       warn_on_misalignment=warn_on_misalignment,
-                       error_on_allocation=error_on_allocation,
-                       **kwds)
+        try:
+            from hysop.numerics.fft._mkl_fft import MklFFT
+            return MklFFT(backend=backend, allocator=allocator,
+                          destroy_input=destroy_input,
+                          warn_on_allocation=warn_on_allocation,
+                          error_on_allocation=error_on_allocation,
+                          **kwds)
+        except ImportError:
+            raise
+            from hysop.numerics.fft.fftw_fft import FftwFFT
+            threads            = first_not_None(threads,            __FFTW_NUM_THREADS__)
+            planner_effort     = first_not_None(planner_effort,     __FFTW_PLANNER_EFFORT__)
+            planning_timelimit = first_not_None(planning_timelimit, __FFTW_PLANNER_TIMELIMIT__)
+            return FftwFFT(threads=threads,  
+                           planner_effort=planner_effort, 
+                           planning_timelimit=planning_timelimit,
+                           backend=backend, allocator=allocator,
+                           destroy_input=destroy_input,
+                           warn_on_allocation=warn_on_allocation,
+                           warn_on_misalignment=warn_on_misalignment,
+                           error_on_allocation=error_on_allocation,
+                           **kwds)
     
     def new_queue(self, tg, name):
         return HostFFTQueue(name=name)
@@ -97,22 +134,55 @@ class HostFFTI(FFTI):
     def plan_copy(self, tg, src, dst):
         src = self.ensure_callable(src)
         dst = self.ensure_callable(dst)
+        
+        @static_vars(numba_copy=None)
         def exec_copy(src=src, dst=dst):
-            dst()[...] = src()
+            src, dst = src(), dst()
+            if HAS_HPTT and can_exec_hptt(src, dst):
+                hptt.tensorTransposeAndUpdate(perm=range(src.ndim),
+                        alpha=1.0, A=src, beta=0.0, B=dst)
+            elif HAS_NUMBA:
+                if (exec_copy.numba_copy is None):
+                    exec_copy.numba_copy = bake_numba_copy(src=src, dst=dst)
+                exec_copy.numba_copy()
+            else:
+                dst[...] = src
         return exec_copy
     
     def plan_accumulate(self, tg, src, dst):
         src = self.ensure_callable(src)
         dst = self.ensure_callable(dst)
-        def exec_copy(src=src, dst=dst):
-            dst()[...] += src()
+        
+        @static_vars(numba_accumulate=None)
+        def exec_accumulate(src=src, dst=dst):
+            src, dst = src(), dst()
+            if HAS_HPTT and can_exec_hptt(src, dst):
+                hptt.tensorTransposeAndUpdate(perm=range(src.ndim),
+                        alpha=1.0, A=src, beta=1.0, B=dst)
+            elif HAS_NUMBA:
+                if (exec_accumulate.numba_accumulate is None):
+                    exec_accumulate.numba_accumulate = bake_numba_accumulate(src=src, dst=dst)
+                exec_accumulate.numba_accumulate()
+            else:
+                dst[...] += src
         return exec_copy
 
     def plan_transpose(self, tg, src, dst, axes):
         src = self.ensure_callable(src)
         dst = self.ensure_callable(dst)
+
+        @static_vars(numba_transpose=None)
         def exec_transpose(src=src, dst=dst, axes=axes):
-            dst()[...] = np.transpose(a=src(), axes=axes)
+            src, dst = src(), dst()
+            if HAS_HPTT and can_exec_hptt(src, dst):
+                hptt.tensorTransposeAndUpdate(perm=axes,
+                        alpha=1.0, A=src, beta=0.0, B=dst)
+            elif HAS_NUMBA:
+                if (exec_transpose.numba_transpose is None):
+                    exec_transpose.numba_transpose = bake_numba_transpose(src=src, dst=dst, axes=axes)
+                exec_transpose.numba_transpose()
+            else:
+                dst[...] = np.transpose(a=src, axes=axes)
         return exec_transpose
     
     def plan_fill_zeros(self, tg, a, slices):
diff --git a/hysop/numerics/tests/test_fft.py b/hysop/numerics/tests/test_fft.py
index 3c4f4d0e9d50b2d9358390ff6b21fb150ec5929e..8f7d461bc2218765fddbf9bf4b03eba4b54d3a37 100644
--- a/hysop/numerics/tests/test_fft.py
+++ b/hysop/numerics/tests/test_fft.py
@@ -20,9 +20,14 @@ from hysop.numerics.fft.numpy_fft  import NumpyFFT
 from hysop.numerics.fft.scipy_fft  import ScipyFFT
 from hysop.numerics.fft.fftw_fft   import FftwFFT
 from hysop.numerics.fft.gpyfft_fft import GpyFFT
+try:
+    from hysop.numerics.fft._mkl_fft import MklFFT
+    HAS_MKLFFT=True
+except:
+    HAS_MKLFFT=False
+    raise
 
-# For now Intel has a bug in their MKL FFTW-like interface so we do not test it
-from hysop.numerics.fft._mkl_fft import MklFFT
+raise_on_failure = False
 
 class TestFFT(object):
 
@@ -32,22 +37,24 @@ class TestFFT(object):
             'scipy': ScipyFFT(warn_on_allocation=False),
             'fftw':  FftwFFT(warn_on_allocation=False,
                              warn_on_misalignment=False),
-            'mkl':   MklFFT(warn_on_allocation=False),
         },
         Implementation.OPENCL: {}
     }
+    
+    if HAS_MKLFFT:
+        implementations[Implementation.PYTHON]['mkl'] = MklFFT(warn_on_allocation=False)
 
     print
     print ':: STARTING FFT BACKEND TESTS ::'
-    for (i,cl_env) in enumerate(iter_clenv()):
-        print '> Registering opencl backend {} as:\n{}'.format(
-                i, cl_env)
-        print
-        name = 'clfft{}'.format(i)
-        implementations[Implementation.OPENCL][name] = \
-            GpyFFT(cl_env=cl_env,
-                   warn_on_allocation=False,
-                   warn_on_unaligned_output_offset=False)
+    #for (i,cl_env) in enumerate(iter_clenv()):
+        #print '> Registering opencl backend {} as:\n{}'.format(
+                #i, cl_env)
+        #print
+        #name = 'clfft{}'.format(i)
+        #implementations[Implementation.OPENCL][name] = \
+            #GpyFFT(cl_env=cl_env,
+                   #warn_on_allocation=False,
+                   #warn_on_unaligned_output_offset=False)
 
     msg_shape = 'Expected output array shape to be {} but got {} for implementation {}.'
     msg_dtype = 'Expected output array dtype to be {} but got {} for implementation {}.'
@@ -57,6 +64,8 @@ class TestFFT(object):
     report_eps = 10
     fail_eps   = 100
 
+    stop_on_error = True
+
     def _test_1d(self, dtype, failures):
         print
         print '::Testing 1D transform, precision {}::'.format(dtype.__name__)
@@ -98,6 +107,8 @@ class TestFFT(object):
                     shape=results[r0].shape
                     failures.setdefault(tag, []).append((r0, r1, shape, Einf, Eeps))
             print ', '.join(ss)
+            if failed and raise_on_failure:
+                raise RuntimeError
 
 
         print '\n FORWARD C2C: complex to complex forward transform'
@@ -119,7 +130,7 @@ class TestFFT(object):
                         assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype,
                                 name)
                         Bin[...] = Href
-                        plan.execute()
+                        evt = plan.execute()
                         H0 = Bin.get()
                         H1 = Bout.get()
                         assert np.array_equal(Href, H0), self.msg_input_modified.format(name)
@@ -467,7 +478,8 @@ class TestFFT(object):
             if failed:
                 print
                 msg='Some implementations failed !'
-                #raise RuntimeError(msg)
+                if raise_on_failure:
+                    raise RuntimeError(msg)
 
         print '\n C2C-C2C transform'
         for (shape, cshape, rshape, N, Nc, Nr,
@@ -573,7 +585,7 @@ class TestFFT(object):
                 check_distances(results)
 
         for (itype,stype) in enumerate(types, 1):
-            print '\n DST-{}: real to real discrete sinine transform {}'.format(
+            print '\n DST-{}: real to real discrete sine transform {}'.format(
                     stype.strip(), itype)
             ttype = 'SIN{}'.format(itype)
             for (shape, cshape, rshape, N, Nc, Nr,
@@ -696,7 +708,7 @@ class TestFFT(object):
         else:
             dtypes = (HYSOP_REAL,)
         for dtype in dtypes:
-            self._test_forward_backward_1d(dtype=dtype)
+            #self._test_forward_backward_1d(dtype=dtype)
             self._test_1d(dtype=dtype, failures=failures.setdefault(dtype.__name__, {}))
         self.report_failures(failures)
 
diff --git a/hysop/tools/io_utils.py b/hysop/tools/io_utils.py
index 34c85ec3eed063df4b8bd8d35b942a9972df607a..1209d91616412e4f62a90fe43c03ec1e99ed70d4 100755
--- a/hysop/tools/io_utils.py
+++ b/hysop/tools/io_utils.py
@@ -192,7 +192,8 @@ class IO(object):
 
 class IOParams(namedtuple("IOParams", ['filename', 'filepath',
                                        'frequency', 'fileformat',
-                                       'dump_times', 'dump_tstart', 'dump_tend',
+                                       'dump_times_fp32', 'dump_times_fp64',
+                                       'dump_tstart', 'dump_tend',
                                        'io_leader', 'visu_leader',
                                        'kwds'])):
     """
@@ -209,7 +210,7 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath',
     fileformat : int
         Format of the file. See notes for available format. Default=HDF5.
     dump_times: tuple of floats
-        Extra dump times that should be used to dump in addition to frequency
+        Extra dump times that should be used to dump in addition to frequency (double precision)
     dump_tstart: float
         Start to dump at given time. Defaults to -np.inf (no time constraints).
     dump_end: float
@@ -236,11 +237,19 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath',
                 io_leader=0, visu_leader=0,
                 **kwds):
 
-        dump_times  = first_not_None(dump_times, ())
         dump_tstart = first_not_None(dump_tstart, -np.inf)
         dump_tend   = first_not_None(dump_tend,   +np.inf)
         fileformat  = first_not_None(fileformat, IO.HDF5)
 
+        dump_times = first_not_None(dump_times, ())
+        if (dump_times is not None):
+            check_instance(dump_times, tuple, values=(float,np.float64))
+            dump_times_fp64 = tuple(map(np.float64, dump_times))
+            dump_times_fp32 = tuple(map(np.float32, dump_times))
+        else:
+            dump_times_fp32 = ()
+            dump_times_fp64 = ()
+
         check_instance(frequency, (int,long))
         check_instance(dump_tstart, float)
         check_instance(dump_tend,   float)
@@ -271,7 +280,8 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath',
         IO.check_dir(filepath)
         return super(IOParams, cls).__new__(cls, filename, filepath,
                                             frequency, fileformat,
-                                            dump_times, dump_tstart, dump_tend,
+                                            dump_times_fp32, dump_times_fp64, 
+                                            dump_tstart, dump_tend,
                                             io_leader, visu_leader,
                                             kwds)
 
@@ -282,7 +292,12 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath',
         if (t < self.dump_tstart) or (t > self.dump_tend):
             return dump
         if (frequency >= 0) and simulation.is_time_of_interest:
-            dump |= (t in self.dump_times)
+            if isinstance(t, np.float32):
+                dump |= (t in self.dump_times_fp32)
+            elif isinstance(t, np.float64):
+                dump |= (t in self.dump_times_fp64)
+            else:
+                raise NotImplementedError(type(t))
         if (frequency > 0):
             dump |= ((simulation.current_iteration % frequency) == 0)
         return dump
@@ -306,6 +321,10 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath',
                 all_kwds[k] = kwds.get(k, getattr(self, k))
         return IOParams(**all_kwds)
 
+    @property
+    def dump_times(self):
+        return self.dump_times_fp64
+
     def __str__(self):
         return self.to_string()
 
diff --git a/hysop/tools/numba_utils.py b/hysop/tools/numba_utils.py
index 069ed7caa760adbb7136ed538f92a97cf47baf23..58a73f603acb00a0e61227e4781e82a6fdbbdae2 100644
--- a/hysop/tools/numba_utils.py
+++ b/hysop/tools/numba_utils.py
@@ -1,8 +1,15 @@
 
-import numba as nb
 import numpy as np
+from hysop import __DEFAULT_NUMBA_TARGET__
 from hysop.core.arrays.array import Array
 
+try:
+    import numba as nb
+    from numba import prange
+    HAS_NUMBA=True
+except ImportError:
+    HAS_NUMBA=False
+
 def make_numba_signature(*args, **kwds):
     raise_on_cl_array = kwds.pop('raise_on_cl_array', True)
     if kwds:
@@ -93,3 +100,169 @@ def make_numba_signature(*args, **kwds):
         numba_args += (na,)
 
     return nb.void(*numba_args), ','.join(numba_layout)
+
+
+def bake_numba_copy(dst, src, target=None):
+    if (target is None):
+        target =  __DEFAULT_NUMBA_TARGET__
+    signature, layout = make_numba_signature(dst, src)
+    if (dst.ndim == 1):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def copy(dst, src):
+            for i in xrange(0, dst.shape[0]):
+                dst[i] = src[i]
+    elif (dst.ndim == 2):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def copy(dst, src):
+            for i in prange(0, dst.shape[0]):
+                for j in xrange(0, dst.shape[1]):
+                    dst[i,j] = src[i,j]
+    elif (dst.ndim == 3):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def copy(dst, src):
+            for i in prange(0, dst.shape[0]):
+                for j in prange(0, dst.shape[1]):
+                    for k in xrange(0, dst.shape[2]):
+                        dst[i,j,k] = src[i,j,k]
+    elif (dst.ndim == 4):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def copy(dst, src):
+            for i in prange(0, dst.shape[0]):
+                for j in prange(0, dst.shape[1]):
+                    for k in prange(0, dst.shape[2]):
+                        for l in xrange(0, dst.shape[3]):
+                            dst[i,j,k,l] = src[i,j,k,l]
+    else:
+        raise NotImplementedError(dst.ndim)
+    def _exec_copy(copy=copy, dst=dst, src=src):
+        copy(dst,src)
+    return _exec_copy
+
+
+def bake_numba_accumulate(dst, src, target=None):
+    if (target is None):
+        target =  __DEFAULT_NUMBA_TARGET__
+    signature, layout = make_numba_signature(dst, src)
+    if (dst.ndim == 1):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def accumulate(dst, src):
+            for i in xrange(0, dst.shape[0]):
+                dst[i] += src[i]
+    elif (dst.ndim == 2):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def accumulate(dst, src):
+            for i in prange(0, dst.shape[0]):
+                for j in xrange(0, dst.shape[1]):
+                    dst[i,j] += src[i,j]
+    elif (dst.ndim == 3):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def accumulate(dst, src):
+            for i in prange(0, dst.shape[0]):
+                for j in prange(0, dst.shape[1]):
+                    for k in xrange(0, dst.shape[2]):
+                        dst[i,j,k] += src[i,j,k]
+    elif (dst.ndim == 4):
+        @nb.guvectorize([signature], layout, 
+            target=target, nopython=True, cache=True)
+        def accumulate(dst, src):
+            for i in prange(0, dst.shape[0]):
+                for j in prange(0, dst.shape[1]):
+                    for k in prange(0, dst.shape[2]):
+                        for l in xrange(0, dst.shape[3]):
+                            dst[i,j,k,l] += src[i,j,k,l]
+    else:
+        raise NotImplementedError(dst.ndim)
+    def _exec_accumulate(accumulate=accumulate, dst=dst, src=src):
+        accumulate(dst,src)
+    return _exec_accumulate
+
+
+def bake_numba_transpose(src, dst, axes, target=None):
+    # inefficient permutations
+
+    if (target is None):
+        target =  __DEFAULT_NUMBA_TARGET__
+    signature, layout = make_numba_signature(dst, src)
+
+    assert src.ndim == dst.ndim
+    assert dst.shape == tuple(src.shape[i] for i in axes)
+    assert dst.dtype == src.dtype
+    ndim = src.ndim
+    
+    def noop(dst, src):
+        pass
+    
+    if (ndim == 1):
+        transpose = noop
+    elif (ndim == 2):
+        if axes == (0,1):
+            transpose == noop
+        elif axes == (1,0):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def transpose(dst, src):
+                for i in prange(0, src.shape[0]):
+                    for j in xrange(0, src.shape[1]):
+                        dst[j,i] = src[i,j]
+        else:
+            raise NotImplementedError
+    elif (ndim == 3):
+        if   axes == (0,1,2):
+            transpose == noop
+        elif axes == (0,2,1):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def transpose(dst, src):
+                for i in prange(0, src.shape[0]):
+                    for j in prange(0, src.shape[1]):
+                        for k in xrange(0, src.shape[2]):
+                            dst[i,k,j] = src[i,j,k]
+        elif axes == (1,0,2):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def transpose(dst, src):
+                for i in prange(0, src.shape[0]):
+                    for j in prange(0, src.shape[1]):
+                        for k in xrange(0, src.shape[2]):
+                            dst[j,i,k] = src[i,j,k]
+        elif axes == (1,2,0):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def transpose(dst, src):
+                for i in prange(0, src.shape[0]):
+                    for j in prange(0, src.shape[1]):
+                        for k in xrange(0, src.shape[2]):
+                            dst[j,k,i] = src[i,j,k]
+        elif axes == (2,1,0):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def transpose(dst, src):
+                for i in prange(0, src.shape[0]):
+                    for j in prange(0, src.shape[1]):
+                        for k in xrange(0, src.shape[2]):
+                            dst[k,j,i] = src[i,j,k]
+        elif axes == (2,0,1):
+            @nb.guvectorize([signature], layout, 
+                target=target, nopython=True, cache=True)
+            def transpose(dst, src):
+                for i in prange(0, src.shape[0]):
+                    for j in prange(0, src.shape[1]):
+                        for k in xrange(0, src.shape[2]):
+                            dst[k,i,j] = src[i,j,k]
+        else:
+            raise NotImplementedError(axes)
+    else:
+        raise NotImplementedError(ndim)
+
+    def _exec_transpose(transpose=transpose, dst=dst, src=src):
+        transpose(dst,src)
+    return _exec_transpose
+
+
diff --git a/hysop/tools/types.py b/hysop/tools/types.py
index bba65cfb2690209e8bac93417803488f2ad7312d..0fa5fea24ceaa47b3471e7c3741e980d0a6ee864 100644
--- a/hysop/tools/types.py
+++ b/hysop/tools/types.py
@@ -1,6 +1,5 @@
 from hysop.deps import np
 from collections import Iterable
-from hysop.tools.misc import prod
 
 class InstanceOf(object):
     def __init__(self, cls):
@@ -171,6 +170,7 @@ def check_instance(val, cls, allow_none=False,
                     print_offending_value(v, all_val_cls)
                     raise TypeError(msg)
     elif isinstance(val, np.ndarray):
+        from hysop.tools.misc import prod
         dtype = kargs.pop('dtype', None)
         shape = kargs.pop('shape', None)
         size  = kargs.pop('size', None)