diff --git a/ci/docker_images/ubuntu/bionic/Dockerfile b/ci/docker_images/ubuntu/bionic/Dockerfile
index 872147e972dbc3e5ca3e3f3cbe62baab1cc1442a..0bca021dbc3c8798dfe7d7469aa7c6411c40b13c 100644
--- a/ci/docker_images/ubuntu/bionic/Dockerfile
+++ b/ci/docker_images/ubuntu/bionic/Dockerfile
@@ -220,6 +220,20 @@ RUN cd /tmp                                      \
  && cd -                                         \
  && rm -Rf /tmp/pyFFTW
 
+# HPTT (tensor permutation library)
+RUN cd /tmp
+ && git clone https://gitlab.com/keckj/hptt \
+ && cd hptt                                 \
+ && mkdir build                             \
+ && cd build                                \
+ && cmake -DCMAKE_BUILD_TYPE=Release ..     \
+ && make -j8                                \
+ && make install                            \
+ && cd ../pythonAPI                         \
+ && pip install --upgrade .                 \
+ && cd /tmp                                 \
+ && rm -Rf /tmp/hptt
+
 # ensure all libraries are known by the runtime linker
 RUN ldconfig
 
diff --git a/hysop/backend/device/opencl/autotunable_kernels/transpose.py b/hysop/backend/device/opencl/autotunable_kernels/transpose.py
index d9a16ea93ab765e5921216fd3d3f5ae3a06a1eac..1dbd3fdb78a802c946817ba22653ef3b891f0d63 100644
--- a/hysop/backend/device/opencl/autotunable_kernels/transpose.py
+++ b/hysop/backend/device/opencl/autotunable_kernels/transpose.py
@@ -156,7 +156,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel):
         tile_sizes = tuple( int((2**i)*(3**j))
                 for (i,j) in it.product(range(0,imax+1), range(0,jmax+1)))
         tile_sizes = (max_tile_size,) + tuple(sorted(tile_sizes, reverse=True))
-        tile_sizes = tuple(filter(lambda x: (x>=8) and (x<=max_tile_size), tile_sizes))
+        tile_sizes = tuple(filter(lambda x: (x>=max_tile_size//8) and (x<=max_tile_size), tile_sizes))
+
         
         params.register_extra_parameter('vectorization', vectorization) 
         params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates)
diff --git a/hysop/backend/host/python/operator/transpose.py b/hysop/backend/host/python/operator/transpose.py
index e7e770609b9941397689e473b697edc5288ccd76..840ac9c97c7edcc481d596cee270ec8f094e6405 100644
--- a/hysop/backend/host/python/operator/transpose.py
+++ b/hysop/backend/host/python/operator/transpose.py
@@ -1,11 +1,21 @@
-from hysop.deps import np
-
 from hysop.constants import MemoryOrdering
 from hysop.tools.decorators import debug, profile
 from hysop.backend.host.host_operator import HostOperator
 from hysop.operator.base.transpose_operator import TransposeOperatorBase
 from hysop.core.graph.graph import op_apply
 
+import numpy as np
+try:
+    import hptt
+    HAS_HPTT=True
+    # required version is: https://gitlab.com/keckj/hptt
+except ImportError:
+    HAS_HPTT=False
+    import warnings
+    from hysop.tools.warning import HysopPerformanceWarning
+    msg='Failed to import HPTT module, falling back to slow numpy transpose. Required version is available at https://gitlab.com/keckj/hptt.'
+    warnings.warn(msg, HysopPerformanceWarning)
+
 class PythonTranspose(TransposeOperatorBase, HostOperator):
     """
     Inplace and out of place field transposition and permutations in general.
@@ -17,6 +27,21 @@ class PythonTranspose(TransposeOperatorBase, HostOperator):
         """Initialize a Transpose operator on the python backend."""
         super(PythonTranspose, self).__init__(**kwds)
 
+    
+    def discretize(self):
+        super(PythonTranspose, self).discretize()
+        assert self.din.dtype == self.dout.dtype
+        dtype = self.din.dtype
+        if HAS_HPTT and (dtype in (np.float32, np.float64, np.complex64, np.complex128)):
+            if self.is_inplace:
+                self.exec_transpose = self.transpose_hptt_inplace
+            else:
+                self.exec_transpose = self.transpose_hptt_outofplace
+        else:
+            if self.is_inplace:
+                self.exec_transpose = self.transpose_np_inplace
+            else:
+                self.exec_transpose = self.transpose_np_outofplace
 
     @debug
     def get_field_requirements(self):
@@ -27,26 +52,44 @@ class PythonTranspose(TransposeOperatorBase, HostOperator):
             (field, td, req) = reqs
             req.memory_order = MemoryOrdering.ANY
         return requirements
+    
+    @classmethod
+    def supports_mpi(cls):
+        return True
 
     @op_apply
     def apply(self, **kwds):
         """ Transpose in or out of place."""
         super(PythonTranspose,self).apply(**kwds)
+        self.exec_transpose(**kwds)
+    
+    def transpose_hptt_inplace(self, **kwds):
+        axes = self.axes
+        din, dout, dtmp = self.din, self.dout, self.dtmp.handle.view(np.ndarray)
+        assert self.din.dfield is self.dout.dfield
+        for i in xrange(din.nb_components):
+            hptt.transpose(a=din.buffers[i], out=dtmp, axes=axes)
+            dout.buffers[i][...] = dtmp
+            
+    def transpose_hptt_outofplace(self, **kwds):
+        axes = self.axes
+        din, dout = self.din, self.dout
+        assert self.din.dfield is not self.dout.dfield
+        for i in xrange(din.nb_components):
+            hptt.transpose(a=din.buffers[i], out=dout.buffers[i], axes=axes)
 
+    def transpose_np_inplace(self, **kwds):
         axes = self.axes
-        
-        if self.is_inplace:
-            din, dout, dtmp = self.din, self.dout, self.dtmp.handle.view(np.ndarray)
-            assert self.din.dfield is self.dout.dfield
-            for i in xrange(din.nb_components):
-                dtmp[...] = np.transpose(din.buffers[i], axes=axes)
-                dout.buffers[i][...] = dtmp
-        else:
-            din, dout = self.din, self.dout
-            assert self.din.dfield is not self.dout.dfield
-            for i in xrange(din.nb_components):
-                dout.buffers[i][...] = np.transpose(din.buffers[i], axes=axes)
-    
-    @classmethod
-    def supports_mpi(cls):
-        return True
+        din, dout, dtmp = self.din, self.dout, self.dtmp.handle.view(np.ndarray)
+        assert self.din.dfield is self.dout.dfield
+        for i in xrange(din.nb_components):
+            dtmp[...] = np.transpose(din.buffers[i], axes=axes)
+            dout.buffers[i][...] = dtmp
+            
+    def transpose_np_outofplace(self, **kwds):
+        axes = self.axes
+        din, dout = self.din, self.dout
+        assert self.din.dfield is not self.dout.dfield
+        for i in xrange(din.nb_components):
+            dout.buffers[i][...] = np.transpose(din.buffers[i], axes=axes)
+
diff --git a/hysop/operator/tests/test_transpose.py b/hysop/operator/tests/test_transpose.py
index 655f2a73b110f083eeaebd47c5a330aba4183304..ce3da5a76ea3fd88d7d025805abbebf4c66c4279 100644
--- a/hysop/operator/tests/test_transpose.py
+++ b/hysop/operator/tests/test_transpose.py
@@ -21,10 +21,10 @@ class TestTransposeOperator(object):
         IO.set_default_path('/tmp/hysop_tests/test_transpose')
         
         if enable_debug_mode:
-            cls.size_min = 2
-            cls.size_max = 6
+            cls.size_min = 3
+            cls.size_max = 4
         else:
-            cls.size_min = 2
+            cls.size_min = 4
             cls.size_max = 23
         
         cls.enable_extra_tests = enable_extra_tests
@@ -59,12 +59,9 @@ class TestTransposeOperator(object):
             random.shuffle(all_axes)
             all_axes = all_axes[:min(naxes,len(all_axes))]
         
-        if dtype is None:
-            types = [#np.int8, np.int16, np.int32, np.int64,
-                     #np.uint8, np.uint16, np.uint32, np.uint64,
-                     #np.float32, np.float64,
+        if (dtype is None):
+            types = [np.float32, np.float64,
                      np.complex64, np.complex128]
-            random.shuffle(types)
             dtype = types[0]
         
         domain = Box(length=(1.0,)*dim)
@@ -129,8 +126,15 @@ class TestTransposeOperator(object):
         
         refout = tuple(df.copy() for df in dfout.buffers)
 
-        for in_,out_ in zip(refin, refout):
-            assert np.all(out_ == np.transpose(in_, axes=axes))
+        for i,(in_,out_) in enumerate(zip(refin, refout)):
+            ref = np.transpose(in_, axes=axes)
+            if (ref!=out_).any():
+                print
+                print np.transpose(in_, axes=axes)
+                print
+                print out_
+                msg = 'Reference did not match numpy for component {}.'.format(i)
+                raise RuntimeError(msg)
         
         def iter_impl(impl):
             base_kwds = dict(fields=fin, output_fields=fout, variables=variables,
@@ -196,37 +200,37 @@ class TestTransposeOperator(object):
             raise RuntimeError(msg) 
 
 
-    def test_2d_out_of_place(self):
-        self._test(dim=2, dtype=None, is_inplace=False)
-    def test_3d_out_of_place(self):
-        self._test(dim=3, dtype=None, is_inplace=False)
-    def test_4d_out_of_place(self):
-        self._test(dim=4, dtype=None, is_inplace=False)
+    def test_2d_out_of_place(self, **kwds):
+        self._test(dim=2, is_inplace=False, **kwds)
+    def test_3d_out_of_place(self, **kwds):
+        self._test(dim=3, is_inplace=False, **kwds)
+    def test_4d_out_of_place(self, **kwds):
+        self._test(dim=4, is_inplace=False, **kwds)
     def test_upper_dimensions_out_of_place(self):
         for i in xrange(5,9):
             self._test(dim=i, dtype=None, is_inplace=False,
                     size_min=3, size_max=4, naxes=1)
     
-    def test_2d_inplace(self):
-        self._test(dim=2, dtype=None, is_inplace=True)
-    def test_3d_inplace(self):
-        self._test(dim=3, dtype=None, is_inplace=True)
-    def test_4d_inplace(self):
-        self._test(dim=4, dtype=None, is_inplace=True)
+    def test_2d_inplace(self, **kwds):
+        self._test(dim=2, is_inplace=True, **kwds)
+    def test_3d_inplace(self, **kwds):
+        self._test(dim=3, is_inplace=True, **kwds)
+    def test_4d_inplace(self, **kwds):
+        self._test(dim=4, is_inplace=True, **kwds)
     def test_upper_dimensions_inplace(self):
         for i in xrange(5,9):
             self._test(dim=i, dtype=None, is_inplace=True,
                     size_min=3, size_max=4, naxes=1)
 
     def perform_tests(self):
-        self.test_2d_out_of_place()
-        self.test_3d_out_of_place()
+        self.test_2d_out_of_place(dtype=np.float32)
+        self.test_3d_out_of_place(dtype=np.complex64)
         if __ENABLE_LONG_TESTS__:
             self.test_4d_out_of_place()
             self.test_upper_dimensions_out_of_place()
         
-        self.test_2d_inplace()
-        self.test_3d_inplace()
+        self.test_2d_inplace(dtype=np.float64)
+        self.test_3d_inplace(dtype=np.complex128)
         if __ENABLE_LONG_TESTS__:
             self.test_4d_inplace()
             self.test_upper_dimensions_inplace()