diff --git a/ci/docker_images/ubuntu/bionic/Dockerfile b/ci/docker_images/ubuntu/bionic/Dockerfile index 872147e972dbc3e5ca3e3f3cbe62baab1cc1442a..0bca021dbc3c8798dfe7d7469aa7c6411c40b13c 100644 --- a/ci/docker_images/ubuntu/bionic/Dockerfile +++ b/ci/docker_images/ubuntu/bionic/Dockerfile @@ -220,6 +220,20 @@ RUN cd /tmp \ && cd - \ && rm -Rf /tmp/pyFFTW +# HPTT (tensor permutation library) +RUN cd /tmp + && git clone https://gitlab.com/keckj/hptt \ + && cd hptt \ + && mkdir build \ + && cd build \ + && cmake -DCMAKE_BUILD_TYPE=Release .. \ + && make -j8 \ + && make install \ + && cd ../pythonAPI \ + && pip install --upgrade . \ + && cd /tmp \ + && rm -Rf /tmp/hptt + # ensure all libraries are known by the runtime linker RUN ldconfig diff --git a/hysop/backend/device/opencl/autotunable_kernels/transpose.py b/hysop/backend/device/opencl/autotunable_kernels/transpose.py index d9a16ea93ab765e5921216fd3d3f5ae3a06a1eac..1dbd3fdb78a802c946817ba22653ef3b891f0d63 100644 --- a/hysop/backend/device/opencl/autotunable_kernels/transpose.py +++ b/hysop/backend/device/opencl/autotunable_kernels/transpose.py @@ -156,7 +156,8 @@ class OpenClAutotunableTransposeKernel(OpenClAutotunableKernel): tile_sizes = tuple( int((2**i)*(3**j)) for (i,j) in it.product(range(0,imax+1), range(0,jmax+1))) tile_sizes = (max_tile_size,) + tuple(sorted(tile_sizes, reverse=True)) - tile_sizes = tuple(filter(lambda x: (x>=8) and (x<=max_tile_size), tile_sizes)) + tile_sizes = tuple(filter(lambda x: (x>=max_tile_size//8) and (x<=max_tile_size), tile_sizes)) + params.register_extra_parameter('vectorization', vectorization) params.register_extra_parameter('use_diagonal_coordinates', use_diagonal_coordinates) diff --git a/hysop/backend/host/python/operator/transpose.py b/hysop/backend/host/python/operator/transpose.py index e7e770609b9941397689e473b697edc5288ccd76..840ac9c97c7edcc481d596cee270ec8f094e6405 100644 --- a/hysop/backend/host/python/operator/transpose.py +++ b/hysop/backend/host/python/operator/transpose.py @@ -1,11 +1,21 @@ -from hysop.deps import np - from hysop.constants import MemoryOrdering from hysop.tools.decorators import debug, profile from hysop.backend.host.host_operator import HostOperator from hysop.operator.base.transpose_operator import TransposeOperatorBase from hysop.core.graph.graph import op_apply +import numpy as np +try: + import hptt + HAS_HPTT=True + # required version is: https://gitlab.com/keckj/hptt +except ImportError: + HAS_HPTT=False + import warnings + from hysop.tools.warning import HysopPerformanceWarning + msg='Failed to import HPTT module, falling back to slow numpy transpose. Required version is available at https://gitlab.com/keckj/hptt.' + warnings.warn(msg, HysopPerformanceWarning) + class PythonTranspose(TransposeOperatorBase, HostOperator): """ Inplace and out of place field transposition and permutations in general. @@ -17,6 +27,21 @@ class PythonTranspose(TransposeOperatorBase, HostOperator): """Initialize a Transpose operator on the python backend.""" super(PythonTranspose, self).__init__(**kwds) + + def discretize(self): + super(PythonTranspose, self).discretize() + assert self.din.dtype == self.dout.dtype + dtype = self.din.dtype + if HAS_HPTT and (dtype in (np.float32, np.float64, np.complex64, np.complex128)): + if self.is_inplace: + self.exec_transpose = self.transpose_hptt_inplace + else: + self.exec_transpose = self.transpose_hptt_outofplace + else: + if self.is_inplace: + self.exec_transpose = self.transpose_np_inplace + else: + self.exec_transpose = self.transpose_np_outofplace @debug def get_field_requirements(self): @@ -27,26 +52,44 @@ class PythonTranspose(TransposeOperatorBase, HostOperator): (field, td, req) = reqs req.memory_order = MemoryOrdering.ANY return requirements + + @classmethod + def supports_mpi(cls): + return True @op_apply def apply(self, **kwds): """ Transpose in or out of place.""" super(PythonTranspose,self).apply(**kwds) + self.exec_transpose(**kwds) + + def transpose_hptt_inplace(self, **kwds): + axes = self.axes + din, dout, dtmp = self.din, self.dout, self.dtmp.handle.view(np.ndarray) + assert self.din.dfield is self.dout.dfield + for i in xrange(din.nb_components): + hptt.transpose(a=din.buffers[i], out=dtmp, axes=axes) + dout.buffers[i][...] = dtmp + + def transpose_hptt_outofplace(self, **kwds): + axes = self.axes + din, dout = self.din, self.dout + assert self.din.dfield is not self.dout.dfield + for i in xrange(din.nb_components): + hptt.transpose(a=din.buffers[i], out=dout.buffers[i], axes=axes) + def transpose_np_inplace(self, **kwds): axes = self.axes - - if self.is_inplace: - din, dout, dtmp = self.din, self.dout, self.dtmp.handle.view(np.ndarray) - assert self.din.dfield is self.dout.dfield - for i in xrange(din.nb_components): - dtmp[...] = np.transpose(din.buffers[i], axes=axes) - dout.buffers[i][...] = dtmp - else: - din, dout = self.din, self.dout - assert self.din.dfield is not self.dout.dfield - for i in xrange(din.nb_components): - dout.buffers[i][...] = np.transpose(din.buffers[i], axes=axes) - - @classmethod - def supports_mpi(cls): - return True + din, dout, dtmp = self.din, self.dout, self.dtmp.handle.view(np.ndarray) + assert self.din.dfield is self.dout.dfield + for i in xrange(din.nb_components): + dtmp[...] = np.transpose(din.buffers[i], axes=axes) + dout.buffers[i][...] = dtmp + + def transpose_np_outofplace(self, **kwds): + axes = self.axes + din, dout = self.din, self.dout + assert self.din.dfield is not self.dout.dfield + for i in xrange(din.nb_components): + dout.buffers[i][...] = np.transpose(din.buffers[i], axes=axes) + diff --git a/hysop/operator/tests/test_transpose.py b/hysop/operator/tests/test_transpose.py index 655f2a73b110f083eeaebd47c5a330aba4183304..ce3da5a76ea3fd88d7d025805abbebf4c66c4279 100644 --- a/hysop/operator/tests/test_transpose.py +++ b/hysop/operator/tests/test_transpose.py @@ -21,10 +21,10 @@ class TestTransposeOperator(object): IO.set_default_path('/tmp/hysop_tests/test_transpose') if enable_debug_mode: - cls.size_min = 2 - cls.size_max = 6 + cls.size_min = 3 + cls.size_max = 4 else: - cls.size_min = 2 + cls.size_min = 4 cls.size_max = 23 cls.enable_extra_tests = enable_extra_tests @@ -59,12 +59,9 @@ class TestTransposeOperator(object): random.shuffle(all_axes) all_axes = all_axes[:min(naxes,len(all_axes))] - if dtype is None: - types = [#np.int8, np.int16, np.int32, np.int64, - #np.uint8, np.uint16, np.uint32, np.uint64, - #np.float32, np.float64, + if (dtype is None): + types = [np.float32, np.float64, np.complex64, np.complex128] - random.shuffle(types) dtype = types[0] domain = Box(length=(1.0,)*dim) @@ -129,8 +126,15 @@ class TestTransposeOperator(object): refout = tuple(df.copy() for df in dfout.buffers) - for in_,out_ in zip(refin, refout): - assert np.all(out_ == np.transpose(in_, axes=axes)) + for i,(in_,out_) in enumerate(zip(refin, refout)): + ref = np.transpose(in_, axes=axes) + if (ref!=out_).any(): + print + print np.transpose(in_, axes=axes) + print + print out_ + msg = 'Reference did not match numpy for component {}.'.format(i) + raise RuntimeError(msg) def iter_impl(impl): base_kwds = dict(fields=fin, output_fields=fout, variables=variables, @@ -196,37 +200,37 @@ class TestTransposeOperator(object): raise RuntimeError(msg) - def test_2d_out_of_place(self): - self._test(dim=2, dtype=None, is_inplace=False) - def test_3d_out_of_place(self): - self._test(dim=3, dtype=None, is_inplace=False) - def test_4d_out_of_place(self): - self._test(dim=4, dtype=None, is_inplace=False) + def test_2d_out_of_place(self, **kwds): + self._test(dim=2, is_inplace=False, **kwds) + def test_3d_out_of_place(self, **kwds): + self._test(dim=3, is_inplace=False, **kwds) + def test_4d_out_of_place(self, **kwds): + self._test(dim=4, is_inplace=False, **kwds) def test_upper_dimensions_out_of_place(self): for i in xrange(5,9): self._test(dim=i, dtype=None, is_inplace=False, size_min=3, size_max=4, naxes=1) - def test_2d_inplace(self): - self._test(dim=2, dtype=None, is_inplace=True) - def test_3d_inplace(self): - self._test(dim=3, dtype=None, is_inplace=True) - def test_4d_inplace(self): - self._test(dim=4, dtype=None, is_inplace=True) + def test_2d_inplace(self, **kwds): + self._test(dim=2, is_inplace=True, **kwds) + def test_3d_inplace(self, **kwds): + self._test(dim=3, is_inplace=True, **kwds) + def test_4d_inplace(self, **kwds): + self._test(dim=4, is_inplace=True, **kwds) def test_upper_dimensions_inplace(self): for i in xrange(5,9): self._test(dim=i, dtype=None, is_inplace=True, size_min=3, size_max=4, naxes=1) def perform_tests(self): - self.test_2d_out_of_place() - self.test_3d_out_of_place() + self.test_2d_out_of_place(dtype=np.float32) + self.test_3d_out_of_place(dtype=np.complex64) if __ENABLE_LONG_TESTS__: self.test_4d_out_of_place() self.test_upper_dimensions_out_of_place() - self.test_2d_inplace() - self.test_3d_inplace() + self.test_2d_inplace(dtype=np.float64) + self.test_3d_inplace(dtype=np.complex128) if __ENABLE_LONG_TESTS__: self.test_4d_inplace() self.test_upper_dimensions_inplace()