diff --git a/examples/example_utils.py b/examples/example_utils.py index 73e0c55b870c7665514ad7b14e7d3f71ceadf60f..638918a4606d34181d364591d7117ed35fd42d74 100644 --- a/examples/example_utils.py +++ b/examples/example_utils.py @@ -697,6 +697,12 @@ class HysopArgParser(argparse.ArgumentParser): threading.add_argument('--mkl-threads', type=str, default=None, dest='mkl_threads', help='This parameter will set MKL_NUM_THREADS to a custom value (overrides --max-threads).') + threading.add_argument('--mkl-domain-threads', type=str, default=None, + dest='mkl_domain_threads', + help='This parameter will set MKL_DOMAIN_NUM_THREADS to a custom value (overrides --max-threads).') + threading.add_argument('--mkl-threading-layer', type=str, default='TBB', + dest='mkl_threading_layer', + help="This parameter will set MKL_THREADING_LAYER to a custom value ('TBB', 'GNU', 'INTEL', 'SEQUENTIAL').") threading.add_argument('--numba-threads', type=str, default=None, dest='numba_threads', help='This parameter will set NUMBA_NUM_THREADS to a custom value (overrides --max-threads).') @@ -753,9 +759,9 @@ class HysopArgParser(argparse.ArgumentParser): return opencl def _check_threading_args(self, args): - self._check_default(args, ('enable_threading', 'max_threads', 'numba_threading_layer', + self._check_default(args, ('enable_threading', 'max_threads', 'mkl_threading_layer', 'numba_threading_layer', 'fftw_planner_effort', 'fftw_planner_timelimit'), str, allow_none=False) - self._check_default(args, ('openmp_threads', 'mkl_threads', 'numba_threads', 'fftw_threads'), + self._check_default(args, ('openmp_threads', 'mkl_threads', 'mkl_domain_threads', 'numba_threads', 'fftw_threads'), str, allow_none=True) args.enable_threading = self._convert_bool('enable_threading', args.enable_threading) @@ -766,6 +772,8 @@ class HysopArgParser(argparse.ArgumentParser): for argname in ('openmp_threads', 'mkl_threads', 'numba_threads', 'fftw_threads'): setattr(args, argname, self._convert_threads(argname, getattr(args, argname), default=args.max_threads)) + args.mkl_threading_layer = self._convert_mkl_threading_layer('mkl_threading_layer', + args.mkl_threading_layer) args.numba_threading_layer = self._convert_numba_threading_layer('numba_threading_layer', args.numba_threading_layer) args.fftw_planner_effort = self._convert_fftw_planner_effort('fftw_planner_effort', @@ -1422,6 +1430,8 @@ class HysopArgParser(argparse.ArgumentParser): # those environment variables are not part of HySoP self.set_env('OMP_NUM_THREADS', args.openmp_threads, False) self.set_env('MKL_NUM_THREADS', args.mkl_threads, False) + self.set_env('MKL_DOMAIN_NUM_THREADS', args.mkl_domain_threads, False) + self.set_env('MKL_THREADING_LAYER', args.mkl_threading_layer, False) self.set_env('NUMBA_NUM_THREADS', args.numba_threads, False) self.set_env('NUMBA_THREADING_LAYER', args.numba_threading_layer, False) @@ -1531,6 +1541,15 @@ class HysopArgParser(argparse.ArgumentParser): } return self._check_convert(argname, val, values) + def _convert_mkl_threading_layer(self, argname, val): + values = { + 'seq': 'SEQUENTIAL', + 'omp': 'OMP', + 'tbb': 'TBB', + 'intel': 'INTEL' + } + return self._check_convert(argname, val, values) + def _convert_numba_threading_layer(self, argname, val): values = { 'workqueue': 'workqueue', diff --git a/examples/shear_layer/shear_layer.py b/examples/shear_layer/shear_layer.py index 75d894648d74abb582af8ee22a6565e03f2adc26..520d42899942b7a3868b3961a7a9279df2dc4433 100644 --- a/examples/shear_layer/shear_layer.py +++ b/examples/shear_layer/shear_layer.py @@ -124,10 +124,16 @@ def compute(args): variables={velo:npts, vorti: npts}, projection=args.reprojection_frequency, diffusion=nu, dt=dt, - implementation=impl, **extra_op_kwds) + implementation=impl, + enforce_implementation=args.enforce_implementation, + **extra_op_kwds) #> We ask to dump the inputs and the outputs of this operator - poisson.dump_outputs(fields=(vorti,), frequency=args.dump_freq, **extra_op_kwds) - poisson.dump_outputs(fields=(velo,), frequency=args.dump_freq, **extra_op_kwds) + poisson.dump_outputs(fields=(vorti,), + io_params=args.io_params.clone(filename='vorti'), + **extra_op_kwds) + poisson.dump_outputs(fields=(velo,), + io_params=args.io_params.clone(filename='velo'), + **extra_op_kwds) #> Operator to compute the infinite norm of the velocity if (impl is Implementation.FORTRAN): @@ -217,7 +223,7 @@ if __name__=='__main__': description+='orientation of the shear layers is made.' description+='\n' description+='\nEach of the shear layers rolls up in a single vortex ' - description+='as the flow evolves.' + description+='as the flow eolves.' description+='\n' description+='\nDefault example parameters depends on the chosen case:' description+='\n CASE 0 1 2' @@ -282,6 +288,7 @@ if __name__=='__main__': self._check_positive(args, 'plot_freq', strict=False, allow_none=False) def _setup_parameters(self, args): + super(ShearLayerArgParser, self)._setup_parameters(args) from hysop.tools.types import first_not_None case = args.case diff --git a/examples/taylor_green/taylor_green.py b/examples/taylor_green/taylor_green.py index 3192c0288b4aa7fd229431f4f101de05d3b6c7bc..a8d07d76fa12605817c88997ce6bb65daedb8239 100644 --- a/examples/taylor_green/taylor_green.py +++ b/examples/taylor_green/taylor_green.py @@ -34,8 +34,7 @@ def compute(args): ViscosityParameter from hysop.constants import Implementation, AdvectionCriteria, StretchingCriteria, Backend - from hysop.operators import DirectionalAdvection, DirectionalStretchingDiffusion, \ - DirectionalDiffusion, DirectionalStretching, \ + from hysop.operators import DirectionalAdvection, DirectionalStretching, \ StaticDirectionalStretching, Diffusion, \ PoissonCurl, AdaptiveTimeStep, HDF_Writer, \ Enstrophy, MinMaxFieldStatistics, StrangSplitting, \ @@ -138,6 +137,7 @@ def compute(args): # plot_energy=IOParams(filepath=spectral_path, filename='E_{fname}_{ite}', frequency=args.dump_freq), # plot_input_vorticity_energy=-1, # <= disable a specific plot # plot_output_vorticity_energy=-1, # <= disable a specific plot + enforce_implementation=args.enforce_implementation, implementation=impl, **extra_op_kwds) #> We ask to dump the outputs of this operator dump_fields = HDF_Writer(name='fields', diff --git a/hysop/__init__.py.in b/hysop/__init__.py.in index aefefc7958f7ccd73decc85b39f9dc76f07e14a6..701856c217556b09b0468e89f46aa8d91b5651e2 100644 --- a/hysop/__init__.py.in +++ b/hysop/__init__.py.in @@ -2,9 +2,26 @@ Python package dedicated to flow simulation using particular methods on hybrid architectures (MPI-GPU) """ -import psutil +import psutil, signal, traceback, threading, sys, os, warnings from functools import wraps -from hysop.deps import __builtin__, print_function, os, sys, warnings, traceback +from hysop.deps import __builtin__, print_function + +# Register debug signals (SIGUSR1(10)=print the main stack, SIGUSR2(12)=print the stack of all threads) +def dumpstack(signal, frame): + traceback.print_stack() +def dumpstacks(signal, frame): + #https://stackoverflow.com/questions/132058/showing-the-stack-trace-from-a-running-python-application + id2name = dict([(th.ident, th.name) for th in threading.enumerate()]) + code = [] + for threadId, stack in sys._current_frames().items(): + code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""), threadId)) + for filename, lineno, name, line in traceback.extract_stack(stack): + code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) + if line: + code.append(" %s" % (line.strip())) + print("\n".join(code)) +signal.signal(signal.SIGUSR1, dumpstack) +signal.signal(signal.SIGUSR2, dumpstacks) def get_env(target, default_value): target = 'HYSOP_{}'.format(target) @@ -55,7 +72,8 @@ __ENABLE_LONG_TESTS__ = get_env('ENABLE_LONG_TESTS', ("@ENABLE_LONG_TESTS@" is " __ENABLE_THREADING__ = get_env('ENABLE_THREADING', True) __MAX_THREADS__ = int(get_env('MAX_THREADS', psutil.cpu_count(logical=False)) if __ENABLE_THREADING__ else 1) set_env('OMP_NUM_THREADS', __MAX_THREADS__) -set_env('MKL_NUM_THREADS', __MAX_THREADS__) +set_env('MKL_NUM_THREADS', 1) +set_env('MKL_THREADING_LAYER', 'TBB' if __ENABLE_THREADING__ else 'SEQUENTIAL') set_env('NUMBA_NUM_THREADS', __MAX_THREADS__) set_env('NUMBA_THREADING_LAYER', 'workqueue') # Use 'numba -s' to list support __DEFAULT_NUMBA_TARGET__ = ('parallel' if __ENABLE_THREADING__ else 'cpu') @@ -75,6 +93,8 @@ if __MPI_ENABLED__: shm_rank, intershm_size else: main_rank, main_size = 0, 1 + interhost_size, intershm_size = 1, 1 + host_rank, shm_rank = 0, 0 # define printing functions def print(*args, **kargs): @@ -169,7 +189,14 @@ msg_threads = \ HYSOP_MAX_THREADS: {} -------------------------------- OMP_NUM_THREADS: {} + OMP_DYNAMIC: {} + OMP_NESTED: {} + OMP_MAX_ACTIVE_LEVELS: {} + -------------------------------- + MKL_THREADING_LAYER: {} MKL_NUM_THREADS: {} + MKL_DOMAIN_NUM_THREADS: {} + MKL_DYNAMIC: {} -------------------------------- DEFAULT_NUMBA_TARGET: {} NUMBA_THREADING_LAYER: {} @@ -182,11 +209,17 @@ msg_threads = \ '''.format( __ENABLE_THREADING__, __MAX_THREADS__, - os.environ['OMP_NUM_THREADS'], - os.environ['MKL_NUM_THREADS'], + os.environ.get('OMP_NUM_THREADS', None), + os.environ.get('OMP_DYNAMIC', None), + os.environ.get('OMP_NESTED', None), + os.environ.get('OMP_MAX_ACTIVE_LEVELS', None), + os.environ.get('MKL_THREADING_LAYER', None), + os.environ.get('MKL_NUM_THREADS', None), + os.environ.get('MKL_DOMAIN_NUM_THREADS', None), + os.environ.get('MKL_DYNAMIC', None), __DEFAULT_NUMBA_TARGET__, - os.environ['NUMBA_THREADING_LAYER'], - os.environ['NUMBA_NUM_THREADS'], + os.environ.get('NUMBA_THREADING_LAYER', None), + os.environ.get('NUMBA_NUM_THREADS', None), __FFTW_NUM_THREADS__, __FFTW_PLANNER_EFFORT__, __FFTW_PLANNER_TIMELIMIT__) diff --git a/hysop/backend/device/opencl/opencl_array.py b/hysop/backend/device/opencl/opencl_array.py index 39cf0e7a57a41d90c22917ccb5f7c88e89b5a1f5..42df5ec55a7474bf3d93c4a8cb4da618f547881c 100644 --- a/hysop/backend/device/opencl/opencl_array.py +++ b/hysop/backend/device/opencl/opencl_array.py @@ -78,9 +78,12 @@ class OpenClArray(Array): alignment = self.backend.device.mem_base_addr_align if (offset % alignment) == 0: # try to return a subbuffer - buf = self.base_data[offset:] - buf.__parent = self.base_data - return buf + try: + buf = self.base_data[offset:] + buf.__parent = self.base_data + return buf + except: + raise clArray.ArrayHasOffsetError else: raise diff --git a/hysop/backend/host/python/operator/diffusion.py b/hysop/backend/host/python/operator/diffusion.py index dac0916f143323db0ff5af21ecbcf765a609ed5b..d40c8e1dead53e1fd219083bfc684305eac64940 100644 --- a/hysop/backend/host/python/operator/diffusion.py +++ b/hysop/backend/host/python/operator/diffusion.py @@ -7,7 +7,7 @@ from hysop.tools.types import check_instance, first_not_None from hysop.tools.decorators import debug from hysop.tools.numpywrappers import npw from hysop.tools.numerics import is_complex, complex_to_float_dtype -from hysop.tools.numba_utils import make_numba_signature +from hysop.tools.numba_utils import make_numba_signature, prange from hysop.backend.host.host_operator import HostOperator, OpenClMappable from hysop.core.graph.graph import op_apply from hysop.fields.continuous_field import Field @@ -42,7 +42,7 @@ class PythonDiffusion(DiffusionOperatorBase, OpenClMappable, HostOperator): '(n,m),(n),(m),()->(n,m)', target=target, nopython=True, cache=True) def filter_diffusion_2d(Fin, K0, K1, nu_dt, Fout): - for i in range(Fin.shape[0]): + for i in prange(Fin.shape[0]): for j in range(Fin.shape[1]): Fout[i,j] /= (1 - nu_dt*(K0[i] + K1[j])) F = filter_diffusion_2d @@ -51,8 +51,8 @@ class PythonDiffusion(DiffusionOperatorBase, OpenClMappable, HostOperator): '(n,m,p),(n),(m),(p),()->(n,m,p)', target=target, nopython=True, cache=True) def filter_diffusion_3d(Fin, K0, K1, K2, nu_dt, Fout): - for i in range(Fin.shape[0]): - for j in range(Fin.shape[1]): + for i in prange(Fin.shape[0]): + for j in prange(Fin.shape[1]): for k in range(Fin.shape[2]): Fout[i,j,k] /= (1 - nu_dt*(K0[i] + K1[j] + K2[k])) F = filter_diffusion_3d @@ -61,9 +61,9 @@ class PythonDiffusion(DiffusionOperatorBase, OpenClMappable, HostOperator): '(n,m,p,q),(n),(m),(p),(q),()->(n,m,p,q)', target=target, nopython=True, cache=True) def filter_diffusion_4d(Fin, K0, K1, K2, K3, nu_dt, Fout): - for i in range(Fin.shape[0]): - for j in range(Fin.shape[1]): - for k in range(Fin.shape[2]): + for i in prange(Fin.shape[0]): + for j in prange(Fin.shape[1]): + for k in prange(Fin.shape[2]): for l in range(Fin.shape[3]): Fout[i,j,k,l] /= (1 - nu_dt*(K0[i] + K1[j] + K2[k] + K3[l])) else: diff --git a/hysop/backend/host/python/operator/poisson.py b/hysop/backend/host/python/operator/poisson.py index 1472ac28ea954977ef633e5ec1eeecbb4c0bab99..ae0536e56548413eb9495e02d99e293c5051e675 100644 --- a/hysop/backend/host/python/operator/poisson.py +++ b/hysop/backend/host/python/operator/poisson.py @@ -6,7 +6,7 @@ from hysop import __DEFAULT_NUMBA_TARGET__ from hysop.tools.types import check_instance, first_not_None from hysop.tools.decorators import debug from hysop.tools.numpywrappers import npw -from hysop.tools.numba_utils import make_numba_signature +from hysop.tools.numba_utils import make_numba_signature, prange from hysop.backend.host.host_operator import HostOperator, OpenClMappable from hysop.core.graph.graph import op_apply from hysop.fields.continuous_field import Field @@ -38,7 +38,7 @@ class PythonPoisson(PoissonOperatorBase, OpenClMappable, HostOperator): '(n,m),(n),(m)->(n,m)', target=target, nopython=True, cache=True) def filter_poisson_2d(Fin, K0, K1, Fout): - for i in range(1, Fin.shape[0]): + for i in prange(1, Fin.shape[0]): for j in range(0, Fin.shape[1]): Fout[i,j] /= (K0[i] + K1[j]) for j in range(1, Fin.shape[1]): @@ -50,11 +50,11 @@ class PythonPoisson(PoissonOperatorBase, OpenClMappable, HostOperator): '(n,m,p),(n),(m),(p)->(n,m,p)', target=target, nopython=True, cache=True) def filter_poisson_3d(Fin, K0, K1, K2, Fout): - for i in range(1, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(1, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] /= (K0[i] + K1[j] + K2[k]) - for j in range(1, Fin.shape[1]): + for j in prange(1, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[0,j,k] /= (K0[0] + K1[j] + K2[k]) for k in range(1, Fin.shape[2]): @@ -66,16 +66,16 @@ class PythonPoisson(PoissonOperatorBase, OpenClMappable, HostOperator): '(n,m,p,q),(n),(m),(p),(q)->(n,m,p,q)', target=target, nopython=True, cache=True) def filter_poisson_4d(Fin, K0, K1, K2, K3, Fout): - for i in range(1, Fin.shape[0]): - for j in range(0, Fin.shape[1]): - for k in range(0, Fin.shape[2]): + for i in prange(1, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): + for k in prange(0, Fin.shape[2]): for l in range(0, Fin.shape[3]): Fout[i,j,k,l] /= (K0[i] + K1[j] + K2[k] + K3[l]) - for j in range(1, Fin.shape[1]): - for k in range(0, Fin.shape[2]): + for j in prange(1, Fin.shape[1]): + for k in prange(0, Fin.shape[2]): for l in range(0, Fin.shape[3]): Fout[0,j,k,l] /= (K0[0] + K1[j] + K2[k] + K3[l]) - for k in range(1, Fin.shape[2]): + for k in prange(1, Fin.shape[2]): for l in range(0, Fin.shape[3]): Fout[0,0,k,l] /= (K0[0] + K1[0] + K2[k] + K3[l]) for l in range(1, Fin.shape[3]): diff --git a/hysop/backend/host/python/operator/poisson_curl.py b/hysop/backend/host/python/operator/poisson_curl.py index 13e80586673b405bb3c7f797ba6042fffbd64ea9..5e724b48c35a283de97d93325cf4f7224397c2ba 100644 --- a/hysop/backend/host/python/operator/poisson_curl.py +++ b/hysop/backend/host/python/operator/poisson_curl.py @@ -5,7 +5,7 @@ from hysop import __DEFAULT_NUMBA_TARGET__ from hysop.tools.types import check_instance, first_not_None from hysop.tools.decorators import debug from hysop.tools.numpywrappers import npw -from hysop.tools.numba_utils import make_numba_signature +from hysop.tools.numba_utils import make_numba_signature, prange from hysop.core.graph.graph import op_apply from hysop.backend.host.host_operator import HostOperator, OpenClMappable from hysop.operator.base.poisson_curl import SpectralPoissonCurlOperatorBase @@ -28,7 +28,7 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_2d__0_m(Fin, K1, Fout): - for i in range(0, Fin.shape[0]): + for i in prange(0, Fin.shape[0]): for j in range(0, Fin.shape[1]): Fout[i,j] = -K1[j]*Fin[i,j] return functools.partial(filter_curl_2d__0_m, *args) @@ -42,7 +42,7 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_2d__1_n(Fin, K0, Fout): - for i in range(0, Fin.shape[0]): + for i in prange(0, Fin.shape[0]): for j in range(0, Fin.shape[1]): Fout[i,j] = +K0[i]*Fin[i,j] return functools.partial(filter_curl_2d__1_n, *args) @@ -55,8 +55,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_3d__0_n(Fin, K, Fout): - for i in range(0, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(0, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] = -K[i]*Fin[i,j,k] return functools.partial(filter_curl_3d__0_n, *args) @@ -69,8 +69,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_3d__0_m(Fin, K, Fout): - for i in range(0, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(0, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] = -K[j]*Fin[i,j,k] return functools.partial(filter_curl_3d__0_m, *args) @@ -83,8 +83,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_3d__0_p(Fin, K, Fout): - for i in range(0, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(0, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] = -K[k]*Fin[i,j,k] return functools.partial(filter_curl_3d__0_p, *args) @@ -97,8 +97,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_3d__1_n(Fin, K, Fout): - for i in range(0, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(0, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] += K[i]*Fin[i,j,k] return functools.partial(filter_curl_3d__1_n, *args) @@ -111,8 +111,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_3d__1_m(Fin, K, Fout): - for i in range(0, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(0, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] += K[j]*Fin[i,j,k] return functools.partial(filter_curl_3d__1_m, *args) @@ -125,8 +125,8 @@ class PythonPoissonCurl(SpectralPoissonCurlOperatorBase, OpenClMappable, HostOpe @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def filter_curl_3d__1_p(Fin, K, Fout): - for i in range(0, Fin.shape[0]): - for j in range(0, Fin.shape[1]): + for i in prange(0, Fin.shape[0]): + for j in prange(0, Fin.shape[1]): for k in range(0, Fin.shape[2]): Fout[i,j,k] += K[k]*Fin[i,j,k] return functools.partial(filter_curl_3d__1_p, *args) diff --git a/hysop/backend/host/python/operator/solenoidal_projection.py b/hysop/backend/host/python/operator/solenoidal_projection.py index c131813b96f2e2fa82378186b07fcd60fde84ff9..a73a02da477b7f7f5ca004e553a09a49167e6ea6 100644 --- a/hysop/backend/host/python/operator/solenoidal_projection.py +++ b/hysop/backend/host/python/operator/solenoidal_projection.py @@ -9,7 +9,7 @@ from hysop.tools.numpywrappers import npw from hysop.backend.host.host_operator import HostOperator, OpenClMappable from hysop.core.graph.graph import op_apply from hysop.operator.base.solenoidal_projection import SolenoidalProjectionOperatorBase -from hysop.tools.numba_utils import make_numba_signature +from hysop.tools.numba_utils import make_numba_signature, prange class PythonSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClMappable, HostOperator): """ @@ -39,8 +39,8 @@ class PythonSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClMappabl KK10, KK11, KK12, KK20, KK21, KK22, Fout0, Fout1, Fout2): - for i in range(0, Fin0.shape[0]): - for j in range(0, Fin0.shape[1]): + for i in prange(0, Fin0.shape[0]): + for j in prange(0, Fin0.shape[1]): for k in range(0, Fin0.shape[2]): F0 = Fin0[i,j,k] F1 = Fin1[i,j,k] @@ -73,8 +73,8 @@ class PythonSolenoidalProjection(SolenoidalProjectionOperatorBase, OpenClMappabl @nb.guvectorize([signature], layout, target=target, nopython=True, cache=True) def compute_div_3d(Fin0, Fin1, Fin2, K0, K1, K2, Fout): - for i in range(0, Fin0.shape[0]): - for j in range(0, Fin0.shape[1]): + for i in prange(0, Fin0.shape[0]): + for j in prange(0, Fin0.shape[1]): for k in range(0, Fin0.shape[2]): Fout[i,j,k] = (K0[i]*Fin0[i,j,k] + K1[j]*Fin1[i,j,k] + K2[k]*Fin2[i,j,k]) diff --git a/hysop/numerics/fft/_mkl_fft.py b/hysop/numerics/fft/_mkl_fft.py index 154fea9a7d847e749e702192572955a21414d967..6dadfa2191d6ae41b7edfa044823f6d60329885e 100644 --- a/hysop/numerics/fft/_mkl_fft.py +++ b/hysop/numerics/fft/_mkl_fft.py @@ -1,34 +1,130 @@ """ -FFT interface for fast Fourier Transforms using Intel MKL (FFTW3 wrappers). +FFT interface for fast Fourier Transforms using Intel MKL (numpy interface). :class:`~hysop.numerics.MklFFT` :class:`~hysop.numerics.MklFFTPlan` + +/!\ -- OPENMP CONFLICT WITH GRAPHTOOLS -- +/!\ Only works if MKL_THREADING_LAYER is set to OMP because graph_tools is compiled against GNU OpenMP. +/!\ May also work with MKL_THREADING_LAYER=TBB and SEQUENCIAL but not INTEL. + +Required version of mkl_fft is: https://gitlab.com/keckj/mkl_fft +If MKL_THREADING_LAYER is not set, or is set to INTEL, FFT tests will fail. """ +import functools, warnings import numpy as np -from mkl_fft import _numpy_fft as _FFT +import numba as nb +from mkl_fft import (ifft as mkl_ifft, + fft as mkl_fft, + rfft_numpy as mkl_rfft, + irfft_numpy as mkl_irfft) -from hysop.tools.types import first_not_None from hysop.numerics.fft.host_fft import HostFFTPlanI, HostFFTI, HostArray from hysop.numerics.fft.fft import \ complex_to_float_dtype, float_to_complex_dtype, \ - mk_view, mk_shape + mk_view, mk_shape, simd_alignment + +from hysop import __DEFAULT_NUMBA_TARGET__ +from hysop.numerics.fft.fft import HysopFFTWarning, bytes2str +from hysop.tools.numba_utils import make_numba_signature, prange +from hysop.tools.warning import HysopWarning +from hysop.tools.numerics import get_itemsize +from hysop.tools.types import first_not_None + +class HysopMKLFftWarning(HysopWarning): + pass + +def setup_transform(x, axis, transform, inverse, type): + shape = x.shape + dtype = x.dtype + N = shape[axis] + if inverse: + type = [1,3,2,4][type-1] + if (transform == 'dct'): + ctype = float_to_complex_dtype(x.dtype) + if (type==1): + sin = mk_shape(shape, axis, 2*N-2) + din = dtype + sout = mk_shape(shape, axis, N) + dout = ctype + elif (type==2): + sin = mk_shape(shape, axis, N) + din = dtype + sout = mk_shape(shape, axis, N//2+1) + dout = ctype + elif (type==3): + sin = mk_shape(shape, axis, N//2+1) + din = ctype + sout = mk_shape(shape, axis, N) + dout = dtype + else: + raise NotImplementedError + elif (transform is 'dst'): + ctype = float_to_complex_dtype(x.dtype) + if (type==1): + sin = mk_shape(shape, axis, 2*N+2) + din = dtype + sout = mk_shape(shape, axis, N+2) + dout = ctype + elif (type==2): + sin = mk_shape(shape, axis, N) + din = dtype + sout = mk_shape(shape, axis, N//2+1) + dout = ctype + elif (type==3): + sin = mk_shape(shape, axis, N//2+1) + din = ctype + sout = mk_shape(shape, axis, N) + dout = dtype + else: + raise NotImplementedError + else: + sin = None + din = None + sout = None + dout = None + + return (sin,din,sout,dout) -def dct(a, out=None, type=2, axis=-1): - ndim = a.ndim - shape = a.shape - N = a.shape[axis] +def fft(**kwds): + out = mkl_fft(**kwds) + return out +def ifft(**kwds): + out = mkl_ifft(**kwds) + return out +def rfft(**kwds): + out = mkl_rfft(**kwds) + return out +def irfft(**kwds): + out = mkl_irfft(**kwds) + return out + + +def dct(x, out=None, type=2, axis=-1, input_tmp=None, output_tmp=None): + ndim = x.ndim + shape = x.shape + N = x.shape[axis] + (sin, din, sout, dout) = setup_transform(x, axis, 'dct', False, type) if (type==1): # O(sqrt(log(N))) error, O(2N) complexity, O(4*N) memory + if (input_tmp is None): + input_tmp = np.empty(shape=sin, dtype=din) + if (output_tmp is None): + output_tmp = np.empty(shape=sout, dtype=dout) + assert input_tmp.shape == sin + assert input_tmp.dtype == din + assert output_tmp.shape == sout + assert output_tmp.dtype == dout slc0 = mk_view(ndim, axis, 1, -1) slc1 = mk_view(ndim, axis, None, None, -1) - s0 = mk_shape(shape, axis, 2*N-2) - X = np.empty(shape=s0, dtype=a.dtype) - np.concatenate((a, a[slc0][slc1]), axis=axis, out=X) - res = _FFT.rfft(X, axis=axis).real + np.concatenate((x, x[slc0][slc1]), axis=axis, out=input_tmp) + rfft(x=input_tmp, out=output_tmp, axis=axis) + res = output_tmp.real if (out is None): out = res else: assert out.shape == res.shape + assert out.dtype == res.dtype out[...] = res elif (type==2): # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory @@ -42,20 +138,30 @@ def dct(a, out=None, type=2, axis=-1): slc5 = mk_view(ndim, axis, None, n1, None) slc6 = mk_view(ndim, axis, n1, None, None) - X = np.empty_like(a) - np.concatenate((a[slc1], a[slc2][slc3]), axis=axis, out=X) - X = _FFT.rfft(X, axis=axis) - X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4] + if (input_tmp is None): + input_tmp = np.empty(shape=sin, dtype=din) + if (output_tmp is None): + output_tmp = np.empty(shape=sout, dtype=dout) + assert input_tmp.shape == sin + assert input_tmp.dtype == din + assert output_tmp.shape == sout + assert output_tmp.dtype == dout + + np.concatenate((x[slc1], x[slc2][slc3]), axis=axis, out=input_tmp) + rfft(x=input_tmp, out=output_tmp, axis=axis) + assert output_tmp.shape == sout + assert output_tmp.dtype == dout + output_tmp *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4] if (out is None): - out = np.empty_like(a) + out = np.empty_like(x) else: - assert out.shape == a.shape - out[slc5] = +X.real[slc5] - out[slc6] = -X.imag[slc0][slc3] + assert out.shape == x.shape + assert out.dtype == x.dtype + out[slc5] = +output_tmp.real[slc5] + out[slc6] = -output_tmp.imag[slc0][slc3] elif (type==3): # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory - ctype = float_to_complex_dtype(a.dtype) n0 = N//2 + 1 n1 = (N-1)//2 + 1 slc0 = mk_view(ndim, axis, None, n0, None) @@ -66,21 +172,32 @@ def dct(a, out=None, type=2, axis=-1): slc5 = mk_view(ndim, axis, None, n1, None) slc6 = mk_view(ndim, axis, 1, None, +2) slc7 = mk_view(ndim, axis, None, None, None, default=None) - s0 = mk_shape(shape, axis, n0) - - X = np.zeros(shape=s0, dtype=ctype) - X.real[slc0] = +a[slc0] - X.imag[slc1] = -a[slc2][slc3] - X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc7] - X = _FFT.irfft(X, axis=axis, n=N) - X *= N + slc8 = mk_view(ndim, axis, n0 , None, None) + slc9 = mk_view(ndim, axis, None, 1 , None) + + if (input_tmp is None): + input_tmp = np.empty(shape=sin, dtype=din) + if (output_tmp is None): + output_tmp = np.empty(shape=sout, dtype=dout) + assert input_tmp.shape == sin + assert input_tmp.dtype == din + assert output_tmp.shape == sout + assert output_tmp.dtype == dout + input_tmp.real[slc0] = +x[slc0] + input_tmp.real[slc8] = 0.0 + input_tmp.imag[slc1] = -x[slc2][slc3] + input_tmp.imag[slc9] = 0.0 + input_tmp *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc7] + output_tmp = irfft(x=input_tmp, out=output_tmp, axis=axis, n=N) + output_tmp *= N if (out is None): - out = np.empty_like(a) + out = np.empty_like(x) else: - assert out.shape == a.shape - out[slc4] = X[slc5] - out[slc6] = X[slc2][slc3] + assert out.shape == x.shape + assert out.dtype == x.dtype + out[slc4] = output_tmp[slc5] + out[slc6] = output_tmp[slc2][slc3] else: stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'] msg='DCT-{} has not been implemented yet.' @@ -88,24 +205,34 @@ def dct(a, out=None, type=2, axis=-1): raise NotImplementedError(msg) return out -def dst(a, out=None, type=2, axis=-1): - ndim = a.ndim - shape = a.shape - N = a.shape[axis] +def dst(x, out=None, type=2, axis=-1, input_tmp=None, output_tmp=None): + (sin, din, sout, dout) = setup_transform(x, axis, 'dst', False, type) + ndim = x.ndim + shape = x.shape + N = x.shape[axis] if (type==1): # O(sqrt(log(N))) error, O(2N) complexity, O(4*N) memory slc0 = mk_view(ndim, axis, None, None, -1) slc1 = mk_view(ndim, axis, 1, -1, None) - s0 = mk_shape(shape, axis, 2*N+2) s1 = mk_shape(shape, axis, 1) - X = np.empty(shape=s0, dtype=a.dtype) - Z = np.zeros(shape=s1, dtype=a.dtype) - np.concatenate((Z, -a, Z, a[slc0]), axis=axis, out=X) - res = _FFT.rfft(X, axis=axis).imag + if (input_tmp is None): + input_tmp = np.empty(shape=sin, dtype=din) + if (output_tmp is None): + output_tmp = np.empty(shape=sout, dtype=dout) + assert input_tmp.shape == sin + assert input_tmp.dtype == din + assert output_tmp.shape == sout + assert output_tmp.dtype == dout + + Z = np.zeros(shape=s1, dtype=x.dtype) + np.concatenate((Z, -x, Z, x[slc0]), axis=axis, out=input_tmp) + rfft(x=input_tmp, out=output_tmp, axis=axis) + res = output_tmp.imag if (out is None): - out = np.empty_like(a) + out = np.empty_like(x) else: - assert out.shape == a.shape + assert out.shape == x.shape + assert out.dtype == x.dtype out[...] = res[slc1] elif (type==2): # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory @@ -120,20 +247,29 @@ def dst(a, out=None, type=2, axis=-1): slc6 = mk_view(ndim, axis, n1-1, None, None) slc7 = mk_view(ndim, axis, 1, n1, None) - X = np.empty_like(a) - np.concatenate((a[slc1], -a[slc2][slc3]), axis=axis, out=X) - X = _FFT.rfft(X, axis=axis) - X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4] + if (input_tmp is None): + input_tmp = np.empty(shape=sin, dtype=din) + if (output_tmp is None): + output_tmp = np.empty(shape=sout, dtype=dout) + assert input_tmp.shape == sin + assert input_tmp.dtype == din + assert output_tmp.shape == sout + assert output_tmp.dtype == dout + + np.concatenate((x[slc1], -x[slc2][slc3]), axis=axis, out=input_tmp) + rfft(x=input_tmp, out=output_tmp, axis=axis) + output_tmp *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4] if (out is None): - out = np.empty_like(a) + out = np.empty_like(x) else: - assert out.shape == a.shape - out[slc5] = -X.imag[slc7] - out[slc6] = +X.real[slc3] + assert out.shape == x.shape + assert out.dtype == x.dtype + out[slc5] = -output_tmp.imag[slc7] + out[slc6] = +output_tmp.real[slc3] elif (type==3): # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory - ctype = float_to_complex_dtype(a.dtype) + ctype = float_to_complex_dtype(x.dtype) n0 = N//2 + 1 n1 = (N-1)//2 + 1 slc0 = mk_view(ndim, axis, None, n0, None) @@ -145,21 +281,34 @@ def dst(a, out=None, type=2, axis=-1): slc6 = mk_view(ndim, axis, None, n1, None) slc7 = mk_view(ndim, axis, 1, None, 2) slc8 = mk_view(ndim, axis, n1, None, None) + slc9 = mk_view(ndim, axis, n0, None, None) + slc10 = mk_view(ndim, axis, 0, 1, None) s0 = mk_shape(shape, axis, n0) + + if (input_tmp is None): + input_tmp = np.empty(shape=sin, dtype=din) + if (output_tmp is None): + output_tmp = np.empty(shape=sout, dtype=dout) + assert input_tmp.shape == sin + assert input_tmp.dtype == din + assert output_tmp.shape == sout + assert output_tmp.dtype == dout - X = np.zeros(shape=s0, dtype=ctype) - X.real[slc0] = +a[slc1][slc0] - X.imag[slc2] = -a[slc3] - X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc4] - X = _FFT.irfft(X, axis=axis, n=N) - X[...] *= N + input_tmp.real[slc0] = +x[slc1][slc0] + input_tmp.real[slc9] = 0.0 + input_tmp.imag[slc2] = -x[slc3] + input_tmp.imag[slc10] = 0.0 + input_tmp *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc4] + irfft(x=input_tmp, out=output_tmp, axis=axis, n=N) + output_tmp[...] *= N if (out is None): - out = np.empty_like(a) + out = np.empty_like(x) else: - assert out.shape == a.shape - out[slc5] = +X[slc6] - out[slc7] = -X[slc8][slc1] + assert out.shape == x.shape + assert out.dtype == x.dtype + out[slc5] = +output_tmp[slc6] + out[slc7] = -output_tmp[slc8][slc1] else: stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'] msg='DCT-{} has not been implemented yet.' @@ -167,13 +316,14 @@ def dst(a, out=None, type=2, axis=-1): raise NotImplementedError(msg) return out -def idct(a, out=None, type=2, axis=-1, **kwds): +def idct(x, out=None, type=2, axis=-1, **kwds): itype = [1,3,2,4][type-1] - return dct(a=a, out=out, type=itype, axis=axis, **kwds) + return dct(x=x, out=out, type=itype, axis=axis, **kwds) -def idst(a, out=None, type=2, axis=-1, **kwds): +def idst(x, out=None, type=2, axis=-1, **kwds): itype = [1,3,2,4][type-1] - return dst(a=a, out=out, type=itype, axis=axis, **kwds) + return dst(x=x, out=out, type=itype, axis=axis, **kwds) + class MklFFTPlan(HostFFTPlanI): @@ -181,24 +331,88 @@ class MklFFTPlan(HostFFTPlanI): Wrap a mkl fft call (mkl.fft does not offer real planning capabilities). """ - def __init__(self, fn, a, out, scaling=None, **kwds): + def __init__(self, planner, fn, a, out, axis, scaling=None, **kwds): super(MklFFTPlan, self).__init__() + self.planner = planner self.fn = fn self.a = a self.out = out self.scaling = scaling + (sin, din, sout, dout) = setup_transform(a, axis, + 'dct' if fn in (dct, idct) else 'dst' if fn in (dst, idst) else None, + fn in (idct, idst), + kwds.get('type', None)) + + if (sin is None): + self._required_input_tmp = None + else: + self._required_input_tmp = {'size': np.prod(sin, dtype=np.int64), 'shape':sin, 'dtype':din} + + if (sout is None): + self._required_output_tmp = None + else: + self._required_output_tmp = {'size': np.prod(sout, dtype=np.int64), 'shape':sout, 'dtype':dout} + + self._allocated = False + if isinstance(a, HostArray): a = a.handle if isinstance(out, HostArray): out = out.handle + + self.rescale = self.bake_scaling_plan(out, scaling) + kwds = kwds.copy() - kwds['a'] = a - if fn in (dct, idct, dst, idst): - kwds['out'] = out - self.kwds = kwds + kwds['x'] = a + kwds['out'] = out + kwds['axis'] = axis + self.kwds = kwds + + def bake_scaling_plan(self, x, scaling): + if (scaling is None): + def _rescale(): + pass + return _rescale + target = __DEFAULT_NUMBA_TARGET__ + signature, layout = make_numba_signature(x, scaling) + if (x.ndim == 1): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def scale(x, scaling): + for i in xrange(0, x.shape[0]): + x[i] *= scaling + elif (x.ndim == 2): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def scale(x, scaling): + for i in prange(0, x.shape[0]): + for j in xrange(0, x.shape[1]): + x[i,j] *= scaling + elif (x.ndim == 3): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def scale(x, scaling): + for i in prange(0, x.shape[0]): + for j in prange(0, x.shape[1]): + for k in xrange(0, x.shape[2]): + x[i,j,k] *= scaling + elif (x.ndim == 4): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def scale(x, scaling): + for i in prange(0, x.shape[0]): + for j in prange(0, x.shape[1]): + for k in prange(0, x.shape[2]): + for l in xrange(0, x.shape[3]): + x[i,j,k,l] *= scaling + else: + raise NotImplementedError(x.ndim) + def _rescale(scale=scale, x=x, scaling=scaling): + scale(x,scaling) + return _rescale @property def input_array(self): @@ -209,90 +423,160 @@ class MklFFTPlan(HostFFTPlanI): return self.out def execute(self): - out = self.out - scaling = self.scaling + if not self._allocated: + self.allocate() + self.fn(**self.kwds) + self.rescale() + + + @property + def required_buffer_size(self): + alignment = simd_alignment + if self._required_input_tmp: + sin, din = self._required_input_tmp['size'], self._required_input_tmp['dtype'] + sin *= get_itemsize(din) + Bin = ((sin+alignment-1)//alignment)*alignment + else: + Bin = 0 + if self._required_output_tmp: + sout, dout = self._required_output_tmp['size'], self._required_output_tmp['dtype'] + sout *= get_itemsize(dout) + Bout = ((sout+alignment-1)//alignment)*alignment + else: + Bout = 0 + return Bin+Bout + + def allocate(self, buf=None): + """Allocate plan extra memory, possibly with a custom buffer.""" + if self._allocated: + msg='Plan was already allocated.' + raise RuntimeError(msg) - if self.fn in (dct, idct, dst, idst): - self.fn(**self.kwds) + if (buf is not None): + alignment = simd_alignment + if self._required_input_tmp: + sin, din, ssin = self._required_input_tmp['size'], self._required_input_tmp['dtype'], self._required_input_tmp['shape'] + sin *= get_itemsize(din) + Bin = ((sin+alignment-1)//alignment)*alignment + else: + Bin = 0 + if self._required_output_tmp: + sout, dout, ssout = self._required_output_tmp['size'], self._required_output_tmp['dtype'], self._required_output_tmp['shape'] + sout *= get_itemsize(dout) + Bout = ((sout+alignment-1)//alignment)*alignment + else: + Bout = 0 + assert buf.dtype.itemsize == 1 + assert buf.size == Bin+Bout + input_buf = buf[:sin].view(dtype=din).reshape(ssin) if Bin else None + output_buf = buf[Bin:Bin+sout].view(dtype=dout).reshape(ssout) if Bout else None else: - out[...] = self.fn(**self.kwds) + input_buf = None + output_buf = None - if (scaling is not None): - out[...] *= scaling + for (k, buf, required_tmp) in zip(('input', 'output'), + (input_buf, output_buf), + (self._required_input_tmp, self._required_output_tmp)): + if (required_tmp is None): + assert buf is None + continue + size = required_tmp['size'] + shape = required_tmp['shape'] + dtype = required_tmp['dtype'] + if (size>0): + if (buf is None): + if self.planner.warn_on_allocation or self.planner.error_on_allocation: + msg='Allocating temporary buffer of size {} for clFFT::{}.' + msg=msg.format(bytes2str(size), id(self)) + if self.planner.error_on_allocation: + raise RuntimeError(msg) + else: + warnings.warn(msg, HysopMKLFftWarning) + buf = self.planner.backend.empty(shape=shape, + dtype=dtype) + elif (buf.shape != shape) or (buf.dtype != dtype): + msg='Buffer does not match required shape: {} != {}' + msg=msg.format(buf.shape, shape) + msg='Buffer does not match required dtype: {} != {}' + msg=msg.format(buf.dtype, dtype) + raise ValueError(msg) + if isinstance(buf, HostArray): + buf = buf.handle + setattr(self, '{}_tmp_buffer'.format(k), buf) + if (buf is not None): + self.kwds['{}_tmp'.format(k)] = buf + self._allocated = True + return self class MklFFT(HostFFTI): """ Interface to compute local to process FFT-like transforms using the mkl fft backend. - Mkl fft backend has many disadvantages: - - only double precision is really supported - - single precision is supported by casting to double precision - - creates intermediate temporary buffers at each call - - no planning capabilities (mkl.fft methods are just wrapped into fake plans) - - The only advantage is that planning won't destroy original inputs. + Mkl fft backend has some disadvantages: + - creates intermediate temporary buffers at each call (out and tmp for real-to-real transforms) + - no planning capabilities (mkl.fft methods are just wrapped into fake plans) """ def __init__(self, backend=None, allocator=None, - warn_on_allocation=True, **kwds): + warn_on_allocation=True, error_on_allocation=False, + destroy_input=None, **kwds): super(MklFFT, self).__init__(backend=backend, allocator=allocator, - warn_on_allocation=warn_on_allocation, **kwds) + warn_on_allocation=warn_on_allocation, error_on_allocation=error_on_allocation, **kwds) self.supported_ftypes = (np.float32, np.float64,) self.supported_ctypes = (np.complex64, np.complex128,) - def fft(self, a, out=None, axis=-1, **kwds): (shape, dtype) = super(MklFFT, self).fft(a=a, out=out, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=_FFT.fft, a=a, out=out, axis=axis, **kwds) + plan = MklFFTPlan(self, fn=fft, a=a, out=out, axis=axis, **kwds) return plan def ifft(self, a, out=None, axis=-1, **kwds): (shape, dtype, s) = super(MklFFT, self).ifft(a=a, out=out, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=_FFT.ifft, a=a, out=out, axis=axis, **kwds) + plan = MklFFTPlan(self, fn=ifft, a=a, out=out, axis=axis, **kwds) return plan def rfft(self, a, out=None, axis=-1, **kwds): (shape, dtype) = super(MklFFT, self).rfft(a=a, out=out, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=_FFT.rfft, a=a, out=out, axis=axis, + plan = MklFFTPlan(self, fn=rfft, a=a, out=out, axis=axis, **kwds) return plan def irfft(self, a, out=None, n=None, axis=-1, **kwds): (shape, dtype, s) = super(MklFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=_FFT.irfft, a=a, out=out, axis=axis, + plan = MklFFTPlan(self, fn=irfft, a=a, out=out, axis=axis, n=shape[axis], **kwds) return plan def dct(self, a, out=None, type=2, axis=-1, **kwds): (shape, dtype) = super(MklFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=dct, a=a, out=out, axis=axis, type=type, **kwds) + plan = MklFFTPlan(self, fn=dct, a=a, out=out, axis=axis, type=type, **kwds) return plan def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): (shape, dtype, _, s) = super(MklFFT, self).idct(a=a, out=out, type=type, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=idct, a=a, out=out, axis=axis, type=type, + plan = MklFFTPlan(self, fn=idct, a=a, out=out, axis=axis, type=type, scaling=first_not_None(scaling, 1.0/s), **kwds) return plan def dst(self, a, out=None, type=2, axis=-1, **kwds): (shape, dtype) = super(MklFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=dst, a=a, out=out, axis=axis, type=type, **kwds) + plan = MklFFTPlan(self, fn=dst, a=a, out=out, axis=axis, type=type, **kwds) return plan def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): (shape, dtype, _, s) = super(MklFFT, self).idst(a=a, out=out, type=type, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) - plan = MklFFTPlan(fn=idst, a=a, out=out, axis=axis, type=type, + plan = MklFFTPlan(self, fn=idst, a=a, out=out, axis=axis, type=type, scaling=first_not_None(scaling, 1.0/s), **kwds) return plan diff --git a/hysop/numerics/fft/host_fft.py b/hysop/numerics/fft/host_fft.py index 15e1cb03cb550e900f3a7aec20a1bae4f2b24c87..21b16859881c9ec9ae7840f7c07cd87b57b4a347 100644 --- a/hysop/numerics/fft/host_fft.py +++ b/hysop/numerics/fft/host_fft.py @@ -13,11 +13,38 @@ import numba as nb from hysop import __FFTW_NUM_THREADS__, __FFTW_PLANNER_EFFORT__, __FFTW_PLANNER_TIMELIMIT__, \ __DEFAULT_NUMBA_TARGET__ from hysop.tools.types import first_not_None, check_instance -from hysop.tools.numba_utils import make_numba_signature +from hysop.tools.numba_utils import HAS_NUMBA, bake_numba_copy, bake_numba_accumulate, bake_numba_transpose +from hysop.tools.decorators import static_vars from hysop.backend.host.host_array_backend import HostArrayBackend from hysop.backend.host.host_array import HostArray from hysop.numerics.fft.fft import FFTQueueI, FFTPlanI, FFTI +def can_exec_hptt(src, dst): + if (src is dst): + return False + if (src.dtype != dst.dtype): + return False + if src.dtype not in (np.float32, np.float64, np.complex64, np.complex128): + return False + if src.flags['C_CONTIGUOUS'] != dst.flags['C_CONTIGUOUS']: + return False + if src.flags['F_CONTIGUOUS'] != dst.flags['F_CONTIGUOUS']: + return False + if not (src.flags['C_CONTIGUOUS'] or src.flags['F_CONTIGUOUS']): + return False + return (src.data is not dst.data) + +try: + import hptt + HAS_HPTT=True + # required version is: https://gitlab.com/keckj/hptt +except ImportError: + HAS_HPTT=False + import warnings + from hysop.tools.warning import HysopPerformanceWarning + msg='Failed to import HPTT module, falling back to slow numpy transpose. Required version is available at https://gitlab.com/keckj/hptt.' + warnings.warn(msg, HysopPerformanceWarning) + class DummyEvent(object): @classmethod def wait(cls): @@ -74,22 +101,32 @@ class HostFFTI(FFTI): error_on_allocation=False, **kwds): """ - Get the default host FFT interface which is a multithreaded FFTW interface with - ESTIMATE planning effort. + Get the default host FFT interface. + Preferred interface is multithreaded MKL FFT with the TBB threading layer (does not work with Intel threading layer). + On import error the interface falls back to a multithreaded FFTW interface with ESTIMATE planning effort. """ - threads = first_not_None(threads, __FFTW_NUM_THREADS__) - planner_effort = first_not_None(planner_effort, __FFTW_PLANNER_EFFORT__) - planning_timelimit = first_not_None(planning_timelimit, __FFTW_PLANNER_TIMELIMIT__) - from hysop.numerics.fft.fftw_fft import FftwFFT - return FftwFFT(threads=threads, - planner_effort=planner_effort, - planning_timelimit=planning_timelimit, - backend=backend, allocator=allocator, - destroy_input=destroy_input, - warn_on_allocation=warn_on_allocation, - warn_on_misalignment=warn_on_misalignment, - error_on_allocation=error_on_allocation, - **kwds) + try: + from hysop.numerics.fft._mkl_fft import MklFFT + return MklFFT(backend=backend, allocator=allocator, + destroy_input=destroy_input, + warn_on_allocation=warn_on_allocation, + error_on_allocation=error_on_allocation, + **kwds) + except ImportError: + raise + from hysop.numerics.fft.fftw_fft import FftwFFT + threads = first_not_None(threads, __FFTW_NUM_THREADS__) + planner_effort = first_not_None(planner_effort, __FFTW_PLANNER_EFFORT__) + planning_timelimit = first_not_None(planning_timelimit, __FFTW_PLANNER_TIMELIMIT__) + return FftwFFT(threads=threads, + planner_effort=planner_effort, + planning_timelimit=planning_timelimit, + backend=backend, allocator=allocator, + destroy_input=destroy_input, + warn_on_allocation=warn_on_allocation, + warn_on_misalignment=warn_on_misalignment, + error_on_allocation=error_on_allocation, + **kwds) def new_queue(self, tg, name): return HostFFTQueue(name=name) @@ -97,22 +134,55 @@ class HostFFTI(FFTI): def plan_copy(self, tg, src, dst): src = self.ensure_callable(src) dst = self.ensure_callable(dst) + + @static_vars(numba_copy=None) def exec_copy(src=src, dst=dst): - dst()[...] = src() + src, dst = src(), dst() + if HAS_HPTT and can_exec_hptt(src, dst): + hptt.tensorTransposeAndUpdate(perm=range(src.ndim), + alpha=1.0, A=src, beta=0.0, B=dst) + elif HAS_NUMBA: + if (exec_copy.numba_copy is None): + exec_copy.numba_copy = bake_numba_copy(src=src, dst=dst) + exec_copy.numba_copy() + else: + dst[...] = src return exec_copy def plan_accumulate(self, tg, src, dst): src = self.ensure_callable(src) dst = self.ensure_callable(dst) - def exec_copy(src=src, dst=dst): - dst()[...] += src() + + @static_vars(numba_accumulate=None) + def exec_accumulate(src=src, dst=dst): + src, dst = src(), dst() + if HAS_HPTT and can_exec_hptt(src, dst): + hptt.tensorTransposeAndUpdate(perm=range(src.ndim), + alpha=1.0, A=src, beta=1.0, B=dst) + elif HAS_NUMBA: + if (exec_accumulate.numba_accumulate is None): + exec_accumulate.numba_accumulate = bake_numba_accumulate(src=src, dst=dst) + exec_accumulate.numba_accumulate() + else: + dst[...] += src return exec_copy def plan_transpose(self, tg, src, dst, axes): src = self.ensure_callable(src) dst = self.ensure_callable(dst) + + @static_vars(numba_transpose=None) def exec_transpose(src=src, dst=dst, axes=axes): - dst()[...] = np.transpose(a=src(), axes=axes) + src, dst = src(), dst() + if HAS_HPTT and can_exec_hptt(src, dst): + hptt.tensorTransposeAndUpdate(perm=axes, + alpha=1.0, A=src, beta=0.0, B=dst) + elif HAS_NUMBA: + if (exec_transpose.numba_transpose is None): + exec_transpose.numba_transpose = bake_numba_transpose(src=src, dst=dst, axes=axes) + exec_transpose.numba_transpose() + else: + dst[...] = np.transpose(a=src, axes=axes) return exec_transpose def plan_fill_zeros(self, tg, a, slices): diff --git a/hysop/numerics/tests/test_fft.py b/hysop/numerics/tests/test_fft.py index 3c4f4d0e9d50b2d9358390ff6b21fb150ec5929e..8f7d461bc2218765fddbf9bf4b03eba4b54d3a37 100644 --- a/hysop/numerics/tests/test_fft.py +++ b/hysop/numerics/tests/test_fft.py @@ -20,9 +20,14 @@ from hysop.numerics.fft.numpy_fft import NumpyFFT from hysop.numerics.fft.scipy_fft import ScipyFFT from hysop.numerics.fft.fftw_fft import FftwFFT from hysop.numerics.fft.gpyfft_fft import GpyFFT +try: + from hysop.numerics.fft._mkl_fft import MklFFT + HAS_MKLFFT=True +except: + HAS_MKLFFT=False + raise -# For now Intel has a bug in their MKL FFTW-like interface so we do not test it -from hysop.numerics.fft._mkl_fft import MklFFT +raise_on_failure = False class TestFFT(object): @@ -32,22 +37,24 @@ class TestFFT(object): 'scipy': ScipyFFT(warn_on_allocation=False), 'fftw': FftwFFT(warn_on_allocation=False, warn_on_misalignment=False), - 'mkl': MklFFT(warn_on_allocation=False), }, Implementation.OPENCL: {} } + + if HAS_MKLFFT: + implementations[Implementation.PYTHON]['mkl'] = MklFFT(warn_on_allocation=False) print print ':: STARTING FFT BACKEND TESTS ::' - for (i,cl_env) in enumerate(iter_clenv()): - print '> Registering opencl backend {} as:\n{}'.format( - i, cl_env) - print - name = 'clfft{}'.format(i) - implementations[Implementation.OPENCL][name] = \ - GpyFFT(cl_env=cl_env, - warn_on_allocation=False, - warn_on_unaligned_output_offset=False) + #for (i,cl_env) in enumerate(iter_clenv()): + #print '> Registering opencl backend {} as:\n{}'.format( + #i, cl_env) + #print + #name = 'clfft{}'.format(i) + #implementations[Implementation.OPENCL][name] = \ + #GpyFFT(cl_env=cl_env, + #warn_on_allocation=False, + #warn_on_unaligned_output_offset=False) msg_shape = 'Expected output array shape to be {} but got {} for implementation {}.' msg_dtype = 'Expected output array dtype to be {} but got {} for implementation {}.' @@ -57,6 +64,8 @@ class TestFFT(object): report_eps = 10 fail_eps = 100 + stop_on_error = True + def _test_1d(self, dtype, failures): print print '::Testing 1D transform, precision {}::'.format(dtype.__name__) @@ -98,6 +107,8 @@ class TestFFT(object): shape=results[r0].shape failures.setdefault(tag, []).append((r0, r1, shape, Einf, Eeps)) print ', '.join(ss) + if failed and raise_on_failure: + raise RuntimeError print '\n FORWARD C2C: complex to complex forward transform' @@ -119,7 +130,7 @@ class TestFFT(object): assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, name) Bin[...] = Href - plan.execute() + evt = plan.execute() H0 = Bin.get() H1 = Bout.get() assert np.array_equal(Href, H0), self.msg_input_modified.format(name) @@ -467,7 +478,8 @@ class TestFFT(object): if failed: print msg='Some implementations failed !' - #raise RuntimeError(msg) + if raise_on_failure: + raise RuntimeError(msg) print '\n C2C-C2C transform' for (shape, cshape, rshape, N, Nc, Nr, @@ -573,7 +585,7 @@ class TestFFT(object): check_distances(results) for (itype,stype) in enumerate(types, 1): - print '\n DST-{}: real to real discrete sinine transform {}'.format( + print '\n DST-{}: real to real discrete sine transform {}'.format( stype.strip(), itype) ttype = 'SIN{}'.format(itype) for (shape, cshape, rshape, N, Nc, Nr, @@ -696,7 +708,7 @@ class TestFFT(object): else: dtypes = (HYSOP_REAL,) for dtype in dtypes: - self._test_forward_backward_1d(dtype=dtype) + #self._test_forward_backward_1d(dtype=dtype) self._test_1d(dtype=dtype, failures=failures.setdefault(dtype.__name__, {})) self.report_failures(failures) diff --git a/hysop/tools/io_utils.py b/hysop/tools/io_utils.py index 34c85ec3eed063df4b8bd8d35b942a9972df607a..1209d91616412e4f62a90fe43c03ec1e99ed70d4 100755 --- a/hysop/tools/io_utils.py +++ b/hysop/tools/io_utils.py @@ -192,7 +192,8 @@ class IO(object): class IOParams(namedtuple("IOParams", ['filename', 'filepath', 'frequency', 'fileformat', - 'dump_times', 'dump_tstart', 'dump_tend', + 'dump_times_fp32', 'dump_times_fp64', + 'dump_tstart', 'dump_tend', 'io_leader', 'visu_leader', 'kwds'])): """ @@ -209,7 +210,7 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath', fileformat : int Format of the file. See notes for available format. Default=HDF5. dump_times: tuple of floats - Extra dump times that should be used to dump in addition to frequency + Extra dump times that should be used to dump in addition to frequency (double precision) dump_tstart: float Start to dump at given time. Defaults to -np.inf (no time constraints). dump_end: float @@ -236,11 +237,19 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath', io_leader=0, visu_leader=0, **kwds): - dump_times = first_not_None(dump_times, ()) dump_tstart = first_not_None(dump_tstart, -np.inf) dump_tend = first_not_None(dump_tend, +np.inf) fileformat = first_not_None(fileformat, IO.HDF5) + dump_times = first_not_None(dump_times, ()) + if (dump_times is not None): + check_instance(dump_times, tuple, values=(float,np.float64)) + dump_times_fp64 = tuple(map(np.float64, dump_times)) + dump_times_fp32 = tuple(map(np.float32, dump_times)) + else: + dump_times_fp32 = () + dump_times_fp64 = () + check_instance(frequency, (int,long)) check_instance(dump_tstart, float) check_instance(dump_tend, float) @@ -271,7 +280,8 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath', IO.check_dir(filepath) return super(IOParams, cls).__new__(cls, filename, filepath, frequency, fileformat, - dump_times, dump_tstart, dump_tend, + dump_times_fp32, dump_times_fp64, + dump_tstart, dump_tend, io_leader, visu_leader, kwds) @@ -282,7 +292,12 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath', if (t < self.dump_tstart) or (t > self.dump_tend): return dump if (frequency >= 0) and simulation.is_time_of_interest: - dump |= (t in self.dump_times) + if isinstance(t, np.float32): + dump |= (t in self.dump_times_fp32) + elif isinstance(t, np.float64): + dump |= (t in self.dump_times_fp64) + else: + raise NotImplementedError(type(t)) if (frequency > 0): dump |= ((simulation.current_iteration % frequency) == 0) return dump @@ -306,6 +321,10 @@ class IOParams(namedtuple("IOParams", ['filename', 'filepath', all_kwds[k] = kwds.get(k, getattr(self, k)) return IOParams(**all_kwds) + @property + def dump_times(self): + return self.dump_times_fp64 + def __str__(self): return self.to_string() diff --git a/hysop/tools/numba_utils.py b/hysop/tools/numba_utils.py index 069ed7caa760adbb7136ed538f92a97cf47baf23..58a73f603acb00a0e61227e4781e82a6fdbbdae2 100644 --- a/hysop/tools/numba_utils.py +++ b/hysop/tools/numba_utils.py @@ -1,8 +1,15 @@ -import numba as nb import numpy as np +from hysop import __DEFAULT_NUMBA_TARGET__ from hysop.core.arrays.array import Array +try: + import numba as nb + from numba import prange + HAS_NUMBA=True +except ImportError: + HAS_NUMBA=False + def make_numba_signature(*args, **kwds): raise_on_cl_array = kwds.pop('raise_on_cl_array', True) if kwds: @@ -93,3 +100,169 @@ def make_numba_signature(*args, **kwds): numba_args += (na,) return nb.void(*numba_args), ','.join(numba_layout) + + +def bake_numba_copy(dst, src, target=None): + if (target is None): + target = __DEFAULT_NUMBA_TARGET__ + signature, layout = make_numba_signature(dst, src) + if (dst.ndim == 1): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def copy(dst, src): + for i in xrange(0, dst.shape[0]): + dst[i] = src[i] + elif (dst.ndim == 2): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def copy(dst, src): + for i in prange(0, dst.shape[0]): + for j in xrange(0, dst.shape[1]): + dst[i,j] = src[i,j] + elif (dst.ndim == 3): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def copy(dst, src): + for i in prange(0, dst.shape[0]): + for j in prange(0, dst.shape[1]): + for k in xrange(0, dst.shape[2]): + dst[i,j,k] = src[i,j,k] + elif (dst.ndim == 4): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def copy(dst, src): + for i in prange(0, dst.shape[0]): + for j in prange(0, dst.shape[1]): + for k in prange(0, dst.shape[2]): + for l in xrange(0, dst.shape[3]): + dst[i,j,k,l] = src[i,j,k,l] + else: + raise NotImplementedError(dst.ndim) + def _exec_copy(copy=copy, dst=dst, src=src): + copy(dst,src) + return _exec_copy + + +def bake_numba_accumulate(dst, src, target=None): + if (target is None): + target = __DEFAULT_NUMBA_TARGET__ + signature, layout = make_numba_signature(dst, src) + if (dst.ndim == 1): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def accumulate(dst, src): + for i in xrange(0, dst.shape[0]): + dst[i] += src[i] + elif (dst.ndim == 2): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def accumulate(dst, src): + for i in prange(0, dst.shape[0]): + for j in xrange(0, dst.shape[1]): + dst[i,j] += src[i,j] + elif (dst.ndim == 3): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def accumulate(dst, src): + for i in prange(0, dst.shape[0]): + for j in prange(0, dst.shape[1]): + for k in xrange(0, dst.shape[2]): + dst[i,j,k] += src[i,j,k] + elif (dst.ndim == 4): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def accumulate(dst, src): + for i in prange(0, dst.shape[0]): + for j in prange(0, dst.shape[1]): + for k in prange(0, dst.shape[2]): + for l in xrange(0, dst.shape[3]): + dst[i,j,k,l] += src[i,j,k,l] + else: + raise NotImplementedError(dst.ndim) + def _exec_accumulate(accumulate=accumulate, dst=dst, src=src): + accumulate(dst,src) + return _exec_accumulate + + +def bake_numba_transpose(src, dst, axes, target=None): + # inefficient permutations + + if (target is None): + target = __DEFAULT_NUMBA_TARGET__ + signature, layout = make_numba_signature(dst, src) + + assert src.ndim == dst.ndim + assert dst.shape == tuple(src.shape[i] for i in axes) + assert dst.dtype == src.dtype + ndim = src.ndim + + def noop(dst, src): + pass + + if (ndim == 1): + transpose = noop + elif (ndim == 2): + if axes == (0,1): + transpose == noop + elif axes == (1,0): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def transpose(dst, src): + for i in prange(0, src.shape[0]): + for j in xrange(0, src.shape[1]): + dst[j,i] = src[i,j] + else: + raise NotImplementedError + elif (ndim == 3): + if axes == (0,1,2): + transpose == noop + elif axes == (0,2,1): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def transpose(dst, src): + for i in prange(0, src.shape[0]): + for j in prange(0, src.shape[1]): + for k in xrange(0, src.shape[2]): + dst[i,k,j] = src[i,j,k] + elif axes == (1,0,2): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def transpose(dst, src): + for i in prange(0, src.shape[0]): + for j in prange(0, src.shape[1]): + for k in xrange(0, src.shape[2]): + dst[j,i,k] = src[i,j,k] + elif axes == (1,2,0): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def transpose(dst, src): + for i in prange(0, src.shape[0]): + for j in prange(0, src.shape[1]): + for k in xrange(0, src.shape[2]): + dst[j,k,i] = src[i,j,k] + elif axes == (2,1,0): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def transpose(dst, src): + for i in prange(0, src.shape[0]): + for j in prange(0, src.shape[1]): + for k in xrange(0, src.shape[2]): + dst[k,j,i] = src[i,j,k] + elif axes == (2,0,1): + @nb.guvectorize([signature], layout, + target=target, nopython=True, cache=True) + def transpose(dst, src): + for i in prange(0, src.shape[0]): + for j in prange(0, src.shape[1]): + for k in xrange(0, src.shape[2]): + dst[k,i,j] = src[i,j,k] + else: + raise NotImplementedError(axes) + else: + raise NotImplementedError(ndim) + + def _exec_transpose(transpose=transpose, dst=dst, src=src): + transpose(dst,src) + return _exec_transpose + + diff --git a/hysop/tools/types.py b/hysop/tools/types.py index bba65cfb2690209e8bac93417803488f2ad7312d..0fa5fea24ceaa47b3471e7c3741e980d0a6ee864 100644 --- a/hysop/tools/types.py +++ b/hysop/tools/types.py @@ -1,6 +1,5 @@ from hysop.deps import np from collections import Iterable -from hysop.tools.misc import prod class InstanceOf(object): def __init__(self, cls): @@ -171,6 +170,7 @@ def check_instance(val, cls, allow_none=False, print_offending_value(v, all_val_cls) raise TypeError(msg) elif isinstance(val, np.ndarray): + from hysop.tools.misc import prod dtype = kargs.pop('dtype', None) shape = kargs.pop('shape', None) size = kargs.pop('size', None)