diff --git a/examples/example_utils.py b/examples/example_utils.py index 4e5d7706ee3bb2aaea5feac36fa9ae753519821b..f7fee9b5dab904d0353ac2de2e4c27adab03758c 100644 --- a/examples/example_utils.py +++ b/examples/example_utils.py @@ -111,7 +111,7 @@ class HysopArgParser(argparse.ArgumentParser): if (domain is None): domain = 'box' if (default_dump_dir is None): - default_dump_dir = '/tmp/hysop/{}'.format(prog_name) + default_dump_dir = '{}/hysop/{}'.format(self.tmp_dir(), prog_name) self._domain = domain @@ -605,10 +605,18 @@ class HysopArgParser(argparse.ArgumentParser): dest='clean', help=('Clean the dump_folder prior to launch '+ '(remove all *.txt, *.png, *.xmf and *.h5 files)')) + file_io.add_argument('--debug-dump-dir', type=str, + default='{}/hysop/debug'.format(self.tmp_dir()), + dest='debug_dump_dir', + help=('Target root directory for debug dumps. Debug dumps will appear into <dump dir>/<target>.')) + file_io.add_argument('--debug-dump-target', type=str, default=None, + dest='debug_dump_target', + help=('Tag for field debug dumps. Debug dumps will appear into <dump dir>/<target>.')) return file_io def _check_file_io_args(self, args): - self._check_default(args, ('dump_dir', 'cache_dir'), str, allow_none=True) + self._check_default(args, ('dump_dir', 'cache_dir', + 'debug_dump_dir', 'debug_dump_target'), str, allow_none=True) self._check_default(args, ('dump_freq'), int, allow_none=True) self._check_default(args, ('dump_times'), tuple, allow_none=True) self._check_default(args, ('override_cache'), bool, allow_none=True) diff --git a/examples/taylor_green/taylor_green.py b/examples/taylor_green/taylor_green.py index 278fba68c440ca779571aee302b99228936cd6e9..bb06faebcde0965c488c585125c65c0eef9f3650 100644 --- a/examples/taylor_green/taylor_green.py +++ b/examples/taylor_green/taylor_green.py @@ -37,7 +37,6 @@ def compute(args): from hysop.methods import SpaceDiscretization, Remesh, TimeIntegrator, \ ComputeGranularity, Interpolation - # Define the domain dim = args.ndim npts = args.npts @@ -226,12 +225,21 @@ def compute(args): min_max_U.Finf, min_max_W.Finf, adapt_dt.equivalent_CFL, filename='parameters.txt', precision=8) + # Attach a field debug dumper if requested + from hysop.tools.debug_dumper import DebugDumper + if args.debug_dump_target: + debug_dumper = DebugDumper( + path=args.debug_dump_dir, + name=args.debug_dump_target, + force_overwrite=True, enable_on_op_apply=True) + else: + debug_dumper = None + # Initialize only the vorticity - dfields = problem.input_discrete_fields - dfields[vorti].initialize(formula=init_vorticity) + problem.initialize_field(vorti, formula=init_vorticity) # Finally solve the problem - problem.solve(simu, dry_run=args.dry_run, debug_dumper=None) + problem.solve(simu, dry_run=args.dry_run, debug_dumper=debug_dumper) # Finalize problem.finalize() diff --git a/hysop/backend/device/opencl/opencl_array.py b/hysop/backend/device/opencl/opencl_array.py index 071081edcf0d7f87e9923f6deb833ba45ee7c76a..db424e001f581d1022ebf48744546a647a0a9121 100644 --- a/hysop/backend/device/opencl/opencl_array.py +++ b/hysop/backend/device/opencl/opencl_array.py @@ -36,8 +36,15 @@ class OpenClArray(Array): msg='{} unsupported yet for OpenCl arrays.'.format(handle.dtype) raise TypeError(msg) - backend.check_queue(handle.queue) super(OpenClArray,self).__init__(handle=handle, backend=backend, **kargs) + + # at this time the opencl backend works only with the default_queue + # so we enforce it. + if (handle.queue is not None) and (handle.queue is not self.default_queue): + msg = 'pyopencl.Array has been created with a non-default queue.' + raise RuntimeError(msg) + backend.check_queue(handle.queue) + self.set_default_queue(self.default_queue) def as_symbolic_array(self, name, **kwds): """ @@ -191,6 +198,11 @@ class OpenClArray(Array): """ Sets the default queue for this array. """ + # at this time the opencl backend works only with the default_queue + # so we enforce it. + if (queue is not self.default_queue): + msg='Default queue override has been disabled for non-default queues.' + raise RuntimeError(msg) queue = self.backend.check_queue(queue) self._handle.queue = queue def reset_default_queue(self): @@ -367,7 +379,7 @@ class OpenClArray(Array): try: self.handle.setitem(subscript=subscript, value=value, queue=queue) - except (ValueError, RuntimeError): + except: from hysop.backend.device.opencl.opencl_copy_kernel_launchers import \ OpenClCopyBufferRectLauncher kl = OpenClCopyBufferRectLauncher.from_slices(varname='buffer', src=value, diff --git a/hysop/backend/device/opencl/opencl_array_backend.py b/hysop/backend/device/opencl/opencl_array_backend.py index a8e8972323a390ae6f48e667aaf9d487ddf60358..2b72adb314c799290bc375d6b55290251001d85e 100644 --- a/hysop/backend/device/opencl/opencl_array_backend.py +++ b/hysop/backend/device/opencl/opencl_array_backend.py @@ -343,12 +343,18 @@ class OpenClArrayBackend(ArrayBackend): check_instance(queue, cl.CommandQueue, allow_none=True) check_instance(allocator, OpenClAllocator, allow_none=True) + # disable multi queue support at this time + msg='Non-default queue support has been disabled. Please provide a cl_env only.' + if (cl_env is None): + raise RuntimeError(msg) + if (queue is not None) and (queue is not cl_env.default_queue): + raise RuntimeError(msg) + if (queue is None): cl_env = cl_env or get_or_create_opencl_env() context = cl_env.context queue = cl_env.default_queue allocator = allocator or cl_env.allocator - assert allocator.context == context elif (cl_env is None): context = queue.context if (allocator is None): @@ -1486,8 +1492,13 @@ class OpenClArrayBackend(ArrayBackend): shape = to_tuple(shape) - queue = queue or self.default_queue - self.check_queue(queue) + # at this time the opencl backend works only with the default_queue + # so we enforce it by not binding any Array to any queue. + if (queue is not None) and (queue is not self.default_queue): + msg = 'pyopencl.Array has been created with non-default queue.' + raise RuntimeError(msg) + queue = self.check_queue(queue) + cq = queue #force default queue if buf is None: assert offset==0 @@ -1504,7 +1515,7 @@ class OpenClArrayBackend(ArrayBackend): buf = allocator.allocate_aligned(size, alignment=alignment) - handle = self._call(clArray.Array, cq=queue, shape=shape, dtype=dtype, order=order, + handle = self._call(clArray.Array, cq=cq, shape=shape, dtype=dtype, order=order, allocator=None, data=buf, offset=offset, strides=strides, events=events) return self.wrap(handle) @@ -1518,10 +1529,8 @@ class OpenClArrayBackend(ArrayBackend): from hysop.backend.device.opencl.opencl_array import Array, OpenClArray from hysop.backend.host.host_array import HostArray acls = a.__class__.__name__ - self.check_queue(queue) + queue = self.check_queue(queue) - queue = queue or self.default_queue - if (dtype is not None): pass if isinstance(a, Array): @@ -1557,7 +1566,7 @@ class OpenClArrayBackend(ArrayBackend): Return an array copy of the given object. """ self._unsupported_argument('copy','order',order,MemoryOrdering.SAME_ORDER) - self.check_queue(queue) + queue = queue = self.check_queue(queue) return self._call(a.handle.copy, queue=queue) # Filling @@ -1565,7 +1574,7 @@ class OpenClArrayBackend(ArrayBackend): """ Fill the array with given value """ - self.check_queue(queue) + queue = queue = self.check_queue(queue) if a.flags.forc: # only c-contiguous arrays are handled by pyopencl a.handle.fill(value=value, queue=queue) @@ -3067,7 +3076,7 @@ class OpenClArrayBackend(ArrayBackend): Default generator is clRandom.PhiloxGenerator. """ self._check_argtype('rand', 'shape', shape, tuple) - self.check_queue(queue) + queue = self.check_queue(queue) if (generator is None): generator = clRandom.PhiloxGenerator(context=self.context) if (out is None): @@ -3083,7 +3092,7 @@ class OpenClArrayBackend(ArrayBackend): Return samples from the standard normal distribution [mu,sigma]. Default generator is clRandom.PhiloxGenerator. """ - self.check_queue(queue) + queue = self.check_queue(queue) self._check_argtype('randn', 'shape', shape, tuple) if (generator is None): generator = clRandom.PhiloxGenerator(context=self.context) diff --git a/hysop/backend/device/opencl/opencl_operator.py b/hysop/backend/device/opencl/opencl_operator.py index a941e0b9788dd3161bd19f307e9a6f9cca1a61e6..18550a1ea682f55220ec27d3b3d14820afdeed1c 100644 --- a/hysop/backend/device/opencl/opencl_operator.py +++ b/hysop/backend/device/opencl/opencl_operator.py @@ -148,8 +148,10 @@ class OpenClOperator(ComputationalGraphOperator): msg=msg.format(precision) raise ValueError(msg) - self.typegen = self.cl_env.build_typegen(precision=precision, float_dump_mode=float_dump_mode, - use_short_circuit_ops=use_short_circuit_ops, unroll_loops=unroll_loops) + self.typegen = self.cl_env.build_typegen(precision=precision, + float_dump_mode=float_dump_mode, + use_short_circuit_ops=use_short_circuit_ops, + unroll_loops=unroll_loops) self.autotuner_config = autotuner_config self.precision = convert_precision(precision) diff --git a/hysop/backend/device/opencl/operator/poisson_rotational.py b/hysop/backend/device/opencl/operator/poisson_rotational.py index 52862ec7f7ab53addac8382eb0febf321b0f2fc6..068057da057879b4d9cbdf08aeb5132fe592197e 100644 --- a/hysop/backend/device/opencl/operator/poisson_rotational.py +++ b/hysop/backend/device/opencl/operator/poisson_rotational.py @@ -219,22 +219,16 @@ class OpenClPoissonRotational(PoissonRotationalOperatorBase, OpenClSymbolic): self._exchange_W_ghosts = self.dW.exchange_ghosts(build_launcher=True) @op_apply - def apply(self, simulation, debug_dumper=None, **kwds): + def apply(self, simulation, **kwds): '''Solve the PoissonRotational equation.''' # /!\ clFFT use the destination buffer as a scratch # so we reverse the order of forward transforms. - #it = simulation.current_iteration - #t = simulation.t() - - #debug_dumper(it, t, 'W', self.dW) - for fp in self.forward_W_plans: evt, = fp.enqueue(queue=self.queue) # project and recover vorticity if required if self._do_project(simulation): - assert False evt = self.projection_kernel(queue=self.queue) for bp in self.backward_W_plans: evt, = bp.enqueue() @@ -243,11 +237,9 @@ class OpenClPoissonRotational(PoissonRotationalOperatorBase, OpenClSymbolic): # recover velocity evt = self.compute_velocity_kernel(queue=self.queue) - + for bp in self.backward_U_plans: evt, = bp.enqueue() - + if (self._exchange_U_ghosts is not None): evt = self._exchange_U_ghosts(queue=self.queue) - - #debug_dumper(it, t, 'U', self.dU) diff --git a/hysop/core/arrays/array.py b/hysop/core/arrays/array.py index 3240192758c35853a9d64e75551deb5c6d7bb0f4..deab126139df360672e04ee89a0cc093c7841e11 100644 --- a/hysop/core/arrays/array.py +++ b/hysop/core/arrays/array.py @@ -70,7 +70,9 @@ class Array(object): Return underlying implementation of this buffer. """ if not hasattr(self, '_handle'): - raise RuntimeError('{} has no _handle defined.', self.__class__) + msg='{} has no _handle defined.' + msg=msg.format(self.__class__) + raise RuntimeError(msg) return self._handle def get_backend(self): @@ -78,7 +80,9 @@ class Array(object): Return the backend corresponding to this array. """ if not hasattr(self, '_backend'): - raise RuntimeError('{} has no _backend defined.', self.__class__) + msg='{} has no _backend defined.' + msg=msg.format(self.__class__) + raise RuntimeError(msg) return self._backend handle = property(get_handle) diff --git a/hysop/core/graph/graph.py b/hysop/core/graph/graph.py index 752f918d55b34d908f44ffed9f4ea200632c17eb..f781d5017746a2e08719cbcd696d32a326a51f9d 100644 --- a/hysop/core/graph/graph.py +++ b/hysop/core/graph/graph.py @@ -129,7 +129,26 @@ def op_apply(f): dbg = ('dbg' in kwds) dbg = dbg and (kwds['dbg'] is not None) dbg = dbg and (kwds['dbg'].enable_on_op_apply) - if dbg: + debug_dump = ('debug_dumper' in kwds) + debug_dump = debug_dump and (kwds['debug_dumper'] is not None) + debug_dump = debug_dump and (kwds['debug_dumper'].enable_on_op_apply) + if debug_dump: + assert 'simulation' in kwds + op = args[0] + simu = kwds['simulation'] + it = simu.current_iteration + t = simu.t() + for dfield in op.input_discrete_fields.values(): + tag = 'pre_{}_{}'.format(op.name, dfield.name) + kwds['debug_dumper'](it, t, tag, + tuple(df.get().handle for df in dfield.data)) + ret = f(*args, **kwds) + for dfield in op.output_discrete_fields.values(): + tag = 'post_{}_{}'.format(op.name, dfield.name) + kwds['debug_dumper'](it, t, tag, + tuple(df.get().handle for df in dfield.data)) + return ret + elif dbg: import inspect msg=inspect.getsourcefile(f) kwds['dbg']('pre '+msg, nostack=True) diff --git a/hysop/fields/cartesian_discrete_field.py b/hysop/fields/cartesian_discrete_field.py index c391f174191ac0c79a5c7f835b53b6ad833c17f5..999e595d039892774f011c656d2bdedb831cea95 100644 --- a/hysop/fields/cartesian_discrete_field.py +++ b/hysop/fields/cartesian_discrete_field.py @@ -548,9 +548,9 @@ CartesianDiscreteFieldView (id={}, tag={}) raise RuntimeError(msg) if __debug__: - assert all((d.size == self.size) for d in data), 'Array size was altered.' - assert all((d.dtype == self.dtype) for d in data), 'Array dtype was altered.' - assert all(all(d.shape == self.shape) for d in data), 'Array shape was altered.' + assert all((d.size == self.size) for d in data), 'Array size has been altered.' + assert all((d.dtype == self.dtype) for d in data), 'Array dtype has been altered.' + assert all(all(d.shape == self.shape) for d in data), 'Array shape has been altered.' if only_finite: for (i,d) in enumerate(data): diff --git a/hysop/fields/ghost_exchangers.py b/hysop/fields/ghost_exchangers.py index 914bf7727d07c0990cfa1bc8a62ec0207d86dc29..6cc1d8a935b0dafbd47c03965d10890a2d18b614 100644 --- a/hysop/fields/ghost_exchangers.py +++ b/hysop/fields/ghost_exchangers.py @@ -205,7 +205,7 @@ class CartesianDiscreteFieldGhostExchanger(GhostExchanger): if all(ghosts[d]>0 for d in directions): for displacements in all_outer_ghost_slices[dim][directions]: ndisplacements = sum(d!=0 for d in displacements) - if (ndisplacements != dim): + if (ndisplacements < 2): continue slc, shape = all_outer_ghost_slices[dim][directions][displacements] value = default_invalid_value(dtype=base_dtype) diff --git a/hysop/fields/tests/test_cartesian.py b/hysop/fields/tests/test_cartesian.py index 5b133403bb987bcb849ccac7675d5600f5c0cde5..507b4bcea01d918b5f515e70d464f48ddd197f36 100644 --- a/hysop/fields/tests/test_cartesian.py +++ b/hysop/fields/tests/test_cartesian.py @@ -134,6 +134,10 @@ def test_serial_initialization_3d(): assert (buf[:,:,nghosts[2]:2*nghosts[2]] == buf[:,:,nghosts[2]+npts[2]-1:]).all() assert (buf[:,:,:nghosts[2]] == buf[:,:,npts[2]-1:nghosts[2]+npts[2]-1]).all() +def iter_backends(): + yield (Backend.HOST, None) + for cl_env in iter_clenv(): + yield (Backend.OPENCL, cl_env) def test_mpi_ghost_exchange(comm): rank = comm.Get_rank() @@ -166,16 +170,18 @@ def test_mpi_ghost_exchange(comm): F0 = Field(domain=domain, name='F0', nb_components=1, dtype=dtype, _register=False) F1 = Field(domain=domain, name='F1', nb_components=2, dtype=dtype, _register=False) - for backend in (Backend.HOST, Backend.OPENCL,): + for (backend, cl_env) in iter_backends(): if rank==0: - print ' >BACKEND.{}'.format(backend) + print ' >BACKEND.{}{}'.format(backend, + '' if (cl_env is None) else '::{}.{}'.format( + cl_env.platform.name.strip(), cl_env.device.name.strip())) for shape in it.product(xrange(0,size+1), repeat=dim): if np.prod(shape, dtype=np.uint32)!=size: continue if rank==0: print ' *cart_shape: {}'.format(shape) topo = CartesianTopology(domain=domain, discretization=discretization, - backend=backend, cart_shape=shape) + backend=backend, cart_shape=shape, cl_env=cl_env) assert (topo.proc_shape==shape).all() def ghost_base(i,d,rank,local_dir): @@ -267,16 +273,18 @@ def test_mpi_ghost_accumulate(comm): F0 = Field(domain=domain, name='F0', nb_components=1, dtype=dtype, _register=False) F1 = Field(domain=domain, name='F1', nb_components=2, dtype=dtype, _register=False) - for backend in (Backend.HOST, Backend.OPENCL,): + for (backend, cl_env) in iter_backends(): if rank==0: - print ' >BACKEND.{}'.format(backend) + print ' >BACKEND.{}{}'.format(backend, + '' if (cl_env is None) else '::{}.{}'.format( + cl_env.platform.name.strip(), cl_env.device.name.strip())) for shape in it.product(xrange(0,size+1), repeat=dim): if np.prod(shape, dtype=np.uint32)!=size: continue if rank==0: print ' *cart_shape: {}'.format(shape) topo = CartesianTopology(domain=domain, discretization=discretization, - backend=backend, cart_shape=shape) + backend=backend, cart_shape=shape, cl_env=cl_env) assert (topo.proc_shape==shape).all() def ghost_base(rank, directions, displacements, i): diff --git a/hysop/numerics/fft/fft.py b/hysop/numerics/fft/fft.py index ded2b830ac10f40a089d23dfb3b3d8fd17eb2fd9..5a2ff8b079492da2731e50669e888b66e63e6163 100644 --- a/hysop/numerics/fft/fft.py +++ b/hysop/numerics/fft/fft.py @@ -5,9 +5,18 @@ Base interface for fast Fourier Transforms. """ from abc import ABCMeta, abstractmethod +import warnings import numpy as np +from hysop.tools.types import first_not_None from hysop.tools.numerics import float_to_complex_dtype, complex_to_float_dtype +from hysop.tools.units import bytes2str +from hysop.backend.host.host_array_backend import HostArrayBackend +from hysop.backend.host.host_array import HostArray + +from hysop.tools.warning import HysopWarning +class HysopFFTWarning(HysopWarning): + pass def mk_shape(base_shape, axis, N): """ @@ -44,19 +53,48 @@ class FFTPlanI(object): """ __metaclass__ = ABCMeta + def __init__(self): + self._setup = False + @abstractmethod - def execute(self): + def input_array(self): """ - Execute the FFT plan on current input and output array. + Return currently planned input array. + """ + pass + + @abstractmethod + def output_array(self): + """ + Return currently planned output array. """ pass + + def setup(self): + """ + Method that has to be called before any call to execute. + """ + if self._setup: + msg='Plan was already setup...' + raise RuntimeError(msg) + self._setup = True + return self @abstractmethod - def __call__(self, a=None, out=None): + def execute(self): + """ + Execute the FFT plan on current input and output array. + """ + + def __call__(self, a=None, out=None, **kwds): """ Apply the FFT plan to possibly new input or output arrays. """ - pass + if (a is not None) or (out is not None): + msg='New array execute is not available for FFT backend {}.' + msg=msg.format(type(self).__name__) + raise RuntimeError(msg) + self.execute(**kwds) class FFTI(object): @@ -110,13 +148,31 @@ class FFTI(object): __metaclass__ = ABCMeta - def __init__(self): + def __init__(self, backend=None, warn_on_allocation=True): """Initializes the interface and default supported real and complex types.""" self.supported_ftypes = (np.float32, np.float64) self.supported_ctypes = (np.complex64, np.complex128) self.supported_cosine_transforms = (1,2,3) self.supported_sine_transforms = (1,2,3) + + if (backend is None): + backend = HostArrayBackend.get_or_create(allocator=None) + self.backend = backend + self.warn_on_allocation = warn_on_allocation + def allocate_output(self, out, shape, dtype): + """Alocate output if required and check shape and dtype.""" + if (out is None): + if self.warn_on_allocation: + nbytes = np.prod(shape, dtype=np.int64)*dtype.itemsize + msg='FftwFFT: allocating aligned output array of size {}.' + msg=msg.format(bytes2str(nbytes)) + warnings.warn(msg, HysopFFTWarning) + out = self.backend.empty(shape=shape, dtype=dtype) + else: + assert out.dtype == dtype + assert out.shape == shape + return out @abstractmethod def fft(self, a, out, axis=-1, **kwds): @@ -140,7 +196,9 @@ class FFTI(object): Notes ----- N = a.shape[axis] - out[0] will contain the sum of the signal (zero-frequency term always real for real inputs). + out[0] will contain the sum of the signal (zero-frequency term always real for + real inputs). + If N is even: out[1:N/2] contains the positive frequency terms. out[N/2] contains the Nyquist frequency (always real for real inputs). @@ -185,7 +243,8 @@ class FFTI(object): @abstractmethod def rfft(self, a, out, axis=-1, **kwds): """ - Compute the unscaled one-dimensional real to hermitian complex discrete Fourier Transform. + Compute the unscaled one-dimensional real to hermitian complex discrete Fourier + Transform. Parameters ---------- @@ -206,8 +265,8 @@ class FFTI(object): Notes ----- For real inputs there is no information in the negative frequency components that - is not already available from the positive frequency component because of the Hermitian - symmetry. + is not already available from the positive frequency component because of the + Hermitian symmetry. N = out.shape[axis] = a.shape[axis]//2 + 1 out[0] will contain the sum of the signal (zero-frequency term, always real). @@ -334,19 +393,16 @@ class FFTI(object): ------- (shape, dtype) of the output array determined from the input array. """ - if type==2: - type=3 - elif type==3: - type=2 + itype = [1,3,2,4][type-1] n = a.shape[axis] - N = 2*(n - (type==1)) - scaling = 1.0/N + N = 2*(n - (itype==1)) + logical_size = N assert a.dtype in self.supported_ftypes, a.dtype - assert type in self.supported_cosine_transforms, self.supported_cosine_transforms + assert itype in self.supported_cosine_transforms, self.supported_cosine_transforms if (out is not None): assert a.dtype == out.dtype assert np.array_equal(a.shape, out.shape) - return (a.shape, a.dtype, type, scaling) + return (a.shape, a.dtype, itype, logical_size) @abstractmethod def dst(self, a, out=None, type=2, axis=-1, **kwds): @@ -397,17 +453,14 @@ class FFTI(object): ------- (shape, dtype) of the output array determined from the input array. """ - if type==2: - type=3 - elif type==3: - type=2 + itype = [1,3,2,4][type-1] n = a.shape[axis] - N = 2*(n + (type==1)) - scaling = 1.0/N + N = 2*(n + (itype==1)) + logical_size = N assert a.dtype in self.supported_ftypes, a.dtype assert type in self.supported_sine_transforms, self.supported_sine_transforms if (out is not None): assert a.dtype == out.dtype assert np.array_equal(a.shape, out.shape) - return (a.shape, a.dtype, type, scaling) + return (a.shape, a.dtype, itype, logical_size) diff --git a/hysop/numerics/fft/fftw_fft.py b/hysop/numerics/fft/fftw_fft.py index f0b6b4cb6256b5ffe5f6863bcec107862bd381ff..4086472b4d333513ca499dc241d2325c51f1fdd9 100644 --- a/hysop/numerics/fft/fftw_fft.py +++ b/hysop/numerics/fft/fftw_fft.py @@ -9,14 +9,9 @@ import pyfftw import numpy as np from hysop.tools.types import first_not_None -from hysop.tools.units import bytes2str from hysop.tools.misc import prod -from hysop.tools.warning import HysopWarning -from hysop.numerics.fft.fft import FFTPlanI, FFTI - -class HysopFFTWWarning(HysopWarning): - pass - +from hysop.numerics.fft.fft import FFTPlanI, FFTI, HostArray, \ + HysopFFTWarning, bytes2str class FftwFFTPlan(FFTPlanI): """ @@ -25,15 +20,36 @@ class FftwFFTPlan(FFTPlanI): Emit warnings when changing input and output alignment. """ - def __init__(self, scaling=None, **plan_kwds): + def __init__(self, a, out, scaling=None, **plan_kwds): super(FftwFFTPlan, self).__init__() + + if isinstance(a, HostArray): + plan_kwds['input_array'] = a.handle + else: + plan_kwds['input_array'] = a + + if isinstance(out, HostArray): + plan_kwds['output_array'] = out.handle + else: + plan_kwds['output_array'] = out + plan = pyfftw.FFTW(**plan_kwds) if (not plan.simd_aligned): msg='Resulting plan is not SIMD aligned ({} bytes boundary).' msg=msg.format(pyfftw.simd_alignment) - warnings.warn(msg, HysopFFTWWarning) + warnings.warn(msg, HysopFFTWarning) self.plan = plan self.scaling = scaling + self.out = out + self.a = a + + @property + def input_array(self): + return self.a + + @property + def output_array(self): + return self.out def check_new_inputs(self, a, out): plan = self.plan @@ -41,22 +57,27 @@ class FftwFFTPlan(FFTPlanI): msg='New input array is not aligned on original planned input alignment of {} bytes.' msg+='\nA copy will be made.' msg=msg.format(plan.input_alignment) - warnings.warn(msg, HysopFFTWWarning) + warnings.warn(msg, HysopFFTWarning) if (out is not None) and (not pyfftw.is_byte_aligned(out, n=plan.output_alignment)): msg='New output array is not aligned on original planned output alignment of {} bytes.' msg+='\nA copy will be made.' msg=msg.format(plan.output_alignment) - warnings.warn(msg, HysopFFTWWarning) + warnings.warn(msg, HysopFFTWarning) + self.out = out + self.a = a + if isinstance(a, HostArray): + a = a.handle + if isinstance(out, HostArray): + out = out.handle + return (a, out) def execute(self): """ Execute plan on current input and output array. """ - out = self.plan.__call__() + self.plan.__call__() if (self.scaling is not None): - out[...] *= self.scaling - return out - + self.output_array[...] *= self.scaling def __call__(self, a=None, out=None): """ @@ -65,12 +86,10 @@ class FftwFFTPlan(FFTPlanI): will result in a copy being made. Return output array for convenience. """ - self.check_new_inputs(a, out) + (a, out) = self.check_new_inputs(a, out) self.plan(input_array=a, output_array=out, normalize_idft=True) - out = self.plan.output_array if (self.scaling is not None): - out[...] *= self.scaling - return out + self.output_array[...] *= self.scaling class FftwFFT(FFTI): @@ -91,8 +110,11 @@ class FftwFFT(FFTI): planning_timelimit=None, destroy_input=False, warn_on_allocation=True, - warn_on_misalignment=True): - super(FftwFFT, self).__init__() + warn_on_misalignment=True, + backend=None, + **kwds): + super(FftwFFT, self).__init__(backend=backend, + warn_on_allocation=warn_on_allocation, **kwds) self.supported_ftypes = (np.float32, np.float64, np.longdouble) self.supported_ctypes = (np.complex64, np.complex128, np.clongdouble) self.supported_cosine_transforms = (1,2,3,4) @@ -103,7 +125,6 @@ class FftwFFT(FFTI): self.planning_timelimit = planning_timelimit self.destroy_input = destroy_input - self.warn_on_allocation = warn_on_allocation self.warn_on_misalignment = warn_on_misalignment @classmethod @@ -113,29 +134,15 @@ class FftwFFT(FFTI): msg0=msg0.format('{}', pyfftw.simd_alignment) if (a is not None) and not pyfftw.is_byte_aligned(array=a): msg=msg0.format('Input') - warnings.warn(msg, HysopFFTWWarning) + warnings.warn(msg, HysopFFTWarning) elif (out is not None) and not pyfftw.is_byte_aligned(out): msg=msg0.format('Output') - warnings.warn(msg, HysopFFTWWarning) - - def allocate_output(self, out, shape, dtype): - """Alocate output if required and check shape and dtype.""" - if (out is None): - if self.warn_on_allocation: - nbytes = prod(shape)*dtype.itemsize - msg='FftwFFT: allocating aligned output array of size {}.' - msg=msg.format(bytes2str(nbytes)) - warnings.warn(msg, HysopFFTWWarning) - out = pyfftw.empty_aligned(shape=shape, dtype=dtype) - else: - assert out.dtype == dtype - assert out.shape == shape - return out + warnings.warn(msg, HysopFFTWarning) def bake_kwds(self, **kwds): plan_kwds = {} - plan_kwds['input_array'] = kwds.pop('a') - plan_kwds['output_array'] = kwds.pop('out') + plan_kwds['a'] = kwds.pop('a') + plan_kwds['out'] = kwds.pop('out') plan_kwds['direction'] = kwds.pop('direction') plan_kwds['axes'] = kwds.pop('axes', (kwds.pop('axis'),)) plan_kwds['threads'] = kwds.pop('threads', self.threads) @@ -153,7 +160,7 @@ class FftwFFT(FFTI): msg='Unknown keyword arguments: {}' msg=msg.format(', '.join('\'{}\''.format(kwd) for kwd in kwds.keys())) raise RuntimeError(msg) - + return plan_kwds @@ -199,6 +206,7 @@ class FftwFFT(FFTI): return plan def dct(self, a, out=None, type=2, axis=-1, **kwds): + """Planning destroys initial arrays content.""" (shape, dtype) = super(FftwFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) if self.warn_on_misalignment: @@ -210,9 +218,10 @@ class FftwFFT(FFTI): return plan def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): + """Planning destroys initial arrays content.""" (shape, dtype, type, s) = super(FftwFFT, self).idct(a=a, out=out, type=type, axis=axis, **kwds) - scaling = first_not_None(scaling, s) + scaling = first_not_None(scaling, 1.0/s) out = self.allocate_output(out, shape, dtype) if self.warn_on_misalignment: self.check_alignment(a, out) @@ -223,6 +232,7 @@ class FftwFFT(FFTI): return plan def dst(self, a, out=None, type=2, axis=-1, **kwds): + """Planning destroys initial arrays content.""" (shape, dtype) = super(FftwFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds) out = self.allocate_output(out, shape, dtype) if self.warn_on_misalignment: @@ -234,9 +244,10 @@ class FftwFFT(FFTI): return plan def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): + """Planning destroys initial arrays content.""" (shape, dtype, type, s) = super(FftwFFT, self).idst(a=a, out=out, type=type, axis=axis, **kwds) - scaling = first_not_None(scaling, s) + scaling = first_not_None(scaling, 1.0/s) out = self.allocate_output(out, shape, dtype) if self.warn_on_misalignment: self.check_alignment(a, out) diff --git a/hysop/numerics/fft/gpyfft_fft.py b/hysop/numerics/fft/gpyfft_fft.py new file mode 100644 index 0000000000000000000000000000000000000000..57a35085c4f90ae7cef8022e0ea28bb7a2ae2d59 --- /dev/null +++ b/hysop/numerics/fft/gpyfft_fft.py @@ -0,0 +1,921 @@ +""" +FFT iterface for fast Fourier Transforms using CLFFT backend. +:class:`~hysop.numerics.GpyFFT` +:class:`~hysop.numerics.GpyFFTPlan` +""" + +import warnings, struct +import numpy as np +from abc import abstractmethod + +from gpyfft.fft import gfft, GFFT, FFT as FFTPlan +from hysop.numerics.fft.fft import FFTPlanI, FFTI, \ + mk_shape, float_to_complex_dtype, complex_to_float_dtype + +from hysop import vprint +from hysop.constants import Precision +from hysop.tools.types import first_not_None +from hysop.tools.units import bytes2str +from hysop.tools.misc import prod +from hysop.tools.warning import HysopWarning +from hysop.tools.types import first_not_None +from hysop.tools.numerics import is_complex, is_fp + +from hysop.backend.device.opencl import cl, clArray +from hysop.backend.device.codegen.base.variables import dtype_to_ctype +from hysop.backend.device.opencl.opencl_array_backend import OpenClArrayBackend + +class HysopGpyFftWarning(HysopWarning): + pass + + +class GpyFFTPlan(FFTPlanI, FFTPlan): + """ + Build a clFFT plan using the gpyfft python interface. + Emit warnings when transform output has an unaligned buffer offset. + """ + + def __init__(self, cl_env, + in_array, out_array, + axes, direction_forward, + scaling=None, + scale_by_size=None, + warn_on_allocation=True, + warn_on_unaligned_output_offset=True): + """ + Wrap gpyfft.FFT to allow more versatile callback settings + and buffer allocations. + """ + # we do not want to call FFTPlan.__init__ + FFTPlanI.__init__(self) + + self.cl_env = cl_env + self.warn_on_allocation = warn_on_allocation + self.warn_on_unaligned_output_offset = warn_on_unaligned_output_offset + + self.temp_buffer = None + self._setup = False + self._baked = False + self._allocated = False + + self.in_array = in_array + self.out_array = out_array + + self._setup_kwds = { + 'in_array': in_array, + 'out_array': out_array, + 'axes': axes, + 'direction_forward': direction_forward, + 'scaling': scaling, + 'scale_by_size': scale_by_size + } + + def setup(self): + super(GpyFFTPlan, self).setup() + self.setup_plan(**self._setup_kwds) + return self + + def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size): + if (scale_by_size is not None): + if (scaling is None): + scaling = 1.0/scale_by_size + else: + scaling = scaling / scale_by_size + + axes = np.asarray(axes) + assert 0 < axes.size <= 3, axes.size + + # compute strides + t_strides_in, t_distance_in, t_batchsize_in, t_shape, axes_transform = \ + self.calculate_transform_strides(axes, in_array) + + t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \ + self.calculate_transform_strides(axes, out_array) + + # check axes + msg='Error finding transform axis (consider setting axes argument)' + assert np.all(axes_transform == axes_transform_out), msg + + # enforce no input and output overlap (unless inplace) + t_inplace = False + if (in_array.base_data == out_array.base_data): + if (in_array.offset < out_array.offset): + assert (in_array.offset + in_array.nbytes) < out_array.offset + elif (in_array.offset > out_array.offset): + assert (out_array.offset + out_array.nbytes) < in_array.offset + else: + t_inplace = True + + # check input data type + if in_array.dtype in (np.float32, np.complex64): + t_precision = gfft.CLFFT_SINGLE + elif in_array.dtype in (np.float64, np.complex128): + t_precision = gfft.CLFFT_DOUBLE + else: + msg='Unsupported precision {}.' + msg=msg.format(in_array.dtype) + raise RuntimeError(msg) + + if out_array.dtype in (np.float32, np.complex64): + t_precision_out = gfft.CLFFT_SINGLE + elif out_array.dtype in (np.float64, np.complex128): + t_precision_out = gfft.CLFFT_DOUBLE + else: + msg='Unsupported precision {}.' + msg=msg.format(out_array.dtype) + raise RuntimeError(msg) + + if (t_precision != t_precision_out): + msg='Incompatible input and output precisions: {} vs {}' + msg=msg.format(t_precision, t_precision_out) + raise RuntimeError(msg) + + if in_array.dtype in (np.float32, np.float64): + layout_in = gfft.CLFFT_REAL + layout_out = gfft.CLFFT_HERMITIAN_INTERLEAVED + + expected_out_shape = list(in_array.shape) + expected_out_shape[axes_transform[0]] = \ + expected_out_shape[axes_transform[0]]//2 + 1 + msg='output array shape {} does not match expected shape: {}' + msg=msg.format(out_array.shape, expected_out_shape) + assert out_array.shape == tuple(expected_out_shape), msg + elif in_array.dtype in (np.complex64, np.complex128): + if out_array.dtype in (np.complex64, np.complex128): + layout_in = gfft.CLFFT_COMPLEX_INTERLEAVED + layout_out = gfft.CLFFT_COMPLEX_INTERLEAVED + else: + layout_in = gfft.CLFFT_HERMITIAN_INTERLEAVED + layout_out = gfft.CLFFT_REAL + t_shape = t_shape_out + + if t_inplace and ((layout_in is gfft.CLFFT_REAL) or + (layout_out is gfft.CLFFT_REAL)): + assert ((in_array.strides[axes_transform[0]] == in_array.dtype.itemsize) and \ + (out_array.strides[axes_transform[0]] == out_array.dtype.itemsize)), \ + 'inline real transforms need stride 1 for first transform axis' + + plan = GFFT.create_plan(self.context, t_shape) + plan.inplace = t_inplace + plan.strides_in = t_strides_in + plan.strides_out = t_strides_out + plan.distances = (t_distance_in, t_distance_out) + plan.batch_size = t_batchsize_in + plan.precision = t_precision + plan.layouts = (layout_in, layout_out) + if (scaling is not None): + if direction_forward: + plan.scale_forward = scaling + else: + plan.scale_backward = scaling + + (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, + layout_in, layout_out) + + self.plan = plan + self.in_data = in_data + self.out_data = out_data + self.is_inplace = t_inplace + self.direction_forward = direction_forward + + def set_callbacks(self, plan, in_array, out_array, layout_in, layout_out, **kwds): + in_data = in_array.base_data + pre, user_data = self.pre_offset_callback(in_array, layout_in, **kwds) + + out_data = out_array.base_data + post, user_data = self.post_offset_callback(out_array, layout_out, **kwds) + + # ********************************************************************************** + # Keep a reference to callback source code to prevent dangling const char* pointers. + # Do not remove because clfft only get the pointer and gpyfft does not increase the + # refcount of those strings. + self.pre_callback_src = pre + self.post_callback_src = post + # ********************************************************************************** + + plan.set_callback(u'pre_callback', pre, 'pre', user_data=user_data) + plan.set_callback(u'post_callback', post, 'post', user_data=user_data) + return (in_data, out_data) + + def bake(self, queue=None): + if self._baked: + msg='Plan was already baked.' + raise RuntimeError(msg) + queue = first_not_None(queue, self.queue) + self.plan.bake(queue) + self._baked = True + return self + + def allocate(self, buf=None): + if self._allocated: + msg='Plan was already allocated.' + raise RuntimeError(msg) + size = self.plan.temp_array_size + if (size>0): + if (buf is None): + if self.warn_on_allocation: + msg='Allocating temporary buffer of size {} for clFFT::{}.' + msg=msg.format(bytes2str(size), id(self)) + warnings.warn(msg, HysopWarning) + buf = cl.Buffer(self.context, cl.mem_flags.READ_WRITE, size=size) + self.temp_buffer = buf + elif (buf.size != size): + msg='Buffer does not match required size: {} != {}' + msg=msg.format(buf.size, size) + raise ValueError(msg) + else: + self.temp_buffer = buf.data + else: + assert (buf is None) + self._allocated = True + return self + + def enqueue(self, queue=None, wait_for_events=None): + """ + Enqueue transform with array base_data. + """ + if not self._baked: + self.bake(queue) + if not self._allocated: + self.allocate() + + queue = first_not_None(queue, self.queue) + in_data, out_data = self.in_data, self.out_data + direction_forward = self.direction_forward + + if self.is_inplace: + events = self.plan.enqueue_transform((queue,), + (in_data,), + direction_forward=direction_forward, + temp_buffer=self.temp_buffer, + wait_for_events=wait_for_events) + else: + events = self.plan.enqueue_transform((queue,), + (in_data,), (out_data), + direction_forward=direction_forward, + temp_buffer=self.temp_buffer, + wait_for_events=wait_for_events) + evt, = events + return evt + + def enqueue_arrays(self, *args, **kwds): + msg='Enqueue arrays is not supported yet.' + raise NotImplementedError(msg) + + def execute(self, **kwds): + return self.enqueue(**kwds) + + @property + def required_buffer_size(self): + assert self._baked, 'plan has not been baked yet.' + return self.plan.temp_array_size + + @property + def ready(self): + return (self._baked and self._allocated) + + @property + def queue(self): + return self.cl_env.default_queue + + @property + def context(self): + return self.cl_env.context + + @property + def input_array(self): + return self.in_array + + @property + def output_array(self): + return self.out_array + + @classmethod + def check_dtype(cls, dtype, layout): + if layout in (gfft.CLFFT_HERMITIAN_INTERLEAVED, gfft.CLFFT_COMPLEX_INTERLEAVED): + if not is_complex(dtype): + msg='Layout is {} but got array with dtype {}.' + msg=msg.format(layout, dtype) + raise RuntimeError(msg) + elif layout in (gfft.CLFFT_REAL,): + if not is_fp(dtype): + msg='Layout is CLFFT_REAL but got array with dtype {}.' + msg=msg.format(dtype) + raise RuntimeError(msg) + else: + msg='Unsupported data layout {}.' + msg=msg.format(layout) + raise NotImplementedError(msg) + + def get_array_offset(self, array): + dtype = array.dtype + if (array.offset % dtype.itemsize) != 0: + msg='Unaligned array offset.' + raise RuntimeError(msg) + base_offset = (array.offset // dtype.itemsize) + return base_offset + + def pre_offset_callback(self, in_array, layout_in, **kwds): + dtype = in_array.dtype + fp = dtype_to_ctype(dtype) + self.check_dtype(dtype, layout_in) + base_offset = self.get_array_offset(in_array) + callback = \ + '''{fp} pre_callback(const __global void* input, + const uint offset, + __global void* userdata) {{ + __global {fp}* in = (__global {fp}*) input; + return in[{base_offset}uL+offset]; + }}'''.format(fp=fp, base_offset=base_offset) + + return callback, None + + def post_offset_callback(self, out_array, layout_out, **kwds): + dtype = out_array.dtype + self.check_dtype(dtype, layout_out) + fp = dtype_to_ctype(dtype) + base_offset = self.get_array_offset(out_array) + + callback = \ + '''void post_callback(__global void* output, + const uint offset, + __global void* userdata, + const {fp} fftoutput) {{ + __global {fp}* out = (__global {fp}*) output; + out[{base_offset}uL+offset] = fftoutput; + }}'''.format(fp=fp, base_offset=base_offset) + + return callback, None + + def allocate_plans(cls, backend, plans): + tmp_size = max(plan.required_buffer_size for plan in plans) + + if (tmp_size>0): + msg='Allocating an additional {} temporary buffer for clFFT.' + msg=msg.format(bytes2str(tmp_size)) + vprint(msg) + tmp_buffer = backend.empty(shape=(tmp_size), dtype=npw.uint8) + for plan in plans: + if (plan.required_buffer_size > tmp_buffer.nbytes): + msg='\nFATAL ERROR: Failed to allocate temporary buffer for clFFT.' + msg+='\n => clFFT expected {} bytes but only {} bytes have been allocated.\n' + msg=msg.format(plan.required_buffer_size, tmp_buffer.nbytes) + raise RuntimeError(msg) + else: + buf = tmp_buffer[:plan.required_buffer_size] + plan.allocate(buf=buf) + else: + for plan in plans: + assert plan.required_buffer_size == 0 + plan.allocate() + tmp_buffer = None + return tmp_buffer + + +class GpyR2RPlan(GpyFFTPlan): + """Specialization for real to real transforms built from r2c or c2r transforms.""" + + def __init__(self, in_array, out_array, scale_by_size, **kwds): + """ + Handmade R2R transforms rely on a fake output that will + never really be written. This output is necessary because + clFFT use it as a temporary storage during the FFT computation. + + The real output array will be passed as user data in a post callback. + """ + msg='Incompatible shapes {} vs {}.'.format(in_array.shape, out_array.shape) + assert np.array_equal(in_array.shape, out_array.shape), msg + msg='Incompatible dtypes {} vs {}.'.format(in_array.dtype, out_array.dtype) + assert (in_array.dtype == out_array.dtype) + msg='Only single and double precision are supported, got {}.'.format(in_array.dtype) + assert in_array.dtype in (np.float32, np.float64), msg + + scale_by_size = first_not_None(scale_by_size, 1) + super(GpyR2RPlan, self).__init__(in_array=in_array, out_array=out_array, + scale_by_size=scale_by_size, **kwds) + + @classmethod + def fake_array(cls, shape, dtype, strides=None): + class DummyArray(object): + def __init__(self, shape, strides, dtype): + assert (shape is not None) + assert (dtype is not None) + dtype = np.dtype(dtype) + if (strides is None): + strides = self.compute_strides(shape, dtype) + assert len(shape) == len(strides) + self.shape = shape + self.strides = strides + self.dtype = dtype + + @classmethod + def compute_strides(cls, shape, dtype): + strides = list(shape)[::-1] + strides[1:] = np.cumprod(strides[:-1]) + strides[0] = 1 + strides = tuple(x*dtype.itemsize for x in strides) + return strides + + array = DummyArray(shape=shape, strides=strides, dtype=dtype) + return array + + def generate_twiddles(self, name, base, count, typegen, dtype): + ctype = float_to_complex_dtype(dtype) + fp = dtype_to_ctype(dtype) + K = np.arange(count) + + E = np.exp(base*K, dtype=np.complex128) + base = '\t\t({fp}2)({}, {})' + vals = ',\n'.join(base.format( + typegen.dump(x.real), typegen.dump(x.imag), fp=fp) for x in E) + twiddles = \ + ''' + __constant const {fp}2 {name}[{N}] = {{ +{vals} + }}; + '''.format(fp=fp, name=name, vals=vals, N=count) + return twiddles; + + @abstractmethod + def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size): + """Redefine plan creation behaviour.""" + pass + + +class GpyDCTIPlan(GpyR2RPlan): + def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size): + assert in_array.data != out_array.data, 'inplace R2R transforms are not supported.' + axis = in_array.ndim - 1 + assert len(axes)==1 + assert axes[0] in (-1, axis) + axes = np.asarray([axis]) + + assert scale_by_size > 0 + + # build a fake R2C plan + layout_in = gfft.CLFFT_REAL + layout_out = gfft.CLFFT_HERMITIAN_INTERLEAVED + + shape = in_array.shape + dtype = in_array.dtype + fp = dtype_to_ctype(dtype) + N = shape[axis] + rshape = mk_shape(shape, axis, 2*N-2) + cshape = mk_shape(shape, axis, N) + ctype = float_to_complex_dtype(dtype) + + fake_input = self.fake_array(shape=rshape, dtype=dtype) + fake_output = self.fake_array(shape=cshape, dtype=ctype) + + t_strides_in, t_distance_in, t_batchsize_in, t_shape_in, axes_transform_in = \ + self.calculate_transform_strides(axes, fake_input) + t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \ + self.calculate_transform_strides(axes, fake_output) + + assert np.array_equal(t_batchsize_in, t_batchsize_out) + assert np.array_equal(axes_transform_in, axes_transform_out) + t_shape = t_shape_in + t_batch_size = t_batchsize_in + t_inplace = False + + plan = GFFT.create_plan(self.context, t_shape) + plan.inplace = t_inplace + plan.batch_size = t_batch_size + plan.strides_in = t_strides_in + plan.strides_out = t_strides_out + plan.distances = (t_distance_in, t_distance_out) + plan.layouts = (layout_in, layout_out) + if (dtype == np.float32): + plan.precision = gfft.CLFFT_SINGLE + else: + plan.precision = gfft.CLFFT_DOUBLE + assert scaling is None + if (scaling is not None): + if direction_forward: + plan.scale_forward = scaling + else: + plan.scale_backward = scaling + + (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, + layout_in, layout_out, + N=N, fp=fp, S=scale_by_size) + + self.in_data = in_data + self.out_data = out_data + self.is_inplace = t_inplace + self.direction_forward = direction_forward + self.plan = plan + + def pre_offset_callback(self, in_array, layout_in, N, fp, S): + base_offset = self.get_array_offset(in_array) + pre = \ + '''{fp} pre_callback(const __global void* input, uint k, + __global void* userdata) {{ + __global {fp}* in = (__global {fp}*)(input) + {base_offset}uL; + in += k/(2*{N}-2) * {N}; + {fp} ret; + k = k%(2*{N}-2); + if (k<{N}) {{ + ret = in[k]; + }} + else {{ + ret = in[2*{N}-k-2]; + }} + return ret; + }}'''.format(N=N, fp=fp, base_offset=base_offset) + return pre, None + + def post_offset_callback(self, out_array, layout_out, N, fp, S): + base_offset = self.get_array_offset(out_array) + post = \ + '''void post_callback(__global void* output, uint k, + __global void* userdata, {fp}2 fftoutput) {{ + __global {fp}* out = (__global {fp}*)(output) + {base_offset}uL; + out[k] = fftoutput.x/{S}; + }}'''.format(N=N, S=S, fp=fp, base_offset=base_offset) + return post, None + + +class GpyDCTIIPlan(GpyR2RPlan): + def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size): + assert in_array.data != out_array.data, 'inplace R2R transforms are not supported.' + axis = in_array.ndim - 1 + assert len(axes)==1 + assert axes[0] in (-1, axis) + axes = np.asarray([axis]) + + assert scale_by_size > 0 + + # build a fake R2C plan + layout_in = gfft.CLFFT_REAL + layout_out = gfft.CLFFT_HERMITIAN_INTERLEAVED + + shape = in_array.shape + dtype = in_array.dtype + fp = dtype_to_ctype(dtype) + N = shape[axis] + rshape = mk_shape(shape, axis, N) + cshape = mk_shape(shape, axis, N//2+1) + ctype = float_to_complex_dtype(dtype) + + fake_input = self.fake_array(shape=rshape, dtype=dtype) + fake_output = self.fake_array(shape=cshape, dtype=ctype) + + t_strides_in, t_distance_in, t_batchsize_in, t_shape_in, axes_transform_in = \ + self.calculate_transform_strides(axes, fake_input) + t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \ + self.calculate_transform_strides(axes, fake_output) + + assert np.array_equal(t_batchsize_in, t_batchsize_out) + assert np.array_equal(axes_transform_in, axes_transform_out) + t_shape = t_shape_in + t_batch_size = t_batchsize_in + t_inplace = False + + plan = GFFT.create_plan(self.context, t_shape) + plan.inplace = t_inplace + plan.batch_size = t_batch_size + plan.strides_in = t_strides_in + plan.strides_out = t_strides_out + plan.distances = (t_distance_in, t_distance_out) + plan.layouts = (layout_in, layout_out) + if (dtype == np.float32): + plan.precision = gfft.CLFFT_SINGLE + precision = Precision.FLOAT + else: + plan.precision = gfft.CLFFT_DOUBLE + precision = Precision.DOUBLE + if (scaling is not None): + if direction_forward: + plan.scale_forward = scaling + else: + plan.scale_backward = scaling + + typegen = self.cl_env.build_typegen(precision=precision, + float_dump_mode='hex', use_short_circuit_ops=False, unroll_loops=False) + + (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, + layout_in, layout_out, + N=N, S=scale_by_size, + fp=fp, typegen=typegen) + + self.in_data = in_data + self.out_data = out_data + self.is_inplace = t_inplace + self.direction_forward = direction_forward + self.plan = plan + + def pre_offset_callback(self, in_array, layout_in, N, S, fp, typegen): + base_offset = self.get_array_offset(in_array) + n = (N-1)//2 + 1 + pre = \ + ''' + {fp} pre_callback(const __global void* input, uint k, __global void* userdata) {{ + __global {fp}* in = (__global {fp}*)(input) + {base_offset}uL; + {fp} ret; + if (k<{n}) {{ + ret = in[2*k]; + }} + else {{ + ret = in[2*({N}-k)-1]; + }} + return ret; + }}'''.format(n=n, N=N, fp=fp, base_offset=base_offset) + return pre, None + + def post_offset_callback(self, out_array, layout_out, N, S, fp, typegen): + base_offset = self.get_array_offset(out_array) + n = (N-1)//2 + 1 + twiddles = self.generate_twiddles('dct2_twiddles', base=-1.0j*np.pi/(2*N), count=N//2+1, + dtype=out_array.dtype, typegen=typegen) + post = \ + ''' + {twiddles} + void post_callback(__global void* output, uint k, __global void* userdata, {fp}2 R) {{ + __global {fp}* out = (__global {fp}*)(output) + {base_offset}uL; + //const {fp}2 T = ({fp}2)(cos(-pi*k/(2*N)), sin(-pi*k/(2*N))); + const {fp}2 T = dct2_twiddles[k]; + if (k < {n}) {{ + out[k] = +2*(R.x*T.x - R.y*T.y)/{S}; + }} + if (k > 0) {{ + out[{N}-k] = -2*(R.x*T.y + R.y*T.x)/{S}; + }} + }}'''.format(N=N, S=S, n=n, fp=fp, base_offset=base_offset, + twiddles=twiddles) + return post, None + + +class GpyDCTIIIPlan(GpyR2RPlan): + def setup_plan(self, in_array, out_array, axes, direction_forward, scaling, scale_by_size): + assert in_array.data != out_array.data, 'inplace R2R transforms are not supported.' + axis = in_array.ndim - 1 + assert len(axes)==1 + assert axes[0] in (-1, axis) + axes = np.asarray([axis]) + + assert scale_by_size>0 + + # build a fake R2C plan + layout_in = gfft.CLFFT_HERMITIAN_INTERLEAVED + layout_out = gfft.CLFFT_REAL + + shape = in_array.shape + dtype = in_array.dtype + fp = dtype_to_ctype(dtype) + N = shape[axis] + rshape = mk_shape(shape, axis, N) + cshape = mk_shape(shape, axis, N//2+1) + ctype = float_to_complex_dtype(dtype) + + fake_input = self.fake_array(shape=cshape, dtype=ctype) + fake_output = self.fake_array(shape=rshape, dtype=dtype) + + t_strides_in, t_distance_in, t_batchsize_in, t_shape_in, axes_transform_in = \ + self.calculate_transform_strides(axes, fake_input) + t_strides_out, t_distance_out, t_batchsize_out, t_shape_out, axes_transform_out = \ + self.calculate_transform_strides(axes, fake_output) + + assert np.array_equal(t_batchsize_in, t_batchsize_out) + assert np.array_equal(axes_transform_in, axes_transform_out) + t_shape = t_shape_out + t_batch_size = t_batchsize_in + t_inplace = False + + plan = GFFT.create_plan(self.context, t_shape) + plan.inplace = t_inplace + plan.batch_size = t_batch_size + plan.strides_in = t_strides_in + plan.strides_out = t_strides_out + plan.distances = (t_distance_in, t_distance_out) + plan.layouts = (layout_in, layout_out) + if (dtype == np.float32): + plan.precision = gfft.CLFFT_SINGLE + precision = Precision.FLOAT + else: + plan.precision = gfft.CLFFT_DOUBLE + precision = Precision.DOUBLE + if (scaling is not None): + plan.scale_forward = scaling/float(N) + plan.scale_backward = scaling/float(N) + + typegen = self.cl_env.build_typegen(precision=precision, + float_dump_mode='hex', use_short_circuit_ops=False, unroll_loops=False) + + (in_data, out_data) = self.set_callbacks(plan, in_array, out_array, + layout_in, layout_out, + N=N, S=scale_by_size, + fp=fp, typegen=typegen) + + self.in_data = in_data + self.out_data = out_data + self.is_inplace = t_inplace + self.direction_forward = direction_forward + self.plan = plan + + def pre_offset_callback(self, in_array, layout_in, N, S, fp, typegen): + base_offset = self.get_array_offset(in_array) + twiddles = self.generate_twiddles('dct3_twiddles', base=+1.0j*np.pi/(2*N), count=N//2+1, + dtype=in_array.dtype, typegen=typegen) + pre = \ + ''' + {twiddles} + {fp}2 pre_callback(const __global void* input, uint k, __global void* userdata) {{ + __global {fp}* in = (__global {fp}*)(input) + {base_offset}uL; + const {fp}2 T = ({fp}2)(cos({pi}*k/(2*{N})), sin({pi}*k/(2*{N}))); + //const {fp}2 T = dct3_twiddles[k]; + {fp}2 C, R; + R.x = in[k]; + if ( k == 0 ) {{ + R.y = 0; + }} + else {{ + R.y = -in[{N}-k]; + }} + C.x = R.x*T.x - R.y*T.y; + C.y = R.x*T.y + R.y*T.x; + return C; + }}'''.format(N=N, fp=fp, base_offset=base_offset, + pi=typegen.dump(np.pi), twiddles=twiddles) + return pre, None + + def post_offset_callback(self, out_array, layout_out, N, S, fp, typegen): + base_offset = self.get_array_offset(out_array) + n = (N-1)//2 + 1 + post = \ + ''' + void post_callback(__global void* output, uint k, __global void* userdata, {fp} R) {{ + __global {fp}* out = (__global {fp}*)(output) + {base_offset}uL; + if (k < {n}) {{ + out[2*k] = {N}*R/{S}; + }} + else {{ + out[2*({N}-k)-1] = {N}*R/{S}; + }} + }}'''.format(N=N, S=S, n=n, fp=fp, base_offset=base_offset) + return post, None + + + +class GpyDSTIPlan(GpyR2RPlan): + pass +class GpyDSTIIPlan(GpyR2RPlan): + pass +class GpyDSTIIIPlan(GpyR2RPlan): + pass + + +class GpyFFT(FFTI): + """ + Interface to compute local to process FFT-like transforms using the clFFT backend + trough the gpyfft python interface. + + clFFT backend has many advantages: + - single and double precision supported + - no intermediate temporary buffers created at each call. + - all required temporary buffers can be supplied or are auto-allocated only once. + - planning capability but no caching capabilities + - injection of custom opencl code for pre and post processing. + + Planning may destroy initial arrays content. + Executing a plan may result in unwanted writes to output data, see notes. + + Notes + ----- + Output array is used during transform and if out.data is not aligned + on device.MEM_BASE_ADDR_ALIGN the begining of the buffer may be overwritten by + intermediate transform results. + + out.data = out.base_data + out.offset + if (offset%alignment > 0) + out.base_data[0:out.size] + will be trashed during computation and the result of the transform will go to + out.base_data[out.offset:out.offset+out.size] + + Thus for every transforms out.base_data[0:min(out.offset,out.size)] may be overwritten with trash data. + The default behaviour is to warn when output data is not aligned on device memory boundary. + """ + + def __init__(self, cl_env, backend=None, + warn_on_allocation=True, + warn_on_unaligned_output_offset=True): + + if (backend is None): + backend = OpenClArrayBackend.get_or_create(cl_env=cl_env, + queue=cl_env.default_queue, allocator=None) + + super(GpyFFT, self).__init__(backend=backend) + self.supported_ftypes = (np.float32, np.float64) + self.supported_ctypes = (np.complex64, np.complex128) + self.supported_cosine_transforms = (1,2,3) + self.supported_sine_transforms = () + + self.cl_env = cl_env + self.warn_on_allocation = warn_on_allocation + self.warn_on_unaligned_output_offset = warn_on_unaligned_output_offset + + def allocate_output(self, out, shape, dtype): + """Alocate output if required and check shape and dtype.""" + if (out is None): + if self.warn_on_allocation: + nbytes = prod(shape)*dtype.itemsize + msg='GpyFFT: allocating output array of size {}.' + msg=msg.format(bytes2str(nbytes)) + warnings.warn(msg, HysopGpyFftWarning) + out = self.backend.empty(shape=shape, dtype=dtype) + else: + assert out.dtype == dtype + assert out.shape == shape + return out + + def bake_kwds(self, **kwds): + plan_kwds = {} + plan_kwds['in_array'] = kwds.pop('a') + plan_kwds['out_array'] = kwds.pop('out') + plan_kwds['scaling'] = kwds.pop('scaling', None) + plan_kwds['scale_by_size'] = kwds.pop('scale_by_size', None) + plan_kwds['direction_forward'] = kwds.pop('direction_forward', True) + plan_kwds['axes'] = kwds.pop('axes', (kwds.pop('axis'),)) + plan_kwds['cl_env'] = kwds.pop('cl_env', self.cl_env) + plan_kwds['warn_on_allocation'] = kwds.pop('warn_on_allocation', self.warn_on_allocation) + plan_kwds['warn_on_unaligned_output_offset'] = \ + kwds.pop('warn_on_unaligned_output_offset', + self.warn_on_unaligned_output_offset) + + if kwds: + msg='Unknown keyword arguments: {}' + msg=msg.format(', '.join('\'{}\''.format(kwd) for kwd in kwds.keys())) + raise RuntimeError(msg) + + return plan_kwds + + def fft(self, a, out=None, axis=-1, **kwds): + (shape, dtype) = super(GpyFFT, self).fft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) + kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds) + plan = GpyFFTPlan(**kwds) + return plan + + def ifft(self, a, out=None, axis=-1, **kwds): + (shape, dtype) = super(GpyFFT, self).ifft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) + kwds = self.bake_kwds(a=a, out=out, axis=axis, direction_forward=False, **kwds) + plan = GpyFFTPlan(**kwds) + return plan + + def rfft(self, a, out=None, axis=-1, **kwds): + (shape, dtype) = super(GpyFFT, self).rfft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) + kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds) + plan = GpyFFTPlan(**kwds) + return plan + + def irfft(self, a, out=None, n=None, axis=-1, **kwds): + (shape, dtype) = super(GpyFFT, self).irfft(a=a, out=out, axis=axis, + n=n, **kwds) + out = self.allocate_output(out, shape, dtype) + kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds) + plan = GpyFFTPlan(**kwds) + return plan + + def dct(self, a, out=None, type=2, axis=-1, **kwds): + (shape, dtype) = super(GpyFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) + kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds) + if type==1: + plan = GpyDCTIPlan(**kwds) + elif type==2: + plan = GpyDCTIIPlan(**kwds) + elif type==3: + plan = GpyDCTIIIPlan(**kwds) + else: + msg='Unimplemented cosine transform type {}'.format(itype) + raise RuntimeError(msg) + return plan + + def idct(self, a, out=None, type=2, axis=-1, **kwds): + (shape, dtype, itype, logical_size) = super(GpyFFT, self).idct(a=a, out=out, type=type, + axis=axis, **kwds) + return self.dct(a=a, out=out, type=itype, axis=axis, scale_by_size=logical_size, **kwds) + + def dst(self, a, out=None, type=2, axis=-1, **kwds): + (shape, dtype) = super(GpyFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) + kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds) + if type==1: + plan = GpyDSTIPlan(**kwds) + elif type==2: + plan = GpyDSTIIPlan(**kwds) + elif type==3: + plan = GpyDSTIIIPlan(**kwds) + else: + msg='Unimplemented sine transform type {}'.format(itype) + raise RuntimeError(msg) + return plan + + def idst(self, a, out=None, type=2, axis=-1, **kwds): + (shape, dtype, itype, logical_size) = super(GpyFFT, self).idst(a=a, out=out, type=type, + axis=axis, **kwds) + kwds = self.bake_kwds(a=a, out=out, axis=axis, **kwds) + return self.dst(a=a, out=out, type=itype, axis=axis, scale_by_size=logical_size, **kwds) + diff --git a/hysop/numerics/fft/numpy_fft.py b/hysop/numerics/fft/numpy_fft.py index 964b04ba246726377f6d0bcaee9d250a54e84643..e49713d66cb72804c1ca17f27abf48ee3b1f6132 100644 --- a/hysop/numerics/fft/numpy_fft.py +++ b/hysop/numerics/fft/numpy_fft.py @@ -9,10 +9,10 @@ from numpy import fft as _FFT from hysop.tools.types import first_not_None from hysop.numerics.fft.fft import FFTPlanI, FFTI, \ - mk_view, mk_shape, \ - complex_to_float_dtype, float_to_complex_dtype + complex_to_float_dtype, float_to_complex_dtype, \ + mk_view, mk_shape, HostArray -def dct(a, type=2, axis=-1): +def dct(a, out=None, type=2, axis=-1): ndim = a.ndim shape = a.shape N = a.shape[axis] @@ -24,8 +24,13 @@ def dct(a, type=2, axis=-1): X = np.empty(shape=s0, dtype=a.dtype) np.concatenate((a, a[slc0][slc1]), axis=axis, out=X) res = _FFT.rfft(X, axis=axis).real + if (out is None): + out = res + else: + assert out.shape == res.shape + out[...] = res elif (type==2): - # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory + # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory n0 = N//2 + 1 n1 = (N-1)//2 + 1 slc0 = mk_view(ndim, axis, 1, None, None) @@ -41,11 +46,14 @@ def dct(a, type=2, axis=-1): X = _FFT.rfft(X, axis=axis) X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4] - res = np.empty_like(a) - res[slc5] = +X.real[slc5] - res[slc6] = -X.imag[slc0][slc3] + if (out is None): + out = np.empty_like(a) + else: + assert out.shape == a.shape + out[slc5] = +X.real[slc5] + out[slc6] = -X.imag[slc0][slc3] elif (type==3): - # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory + # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory ctype = float_to_complex_dtype(a.dtype) n0 = N//2 + 1 n1 = (N-1)//2 + 1 @@ -66,17 +74,20 @@ def dct(a, type=2, axis=-1): X = _FFT.irfft(X, axis=axis, n=N) X *= N - res = np.empty_like(a) - res[slc4] = X[slc5] - res[slc6] = X[slc2][slc3] + if (out is None): + out = np.empty_like(a) + else: + assert out.shape == a.shape + out[slc4] = X[slc5] + out[slc6] = X[slc2][slc3] else: stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'] msg='DCT-{} has not been implemented yet.' msg=msg.format(stypes[type-1]) raise NotImplementedError(msg) - return res + return out -def dst(a, type=2, axis=-1): +def dst(a, out=None, type=2, axis=-1): ndim = a.ndim shape = a.shape N = a.shape[axis] @@ -90,9 +101,13 @@ def dst(a, type=2, axis=-1): Z = np.zeros(shape=s1, dtype=a.dtype) np.concatenate((Z, -a, Z, a[slc0]), axis=axis, out=X) res = _FFT.rfft(X, axis=axis).imag - res = res[slc1] + if (out is None): + out = np.empty_like(a) + else: + assert out.shape == a.shape + out[...] = res[slc1] elif (type==2): - # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory + # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory n0 = N//2 + 1 n1 = (N-1)//2 + 1 slc0 = mk_view(ndim, axis, 1, None, None) @@ -109,54 +124,55 @@ def dst(a, type=2, axis=-1): X = _FFT.rfft(X, axis=axis) X *= (2*np.exp(-1j*np.pi*np.arange(n0)/(2*N)))[slc4] - res = np.empty_like(a) - res[slc5] = -X.imag[slc7] - res[slc6] = +X.real[slc3] + if (out is None): + out = np.empty_like(a) + else: + assert out.shape == a.shape + out[slc5] = -X.imag[slc7] + out[slc6] = +X.real[slc3] elif (type==3): - # O(sqrt(log(N))) error, O(N) complexity, O(4N) memory + # O(sqrt(log(N))) error, O(N) complexity, O(3N) memory ctype = float_to_complex_dtype(a.dtype) n0 = N//2 + 1 n1 = (N-1)//2 + 1 slc0 = mk_view(ndim, axis, None, n0, None) - slc1 = mk_view(ndim, axis, 1 , None, None) - slc2 = mk_view(ndim, axis, n1 , None, None) - slc3 = mk_view(ndim, axis, None, None, -1) - slc4 = mk_view(ndim, axis, None, None, +2) - slc5 = mk_view(ndim, axis, None, n1, None) - slc6 = mk_view(ndim, axis, 1, None, +2) - slc7 = mk_view(ndim, axis, None, None, None, default=None) + slc1 = mk_view(ndim, axis, None, None, -1) + slc2 = mk_view(ndim, axis, 1, None, None) + slc3 = mk_view(ndim, axis, None, N-n1, None) + slc4 = mk_view(ndim, axis, None, None, None, default=None) + slc5 = mk_view(ndim, axis, None, None, 2) + slc6 = mk_view(ndim, axis, None, n1, None) + slc7 = mk_view(ndim, axis, 1, None, 2) + slc8 = mk_view(ndim, axis, n1, None, None) s0 = mk_shape(shape, axis, n0) X = np.zeros(shape=s0, dtype=ctype) - X.real[slc0] = +a[slc0] - X.imag[slc1] = -a[slc2][slc3] - X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc7] + X.real[slc0] = +a[slc1][slc0] + X.imag[slc2] = -a[slc3] + X *= np.exp(+1j*np.pi*np.arange(n0)/(2*N))[slc4] X = _FFT.irfft(X, axis=axis, n=N) - X *= N + X[...] *= N - res = np.empty_like(a) - res[slc4] = X[slc5] - res[slc6] = X[slc2][slc3] + if (out is None): + out = np.empty_like(a) + else: + assert out.shape == a.shape + out[slc5] = +X[slc6] + out[slc7] = -X[slc8][slc1] else: stypes=['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'] msg='DCT-{} has not been implemented yet.' msg=msg.format(stypes[type-1]) raise NotImplementedError(msg) - return res + return out -def idct(a, type=2, axis=-1, **kwds): +def idct(a, out=None, type=2, axis=-1, **kwds): itype = [1,3,2,4][type-1] - return dct(a=a, type=itype, axis=axis, **kwds) + return dct(a=a, out=out, type=itype, axis=axis, **kwds) -def idst(a, type=2, axis=-1, **kwds): +def idst(a, out=None, type=2, axis=-1, **kwds): itype = [1,3,2,4][type-1] - return dst(a=a, type=itype, axis=axis, **kwds) - -# extend numpy fft interface with R2R transforms -for (fname,fn) in zip( ('dct', 'dst', 'idct', 'idst'), - (dct, dst, idct, idst) ): - if not hasattr(_FFT, fname): - setattr(_FFT, fname, fn) + return dst(a=a, out=out, type=itype, axis=axis, **kwds) class NumpyFFTPlan(FFTPlanI): @@ -164,58 +180,44 @@ class NumpyFFTPlan(FFTPlanI): Wrap a numpy fft call (numpy.fft does not offer real planning capabilities). """ - def __init__(self, fn, out, - scaling=None, - output_dtype=lambda dtype: dtype, - **kwds): + def __init__(self, fn, a, out, scaling=None, **kwds): super(NumpyFFTPlan, self).__init__() - assert callable(output_dtype) - self.fn = fn - self.out = out - self.kwds = kwds.copy() - self.scaling = scaling - self.output_dtype = output_dtype + + self.fn = fn + self.a = a + self.out = out + self.scaling = scaling + + if isinstance(a, HostArray): + a = a.handle + if isinstance(out, HostArray): + out = out.handle + + kwds = kwds.copy() + kwds['a'] = a + if fn in (dct, idct, dst, idst): + kwds['out'] = out + self.kwds = kwds + + @property + def input_array(self): + return self.a + + @property + def output_array(self): + return self.out def execute(self): out = self.out scaling = self.scaling - - res = self.fn(**self.kwds) - - if (scaling is not None): - res[...] *= scaling - - if (out is None): - # explicit cast - cast = self.output_dtype(self.kwds['a'].dtype) - return res.astype(cast) + + if self.fn in (dct, idct, dst, idst): + self.fn(**self.kwds) else: - # implicit cast (during copy) - out[...] = res - return out - - def __call__(self, out=None, **kwds): - fn_kwds = self.kwds.copy() - fn_kwds.update(kwds) - if (self.fn is _FFT.irfft): - assert 'n' in fn_kwds + out[...] = self.fn(**self.kwds) - out = first_not_None(out, self.out) - scaling = self.scaling - - res = self.fn(**fn_kwds) - if (scaling is not None): - res[...] *= scaling - - if (out is None): - # explicit cast - cast = self.output_dtype(fn_kwds['a'].dtype) - return res.astype(cast) - else: - # implicit cast (during copy) - out[...] = res - return out + out[...] *= scaling class NumpyFFT(FFTI): @@ -231,55 +233,63 @@ class NumpyFFT(FFTI): The only advantage is that planning won't destroy original inputs. """ - def __init__(self): - super(NumpyFFT, self).__init__() + def __init__(self, backend=None, **kwds): + super(NumpyFFT, self).__init__(backend=backend, **kwds) self.supported_ftypes = (np.float32, np.float64,) self.supported_ctypes = (np.complex64, np.complex128,) + def fft(self, a, out=None, axis=-1, **kwds): - super(NumpyFFT, self).fft(a=a, out=out, axis=axis, **kwds) + (shape, dtype) = super(NumpyFFT, self).fft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = NumpyFFTPlan(fn=_FFT.fft, a=a, out=out, axis=axis, **kwds) return plan def ifft(self, a, out=None, axis=-1, **kwds): - super(NumpyFFT, self).ifft(a=a, out=out, axis=axis, **kwds) + (shape, dtype) = super(NumpyFFT, self).ifft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = NumpyFFTPlan(fn=_FFT.ifft, a=a, out=out, axis=axis, **kwds) return plan def rfft(self, a, out=None, axis=-1, **kwds): - super(NumpyFFT, self).rfft(a=a, out=out, axis=axis, **kwds) + (shape, dtype) = super(NumpyFFT, self).rfft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = NumpyFFTPlan(fn=_FFT.rfft, a=a, out=out, axis=axis, - output_dtype=lambda x: float_to_complex_dtype(x), **kwds) return plan def irfft(self, a, out=None, n=None, axis=-1, **kwds): - (rshape, _) = super(NumpyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds) + (shape, dtype) = super(NumpyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = NumpyFFTPlan(fn=_FFT.irfft, a=a, out=out, axis=axis, - output_dtype=lambda x: complex_to_float_dtype(x), - n=rshape[axis], **kwds) + n=shape[axis], **kwds) return plan def dct(self, a, out=None, type=2, axis=-1, **kwds): (shape, dtype) = super(NumpyFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds) - plan = NumpyFFTPlan(fn=_FFT.dct, a=a, out=out, axis=axis, type=type, **kwds) + out = self.allocate_output(out, shape, dtype) + plan = NumpyFFTPlan(fn=dct, a=a, out=out, axis=axis, type=type, **kwds) return plan def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): (shape, dtype, _, s) = super(NumpyFFT, self).idct(a=a, out=out, type=type, axis=axis, **kwds) - plan = NumpyFFTPlan(fn=_FFT.idct, a=a, out=out, axis=axis, type=type, - scaling=first_not_None(scaling, s), **kwds) + out = self.allocate_output(out, shape, dtype) + plan = NumpyFFTPlan(fn=idct, a=a, out=out, axis=axis, type=type, + scaling=first_not_None(scaling, 1.0/s), **kwds) return plan def dst(self, a, out=None, type=2, axis=-1, **kwds): (shape, dtype) = super(NumpyFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds) - plan = NumpyFFTPlan(fn=_FFT.dst, a=a, out=out, axis=axis, type=type, **kwds) + out = self.allocate_output(out, shape, dtype) + plan = NumpyFFTPlan(fn=dst, a=a, out=out, axis=axis, type=type, **kwds) return plan def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): - (shape, dtype, _, s) = super(NumpyFFT, self).idst(a=a, out=out, type=type, axis=axis, **kwds) - plan = NumpyFFTPlan(fn=_FFT.idst, a=a, out=out, axis=axis, type=type, - scaling=first_not_None(scaling, s), **kwds) + (shape, dtype, _, s) = super(NumpyFFT, self).idst(a=a, out=out, type=type, axis=axis, + **kwds) + out = self.allocate_output(out, shape, dtype) + plan = NumpyFFTPlan(fn=idst, a=a, out=out, axis=axis, type=type, + scaling=first_not_None(scaling, 1.0/s), **kwds) return plan diff --git a/hysop/numerics/fft/scipy_fft.py b/hysop/numerics/fft/scipy_fft.py index c1d5fb68fe3f9c1e31b67156b95c1d7afbd45883..cf7c76042240e34f924f80f0b442836a1333114b 100644 --- a/hysop/numerics/fft/scipy_fft.py +++ b/hysop/numerics/fft/scipy_fft.py @@ -9,7 +9,9 @@ import scipy as sp from scipy import fftpack as _FFT from hysop.tools.types import first_not_None -from hysop.numerics.fft.fft import FFTPlanI, FFTI, complex_to_float_dtype, float_to_complex_dtype +from hysop.numerics.fft.fft import FFTPlanI, FFTI, \ + complex_to_float_dtype, float_to_complex_dtype, \ + mk_shape, mk_view, HostArray class ScipyFFTPlan(FFTPlanI): @@ -17,74 +19,97 @@ class ScipyFFTPlan(FFTPlanI): Wrap a scipy fftpack call (scipy.fftpack does not offer real planning capabilities). """ - def __init__(self, fn, out, + def __init__(self, fn, x, out, axis, scaling=None, **kwds): super(ScipyFFTPlan, self).__init__() self.fn = fn + self.x = x self.out = out + self.axis = axis self.scaling = scaling - self.kwds = kwds.copy() - def execute(self): - return self.__call__() + kwds = kwds.copy() + kwds['x'] = x + kwds['axis'] = axis + self.kwds = kwds + + @property + def input_array(self): + return self.x + + @property + def output_array(self): + return self.out - def __call__(self, out=None, **kwds): + def execute(self): fn = self.fn fn_kwds = self.kwds.copy() - fn_kwds.update(kwds) if (fn is _FFT.irfft): assert 'n' in fn_kwds - in_ = fn_kwds['x'] - axis = fn_kwds['axis'] + x = self.x + out = self.out + if isinstance(x, HostArray): + x = x.handle + if isinstance(out, HostArray): + out = out.handle + + axis = self.axis + mkv = lambda *args, **kwds: mk_view(x.ndim, axis, *args, **kwds) if fn is _FFT.irfft: - assert axis in (in_.ndim-1, -1) - in_ = in_.view(dtype=complex_to_float_dtype(in_.dtype)) + x = x.view(dtype=complex_to_float_dtype(x.dtype)) is_even = (fn_kwds['n']%2==0) - rshape = list(in_.shape) - rshape[-1] -= 1 + is_even - _in = np.empty(shape=rshape, dtype=in_.dtype) - _in[...,1:] = in_[...,2:in_.shape[-1]-is_even] - _in[...,0] = in_[...,0] + rshape = list(x.shape) + rshape[axis] -= (1 + is_even) + _in = np.empty(shape=rshape, dtype=x.dtype) + _in[mkv(1,None)] = x[mkv(2,x.shape[axis]-is_even)] + _in[mkv(0)] = x[mkv(0)] fn_kwds['x'] = _in # custom implementation of DCT-I and DST-I # default scipy FFTPACK implementation has optimal complexity # but O(sqrt(N)) error. if fn in (_FFT.dct, _FFT.idct) and fn_kwds['type']==1: - assert axis in (in_.ndim-1, -1) - N = in_.shape[axis] - X = np.hstack((in_, in_[1:-1][::-1])).astype(in_.dtype) + N = x.shape[axis] + shape = x.shape + ndim = x.ndim + slc0 = mk_view(ndim, axis, 1, -1) + slc1 = mk_view(ndim, axis, None, None, -1) + slc2 = mk_view(ndim, axis, 2, N, None) + slc3 = mk_view(ndim, axis, 3, None, 2) + slc4 = mk_view(ndim, axis, None, N, None) + s0 = mk_shape(shape, axis, 2*N-2) + X = np.empty(shape=s0, dtype=x.dtype) + np.concatenate((x, x[slc0][slc1]), axis=axis, out=X) res = _FFT.rfft(X, axis=axis) - res[2:N] = res[3::2] - res = res[:N] + res[slc2] = res[slc3] + res = res[slc4] elif fn in (_FFT.dst, _FFT.idst) and fn_kwds['type']==1: - assert axis in (in_.ndim-1, -1) - N = in_.shape[axis] - X = np.hstack((0, -in_, 0, +in_[::-1])).astype(in_.dtype) + assert axis in (x.ndim-1, -1) + N = x.shape[axis] + X = np.hstack((0, -x, 0, +x[::-1])).astype(x.dtype) res = _FFT.rfft(X, axis=axis)[2::2] else: res = fn(**fn_kwds) - out = first_not_None(out, self.out) if fn is _FFT.rfft: - assert axis in (in_.ndim-1, -1) - is_even = (in_.shape[axis] % 2 == 0) + assert axis in (x.ndim-1, -1) + is_even = (x.shape[axis] % 2 == 0) if (out is None): rshape = list(res.shape) rshape[axis] += 1 + is_even - out = np.empty(dtype=in_.dtype, shape=rshape) + out = np.empty(dtype=x.dtype, shape=rshape) else: rtype = complex_to_float_dtype(out.dtype) out = out.view(dtype=rtype) ctype = float_to_complex_dtype(out.dtype) - out[..., 1:out.shape[-1]-is_even] = res - out[..., 0] = out[..., 1] - out[..., 1] = 0.0 + out[mkv(1,out.shape[axis]-is_even)] = res + out[mkv(0)] = out[mkv(1)] + out[mkv(1)] = 0.0 if is_even: - out[..., -1] = 0.0 + out[mkv(-1)] = 0.0 out = out.view(dtype=ctype) elif (out is not None): out[...] = res @@ -111,53 +136,62 @@ class ScipyFFT(FFTI): Planning won't destroy original inputs. """ - def __init__(self): - super(ScipyFFT, self).__init__() + def __init__(self, **kwds): + super(ScipyFFT, self).__init__(**kwds) self.supported_ftypes = (np.float32, np.float64,) self.supported_ctypes = (np.complex64, np.complex128,) def fft(self, a, out=None, axis=-1, **kwds): - super(ScipyFFT, self).fft(a=a, out=out, axis=axis, **kwds) + (shape, dtype) = super(ScipyFFT, self).fft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.fft, x=a, out=out, axis=axis, **kwds) return plan def ifft(self, a, out=None, axis=-1, **kwds): - super(ScipyFFT, self).ifft(a=a, out=out, axis=axis, **kwds) + (shape, dtype) = super(ScipyFFT, self).ifft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.ifft, x=a, out=out, axis=axis, **kwds) return plan def rfft(self, a, out=None, axis=-1, **kwds): - super(ScipyFFT, self).rfft(a=a, out=out, axis=axis, **kwds) + (shape, dtype) = super(ScipyFFT, self).rfft(a=a, out=out, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.rfft, x=a, out=out, axis=axis, **kwds) return plan def irfft(self, a, out=None, n=None, axis=-1, **kwds): - (rshape, _) = super(ScipyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds) + (shape, dtype) = super(ScipyFFT, self).irfft(a=a, out=out, n=n, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.irfft, x=a, out=out, axis=axis, - n=rshape[axis], **kwds) + n=shape[axis], **kwds) return plan def dct(self, a, out=None, type=2, axis=-1, **kwds): (shape, dtype) = super(ScipyFFT, self).dct(a=a, out=out, type=type, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.dct, x=a, out=out, axis=axis, type=type, **kwds) return plan def idct(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): (shape, dtype, _, s) = super(ScipyFFT, self).idct(a=a, out=out, type=type, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.idct, x=a, out=out, axis=axis, type=type, - scaling=first_not_None(scaling, s), **kwds) + scaling=first_not_None(scaling, 1.0/s), **kwds) return plan def dst(self, a, out=None, type=2, axis=-1, **kwds): (shape, dtype) = super(ScipyFFT, self).dst(a=a, out=out, type=type, axis=axis, **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.dst, x=a, out=out, axis=axis, type=type, **kwds) return plan def idst(self, a, out=None, type=2, axis=-1, scaling=None, **kwds): - (shape, dtype, _, s) = super(ScipyFFT, self).idst(a=a, out=out, type=type, axis=axis, **kwds) + (shape, dtype, _, s) = super(ScipyFFT, self).idst(a=a, out=out, type=type, axis=axis, + **kwds) + out = self.allocate_output(out, shape, dtype) plan = ScipyFFTPlan(fn=_FFT.idst, x=a, out=out, axis=axis, type=type, - scaling=first_not_None(scaling, s), **kwds) + scaling=first_not_None(scaling, 1.0/s), **kwds) return plan diff --git a/hysop/numerics/tests/test_fft.py b/hysop/numerics/tests/test_fft.py index 60e630514c457054e175dcda7ccfa584ec266712..b3ce1078de7927c26a2bfbfb8d3e0acb24b77c56 100644 --- a/hysop/numerics/tests/test_fft.py +++ b/hysop/numerics/tests/test_fft.py @@ -8,25 +8,40 @@ import numpy as np import itertools as it from hysop.deps import it, sm, random -from hysop.constants import HYSOP_REAL +from hysop.constants import Implementation from hysop.testsenv import __ENABLE_LONG_TESTS__, __HAS_OPENCL_BACKEND__ from hysop.testsenv import opencl_failed, iter_clenv from hysop.tools.contexts import printoptions from hysop.tools.numerics import float_to_complex_dtype from hysop.tools.types import check_instance, first_not_None -from hysop.numerics.fft.numpy_fft import NumpyFFT -from hysop.numerics.fft.scipy_fft import ScipyFFT -from hysop.numerics.fft.fftw_fft import FftwFFT +from hysop.numerics.fft.fft import mk_shape +from hysop.numerics.fft.numpy_fft import NumpyFFT +from hysop.numerics.fft.scipy_fft import ScipyFFT +from hysop.numerics.fft.fftw_fft import FftwFFT +from hysop.numerics.fft.gpyfft_fft import GpyFFT class TestFFT(object): implementations = { - 'numpy': NumpyFFT(), - 'scipy': ScipyFFT(), - 'fftw': FftwFFT(warn_on_allocation=False) + Implementation.PYTHON: { + 'numpy': NumpyFFT(warn_on_allocation=False), + 'scipy': ScipyFFT(warn_on_allocation=False), + 'fftw': FftwFFT(warn_on_allocation=False, + warn_on_misalignment=False) + }, + Implementation.OPENCL: {} } + + for (i,cl_env) in enumerate(iter_clenv()): + print '> Registering opencl backend {} as:\n{}'.format( + i, cl_env) + name = 'clfft{}'.format(i) + implementations[Implementation.OPENCL][name] = \ + GpyFFT(cl_env=cl_env, + warn_on_allocation=False, + warn_on_unaligned_output_offset=False) msg_shape = 'Expected output array shape to be {} but got {} for implementation {}.' msg_dtype = 'Expected output array dtype to be {} but got {} for implementation {}.' @@ -41,241 +56,233 @@ class TestFFT(object): print '::Testing 1D transform, precision {}::'.format(dtype.__name__) eps = np.finfo(dtype).eps ctype = float_to_complex_dtype(dtype) - - print '\n FORWARD C2C: complex to complex forward transform' - for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): - print ' FFT shape={:9s} '.format(str(shape)+':'), - Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) - results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype in impl.supported_ftypes: - plan = impl.fft(a=H) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == ctype, self.msg_dtype.format(ctype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = refout / N # forward normalization - self.check_distances(results, eps, self.report_eps, 'forward C2C', failures) - - print '\n BACKWARD C2C: complex to complex backward transform' - for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): - print ' IFFT shape={:9s} '.format(str(shape)+':'), - Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) - results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype in impl.supported_ftypes: - plan = impl.ifft(a=H) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == ctype, self.msg_dtype.format(ctype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = refout - self.check_distances(results, eps, self.report_eps, 'backward C2C', failures) + if False: + print '\n FORWARD C2C: complex to complex forward transform' + for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): + print ' FFT shape={:9s} '.format(str(shape)+':'), + Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape) + results = {} + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + Bin = impl.backend.empty(shape=shape, dtype=ctype) + plan = impl.fft(a=Bin).setup() + Bout = plan.output_array + assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, + name) + assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.array_equal(H1, H2), self.msg_output_modified.format(name) + results[name] = H1 / N # forward normalization + self.check_distances(results, eps, self.report_eps, 'forward C2C', failures) + + print '\n BACKWARD C2C: complex to complex backward transform' + for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): + print ' IFFT shape={:9s} '.format(str(shape)+':'), + Href = np.random.rand(2*N).astype(dtype).view(dtype=ctype).reshape(shape) + results = {} + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + Bin = impl.backend.empty(shape=shape, dtype=ctype) + plan = impl.ifft(a=Bin).setup() + Bout = plan.output_array + assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, + name) + assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.array_equal(H1, H2), self.msg_output_modified.format(name) + results[name] = H1 / N # forward normalization + self.check_distances(results, eps, self.report_eps, 'backward C2C', failures) + print '\n FORWARD R2C: real to hermitian complex transform' for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): print ' RFFT shape={:9s} '.format(str(shape)+':'), Href = np.random.rand(N).astype(dtype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype in impl.supported_ftypes: - plan = impl.rfft(a=H) - H[...] = Href - out = plan.execute() - assert out.shape == cshape, self.msg_shape.format(cshape, out.shape, name) - assert out.dtype == ctype, self.msg_dtype.format(ctype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = refout / N #forward normalization + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + Bin = impl.backend.empty(shape=shape, dtype=dtype) + plan = impl.rfft(a=Bin).setup() + Bout = plan.output_array + assert Bout.shape == cshape, self.msg_shape.format(cshape, Bout.shape, + name) + assert Bout.dtype == ctype, self.msg_dtype.format(ctype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.array_equal(H1, H2), self.msg_output_modified.format(name) + results[name] = H1 / N # forward normalization self.check_distances(results, eps, self.report_eps, 'R2C', failures) print '\n BACKWARD C2R: real to hermitian complex transform' for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): - print ' IRFFT shape={:9s} '.format(str(rshape)+':'), + print ' IRFFT shape={:9s} '.format(str(shape)+':'), Href = np.random.rand(2*Nc).astype(dtype).view(dtype=ctype).reshape(cshape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype in impl.supported_ftypes: - plan = impl.irfft(a=H) - H[...] = Href - out = plan.execute() - assert out.shape == rshape, self.msg_shape.format(rshape, out.shape, name) - assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = refout + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + Bin = impl.backend.empty(shape=cshape, dtype=ctype) + plan = impl.irfft(a=Bin).setup() + Bout = plan.output_array + assert Bout.shape == rshape, self.msg_shape.format(rshape, Bout.shape, + name) + assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.array_equal(H1, H2), self.msg_output_modified.format(name) + results[name] = H1 self.check_distances(results, eps, self.report_eps, 'normal C2R', failures) - print '\n BACKWARD FORCED C2R: real to hermitian complex transform with specified shape' + print ('\n BACKWARD FORCED C2R: real to hermitian complex transform with specified ' + +'shape') for (shape, cshape, rshape, N, Nc, Nr) in self.iter_shapes(): print ' IRFFT shape={:9s} '.format(str(shape)+':'), Href = np.random.rand(2*Nc).astype(dtype).view(dtype=ctype).reshape(cshape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype in impl.supported_ftypes: - plan = impl.irfft(a=H, n=shape[-1]) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = refout + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + Bin = impl.backend.empty(shape=cshape, dtype=ctype) + plan = impl.irfft(a=Bin, n=shape[-1]).setup() + Bout = plan.output_array + assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, + name) + assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.array_equal(H1, H2), self.msg_output_modified.format(name) + results[name] = H1 self.check_distances(results, eps, self.report_eps, 'forced C2R', failures) - + types = ['I ','II ','III','IV '] for (itype,stype) in enumerate(types, 1): print '\n DCT-{}: real to real discrete cosine transform {}'.format( stype.strip(), itype) for (shape, _, _, N, _, _) in self.iter_shapes(): print ' DCT-{} shape={:9s} '.format(stype, str(shape)+':'), + if (itype==1): # real size is 2*(N-1) + N += 1 + shape = mk_shape(shape, -1, shape[-1] + 1) Href = np.random.rand(N).astype(dtype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype not in impl.supported_ftypes: - continue - if itype not in impl.supported_cosine_transforms: - continue - plan = impl.dct(a=H, type=itype) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = out / N # forward normalization + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + if itype not in impl.supported_cosine_transforms: + continue + + Bin = impl.backend.empty(shape=shape, dtype=dtype) + plan = impl.dct(a=Bin, type=itype).setup() + Bout = plan.output_array + assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, + name) + assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.allclose(H1, H2, atol=eps), \ + self.msg_output_modified.format(name) + results[name] = H1 / N self.check_distances(results, eps, self.report_eps, 'DCT-{}'.format(stype), failures) - - for (itype,stype) in enumerate(types, 1): - print '\n DST-{}: real to real discrete sine transform {}'.format( - stype.strip(), itype) - for (shape, _, _, N, _, _) in self.iter_shapes(): - print ' DST-{} shape={:9s} '.format(stype, str(shape)+':'), - Href = np.random.rand(N).astype(dtype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) - results = {} - for (name, impl) in self.implementations.iteritems(): - if dtype not in impl.supported_ftypes: - continue - if itype not in impl.supported_sine_transforms: - continue - plan = impl.dst(a=H, type=itype) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = out / N # forward normalization - self.check_distances(results, eps, self.report_eps, - 'DST-{}'.format(stype), failures) for (itype,stype) in enumerate(types, 1): + iitype = [1,3,2,4][itype-1] print '\n IDCT-{}: real to real discrete cosine transform {}'.format( stype.strip(), itype) for (shape, _, _, N, _, _) in self.iter_shapes(): print ' IDCT-{} shape={:9s} '.format(stype, str(shape)+':'), + if (iitype==1): # real size is 2*(N-1) + N += 1 + shape = mk_shape(shape, -1, shape[-1] + 1) Href = np.random.rand(N).astype(dtype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) results = {} - for (name, impl) in self.implementations.iteritems(): - if itype in (1,4): - iitype = itype - elif itype == 2: - iitype = 3 - else: - iitype = 2 - if dtype not in impl.supported_ftypes: - continue - if iitype not in impl.supported_cosine_transforms: - continue - plan = impl.idct(a=H, type=itype) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = out + for (kind, implementations) in self.implementations.iteritems(): + for (name, impl) in implementations.iteritems(): + if dtype not in impl.supported_ftypes: + continue + if iitype not in impl.supported_cosine_transforms: + continue + Bin = impl.backend.empty(shape=shape, dtype=dtype) + plan = impl.idct(a=Bin, type=itype).setup() + Bout = plan.output_array + assert Bout.shape == shape, self.msg_shape.format(shape, Bout.shape, + name) + assert Bout.dtype == dtype, self.msg_dtype.format(dtype, Bout.dtype, + name) + Bin[...] = Href + plan.execute() + H0 = Bin.get() + H1 = Bout.get() + assert np.array_equal(Href, H0), self.msg_input_modified.format(name) + Bout[...] = 0 + evt = plan.execute() + assert plan.output_array is Bout + H2 = Bout.get() + assert np.allclose(H1, H2, atol=eps), \ + self.msg_output_modified.format(name) + results[name] = H1 self.check_distances(results, eps, self.report_eps, 'IDCT-{}'.format(stype), failures) - - for (itype,stype) in enumerate(types, 1): - print '\n IDST-{}: real to real discrete sine transform {}'.format( - stype.strip(), itype) - for (shape, _, _, N, _, _) in self.iter_shapes(): - print ' IDST-{} shape={:9s} '.format(stype, str(shape)+':'), - Href = np.random.rand(N).astype(dtype).reshape(shape) - H = np.empty_like(Href) - H, Href = self.simd_align(H, Href) - results = {} - for (name, impl) in self.implementations.iteritems(): - if itype in (1,4): - iitype = itype - elif itype == 2: - iitype = 3 - else: - iitype = 2 - if dtype not in impl.supported_ftypes: - continue - if iitype not in impl.supported_sine_transforms: - continue - plan = impl.idst(a=H, type=itype) - H[...] = Href - out = plan.execute() - assert out.shape == shape, self.msg_shape.format(shape, out.shape, name) - assert out.dtype == dtype, self.msg_dtype.format(dtype, out.dtype, name) - assert np.array_equal(Href, H), self.msg_input_modified.format(name) - refout = out.copy() - out[...] = 0 - out = plan.execute() - assert np.array_equal(out, refout), self.msg_output_modified.format(name) - results[name] = out - self.check_distances(results, eps, self.report_eps, - 'IDST-{}'.format(stype), failures) + def check_distances(self, results, eps, report_eps, tag, failures): if len(results.keys())==0: @@ -319,7 +326,7 @@ class TestFFT(object): ss=() for (name, Einf) in distances.iteritems(): Eeps = int(np.round(Einf/eps)) - failed |= (Eeps > self.report_eps) + failed |= (Eeps >= self.fail_eps) s='{}={}eps'.format(name,Eeps) ss += (s,) print ', '.join(ss) @@ -378,12 +385,7 @@ class TestFFT(object): H0, H1, Href = self.simd_align(H0, H1, Href) results = {} for (name, impl) in self.implementations.iteritems(): - if itype in (1,4): - iitype = itype - elif itype == 2: - iitype = 3 - else: - iitype = 2 + iitype = [1,3,2,4][itype-1] if dtype not in impl.supported_ftypes: continue if itype not in impl.supported_cosine_transforms: @@ -410,12 +412,7 @@ class TestFFT(object): H0, H1, Href = self.simd_align(H0, H1, Href) results = {} for (name, impl) in self.implementations.iteritems(): - if itype in (1,4): - iitype = itype - elif itype == 2: - iitype = 3 - else: - iitype = 2 + iitype = [1,3,2,4][itype-1] if dtype not in impl.supported_ftypes: continue if itype not in impl.supported_sine_transforms: @@ -431,15 +428,17 @@ class TestFFT(object): check_distances(results) def iter_shapes(self): - # minj=(5,1) - # maxj=(15,11) - minj=(2,2) - maxj=(5,5) + minj=(1,1) + maxj=(6,6) + # maxj=(12,8) + blacklists = ((), (7,)) msg = ('EVEN', 'ODD') for i in xrange(2): base = 2+i print ' '+msg[i] for j1 in xrange(minj[i],maxj[i]): + if j1 in blacklists[i]: + continue shape = (base**j1,) cshape = list(shape) cshape[-1] = cshape[-1]//2 + 1 @@ -530,11 +529,11 @@ class TestFFT(object): def perform_tests(self): # not testing np.longdouble because only one implementation supports it - # ie. we cannot compare results between different + # ie. we cannot compare results between different implementations failures = {} - for dtype in (np.float32, np.float64,): - self._test_forward_backward_1d(dtype=dtype) - # self._test_1d(dtype=dtype, failures=failures.setdefault(dtype.__name__, {})) + for dtype in (np.float64,): + #self._test_forward_backward_1d(dtype=dtype) + self._test_1d(dtype=dtype, failures=failures.setdefault(dtype.__name__, {})) self.report_failures(failures) if __name__ == '__main__': diff --git a/hysop/tools/debug_dumper.py b/hysop/tools/debug_dumper.py index 8e5005fafc7b229fec30e681ff07078b9211801a..90dc4eb8878fd223f12fa1cbc08ee6a5c2669d86 100644 --- a/hysop/tools/debug_dumper.py +++ b/hysop/tools/debug_dumper.py @@ -5,9 +5,9 @@ import numpy as np from hysop.tools.types import check_instance from hysop.fields.discrete_field import DiscreteFieldView - class DebugDumper(object): - def __init__(self, name, path='/tmp/hysop/debug', force_overwrite=False): + def __init__(self, name, path='/tmp/hysop/debug', force_overwrite=False, + enable_on_op_apply=False): directory = path+'/'+name blobs_directory = directory + '/data' @@ -28,6 +28,7 @@ class DebugDumper(object): self.blobs_directory = blobs_directory self.runfile = runfile self.dump_id = 0 + self.enable_on_op_apply = enable_on_op_apply self.print_header() diff --git a/hysop/tools/numerics.py b/hysop/tools/numerics.py index d0f9c9bb2893b3bde459a286a1f85a558efb6ca6..877c39232ab7256d200faf415b9e17f799b6a880 100644 --- a/hysop/tools/numerics.py +++ b/hysop/tools/numerics.py @@ -74,7 +74,7 @@ def default_invalid_value(dtype): elif is_signed(dtype): return 0 else: - raise NotImplementedError() + raise NotImplementedError # promote_dtype def match_dtype(x, dtype):