diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100755
new mode 100644
index c56fe833eda8e7606a270231bfd898d8a3fabbf1..766202db64021d550b87f1bd93d8d1bd4575a89a
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,7 +175,8 @@ endif()
 if(WITH_FFTW)
   #set(FIND_FFTW_VERBOSE VERBOSE_MODE)
   set(FIND_FFTW_VERBOSE OFF)
-  set(FIND_FFTW_DEBUG OFF)
+  set(FIND_FFTW_DEBUG ON)
+  set(FIND_FFTW_SHARED_ONLY ON)
   compile_with(FFTW
     REQUIRED COMPONENTS Fftw3f Fftw3d Fftw3f-mpi Fftw3d-mpi
diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake
index 15bca50d3bd1ed1c893974f91bc28dd0da96065f..c6a42e4fb70b5e575bf5f002576a51f1d0f74fcc 100644
--- a/cmake/FindFFTW.cmake
+++ b/cmake/FindFFTW.cmake
@@ -16,6 +16,7 @@
 # == Full example ==
 # set(FIND_FFTW_VERBOSE ON)       (optional)
 # set(FIND_FFTW_DEBUG OFF)        (optional)
+# set(FIND_FFTW_SHARED_ONLY ON)   (optional)
 # find_package(FFTW
 #     REQUIRED COMPONENTS Fftw3f Fftw3d
 #     OPTIONAL_COMPONENTS Fftw3l Fftw3q
@@ -55,6 +56,10 @@
-# == Checking against a specific version or the library ==
+# == Checking against a specific version of the library ==
 # Not supported yet.
 #
+# == Looking for shared or static libraries only ==
+# Set ${FIND_FFTW_SHARED_ONLY} to ON before the find_package call.
+# Set ${FIND_FFTW_STATIC_ONLY} to ON before the find_package call.
+#
 # == Debug and verbose mode ==
 # Set ${FIND_FFTW_VERBOSE} to ON before find_package call to enable verbose mode
 # Set ${FIND_FFTW_DEBUG} to ON before find_package call to enable debug mode
@@ -98,6 +103,16 @@ foreach(fftw_comp ${FFTW_FIND_COMPONENTS})
   # -- find library name given the component name
   string(REPLACE "fftw3d" "fftw3" library "${component}")
 
+  # -- rename library to match shared or static constraints
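+  # note: the hardcoded "lib*.so" / "lib*.a" names below assume a Linux-like
+  # platform; a portable variant would rely on CMAKE_SHARED_LIBRARY_SUFFIX /
+  # CMAKE_STATIC_LIBRARY_SUFFIX (or CMAKE_FIND_LIBRARY_SUFFIXES) instead.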
+  if(FIND_FFTW_SHARED_ONLY)
+    set(library "lib${library}.so")
+  elseif(FIND_FFTW_STATIC_ONLY)
+    set(library "lib${library}.a")
+  endif()
+
   if(FIND_FFTW_DEBUG)
     message("\tFFTW::${fftw_comp}:${COMPONENT}:${component}, LIB=${library} HEADER=${header}")
   endif()
diff --git a/examples/taylor_green/taylor_green_monoscale.py b/examples/taylor_green/taylor_green_monoscale.py
index d599b58deef39647775e230e82295e4627731985..d3ec5259ecb2084d6a69f99b9208cc0ad3933ed1 100644
--- a/examples/taylor_green/taylor_green_monoscale.py
+++ b/examples/taylor_green/taylor_green_monoscale.py
@@ -30,11 +30,11 @@ def init_vorticity(res, x, y, z, t):
     return res
 
 def do_solve(npts, viscosity=1./1600., lcfl=0.125, cfl=0.50):
-    autotuner_config = AutotunerConfig(autotuner_flag=AutotunerFlags.EXHAUSTIVE,
-            prune_threshold=1.2, override_cache=False, verbose=2)
+    autotuner_config = AutotunerConfig(autotuner_flag=AutotunerFlags.ESTIMATE,
+            prune_threshold=1.2, override_cache=True, verbose=1)
     kernel_config = KernelConfig(autotuner_config=autotuner_config)
 
-    dim = 1
+    dim = 3
 
     box = Box(length=(2*pi,)*dim)
 
@@ -66,10 +66,10 @@ def do_solve(npts, viscosity=1./1600., lcfl=0.125, cfl=0.50):
     #                 method = {SpaceDiscretization: SpaceDiscretization.FDC4}
     #             )
 
-    # poisson = Poisson(name='poisson', velocity=velo, vorticity=vorti,
-    #                     variables={velo:d3d, vorti: d3d})
-    # diffusion = Diffusion(name='diffusion', input_field=vorti, viscosity=viscosity,
-    #                     variables={vorti: d3d})
+    poisson = Poisson(name='poisson', velocity=velo, vorticity=vorti,
+                variables={velo:d3d, vorti: d3d})
+    diffusion = Diffusion(name='diffusion', input_field=vorti, viscosity=viscosity,
+                variables={vorti: d3d})
 
     splitting = StrangSplitting(splitting_dim=dim,
                      order=StrangOrder.STRANG_SECOND_ORDER,
                      method={KernelConfig: kernel_config})
@@ -78,16 +78,16 @@ def do_solve(npts, viscosity=1./1600., lcfl=0.125, cfl=0.50):
 
     problem = Problem()
     problem.insert(splitting)
-    # problem.insert(diffusion)
-    # problem.insert(poisson)
+    problem.insert(diffusion)
+    problem.insert(poisson)
     problem.build()
-    # problem.display()
+    problem.display()
 
     problem.solve(simu)
     problem.finalize()
 
 if __name__=='__main__':
-    N = 128*128 + 1
+    N = 128 + 1
     do_solve(N)
diff --git a/hysop/backend/device/codegen/base/function_codegen.py b/hysop/backend/device/codegen/base/function_codegen.py
index d9ec1f53718e25747b6059ca0ef7d5b94ca81f6e..cb8c0654dcc7baa32f51856d1e95e619f9d52aaa 100644
--- a/hysop/backend/device/codegen/base/function_codegen.py
+++ b/hysop/backend/device/codegen/base/function_codegen.py
@@ -29,13 +29,14 @@ class FunctionBase(object):
                     pass
                 else:
                     varval.const = True
-                if symbolic_mode is not None:
+                if symbolic_mode:
                     varval.force_symbolic(symbolic_mode)
                 fargs[varname]=varval
             else:
                 fargs[varname].set_value(varval)
                 fargs[varname].const = True
-                fargs[varname].force_symbolic(symbolic_mode)
+                if symbolic_mode:
+                    fargs[varname].force_symbolic(symbolic_mode)
         fargs.lock()
 
         self.fname = fname
diff --git a/hysop/backend/device/codegen/base/kernel_codegen.py b/hysop/backend/device/codegen/base/kernel_codegen.py
index 41224982a6a1022d4413c24f6b9ae11a5ebfd3df..936a1591d5bec7e5cbdcc5cb789b37ce0787ad2d 100644
--- a/hysop/backend/device/codegen/base/kernel_codegen.py
+++ b/hysop/backend/device/codegen/base/kernel_codegen.py
@@ -1,4 +1,4 @@
- 
+
 from contextlib import contextmanager
 
 from hysop.tools.types import check_instance
diff --git a/hysop/backend/device/codegen/base/utils.py b/hysop/backend/device/codegen/base/utils.py
index 50c08aa80daa0407294a8ea24d42a27d5fae2276..b69bbc9722196a0434475964c8293cd1fad23059 100644
--- a/hysop/backend/device/codegen/base/utils.py
+++ b/hysop/backend/device/codegen/base/utils.py
@@ -71,6 +71,7 @@ class ArgDict(WriteOnceDict):
         i=0
         for varname in self.arg_order:
             var = self[varname]
+            print var.name, var.symbolic_mode, var.is_symbolic()
             if var.symbolic_mode and var.known():
                 constant_args.append(var)
             elif var.is_symbolic():
diff --git a/hysop/backend/device/codegen/base/variables.py b/hysop/backend/device/codegen/base/variables.py
index 943a08fab9253ea63499512c5dd2c13d46a567af..b8bd145562f9d795fbe9159dade51d465893bd58 100644
--- a/hysop/backend/device/codegen/base/variables.py
+++ b/hysop/backend/device/codegen/base/variables.py
@@ -50,8 +50,10 @@ def dtype_to_ctype(dtype):
     return _dtype_to_ctype[dtype]
 def ctype_to_dtype(ctype):
     if ctype not in _ctype_to_dtype.keys():
-        msg='Unknown ctype {}.'.format(ctype)
-        raise ValueError(msg)
+        # msg='Unknown ctype {}.'.format(ctype)
+        # raise ValueError(msg)
+        from hysop.backend.device.opencl.opencl_types import np_dtype
+        return np_dtype(ctype)
     else:
         return _ctype_to_dtype[ctype]
 def register_ctype_dtype(ctype,dtype):
@@ -89,6 +91,7 @@ class CodegenVariable(object):
         self.volatile        = volatile
         self.add_impl_const  = add_impl_const
         self.storage         = storage
+        self.declared        = False
 
         self.nl = nl if (nl is not None) else (storage is not None)
         self.struct_var = struct_var
@@ -134,7 +137,10 @@ class CodegenVariable(object):
                     dtype = self.typegen.floatn(1)
                 else:
                     dtype = ctype_to_dtype(ctype)
-                value = dtype(value)
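+                # note: ctype_to_dtype may now return a plain np.dtype instance
+                # (e.g. for OpenCL vector types), which is not callable the way
+                # np.float32 is, so cast through a one-element array instead.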
+                value = np.asarray([value],dtype=dtype)[0]
             elif value.__class__ in [list,tuple]:
                 dtype = ctype_to_dtype(ctype)
                 try:
@@ -152,7 +158,7 @@ class CodegenVariable(object):
         self.value  = value
         self.svalue = svalue
-        self.init=init
+        self.init = init
 
         # check
         if add_impl_const:
@@ -382,6 +388,7 @@ class CodegenVariable(object):
             code+=';'
 
         self.force_symbolic()
+        self.declared = True
 
         if not align:
             code = code.replace('$','')
@@ -561,9 +568,9 @@ class CodegenArray(CodegenVariable):
         self.sshape = sshape
 
     def decl_name(self):
-        if self.shape:
+        if self.shape is not None:
             static_array = ['[{}]'.format(val) for val in self.shape]
-        elif self.shape:
+        elif self.sshape is not None:
             static_array = ['[{}]'.format(val) for val in self.sshape]
         else:
             static_array = []
@@ -673,7 +680,7 @@ class CodegenVectorClBuiltin(CodegenVector):
         if (value is not None):
             dtype  = ctype_to_dtype(btype)
             value  = np.asarray(value,dtype)
-            svalue = [typegen.dump(dtype(f)) for f in value]
+            svalue = [typegen.dump(np.asarray([f],dtype=dtype)[0]) for f in value]
         super(CodegenVectorClBuiltin,self).__init__(name=name,ctype=ctype,dim=dim,typegen=typegen,
                 value=value,svalue=svalue, const=const, add_impl_const=add_impl_const,
@@ -741,16 +748,18 @@ class CodegenVectorClBuiltin(CodegenVector):
         dim = self.dim
         if isinstance(key,slice) :
             ids = range(*key.indices(dim))
+            if self.declared and key.indices(dim)==(0,dim,1):
+                return self.name
+            else:
+                return self.__getitem__(ids)
+        elif isinstance(key, (tuple,list)):
             if self.is_symbolic():
-                if key.indices(dim)==(0,dim,1):
-                    return self.name
-                else:
-                    mode = self.access_mode
-                    access = self.access_prefix(True) + ('s' if mode.lower()=='hex' else '')
-                    access += ''.join([self.typegen.vtype_component_adressing(i,mode) for i in ids])
+                mode = self.access_mode
+                access = self.access_prefix(True) + ('s' if mode.lower()=='hex' else '')
+                access += ''.join([self.typegen.vtype_component_adressing(i,mode) for i in key])
             else:
-                ctype = self.btype + (str(len(ids)) if len(ids)!=1 else '')
-                value = [self.svalue[i] for i in ids]
+                ctype = self.btype + (str(len(key)) if len(key)!=1 else '')
+                value = [self.svalue[i] for i in key]
                 return '({})({})'.format(ctype, ','.join(value))
             return access
         elif isinstance(key, int) :
@@ -763,6 +772,7 @@ class CodegenVectorClBuiltin(CodegenVector):
             raise TypeError, 'Invalid key type!'
 
     def declare(self, codegen=None, init=None, **kargs):
+        init = self.init if (init is None) else init
         if isinstance(init,int):
             init = ','.join([self.typegen.dump(init) for _ in xrange(self.dim)])
             init = '({})({})'.format(self.ctype,init)
diff --git a/hysop/backend/device/codegen/kernels/tests/test_directional_transpose.py b/hysop/backend/device/codegen/kernels/tests/test_directional_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..643e4a16fb610b4172b42b41c902a1ddb3e8d158
--- /dev/null
+++ b/hysop/backend/device/codegen/kernels/tests/test_directional_transpose.py
@@ -0,0 +1,357 @@
+
+import copy, math
+
+from hysop import __ENABLE_LONG_TESTS__
+from hysop.deps import np
+from hysop.backend.device.opencl import cl, clTools
+from hysop.tools.types import check_instance
+from hysop.backend.device.codegen.base.test import _test_typegen
+# from hysop.backend.device.codegen.kernels.transpose import TransposeKernel
+
+class TestDirectionalAdvection(object):
+
+    @classmethod
+    def setup_class(cls,
+            enable_extra_tests=__ENABLE_LONG_TESTS__,
+            enable_error_plots=False):
+
+        typegen = _test_typegen('double','dec')
+        queue = cl.CommandQueue(typegen.context)
+        ctx = typegen.context
+
+        # Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size)
+        # Lx = min(Lx, grid_size[0])
+
+        cls.enable_extra_tests = enable_extra_tests
+        cls.enable_error_plots = enable_error_plots
+
+        cls.size_min = 8
+        cls.size_max = 64
+
+        cls.queue = queue
+        cls.ctx   = ctx
+
+        # cls.local_work_size = np.asarray([Lx,1,1])
+        # cls.work_load = np.asarray([1,1,1])
+
+
+    def _alloc_cpu_gpu(self, dtype, dim):
+
+        ctx   = self.ctx
+        queue = self.queue
+
+        grid_size = np.rint(self.size_min + np.random.rand(dim) * (self.size_max - self.size_min))\
+                        .astype(np.int32)
+
+        igrid_ghosts = 1 + np.rint(dim*np.random.rand(dim)).astype(np.int32)
+        ogrid_ghosts = 1 + np.rint(dim*np.random.rand(dim)).astype(np.int32)
+
+        igrid_size = grid_size + 2*igrid_ghosts
+        ogrid_size = grid_size + 2*ogrid_ghosts
+
+        grid_shape  = grid_size[::-1]
+        igrid_shape = igrid_size[::-1]
+        ogrid_shape = ogrid_size[::-1]
+
+        grid_bytes  = np.prod(grid_size)  * np.dtype(dtype).itemsize
+        igrid_bytes = np.prod(igrid_size) * np.dtype(dtype).itemsize
+        ogrid_bytes = np.prod(ogrid_size) * np.dtype(dtype).itemsize
+
+        grid_view  = [ slice(0,               grid_size[i] + 0              ) for i in xrange(dim-1,-1,-1) ]
+        igrid_view = [ slice(igrid_ghosts[i], grid_size[i] + igrid_ghosts[i]) for i in xrange(dim-1,-1,-1) ]
+        ogrid_view = [ slice(ogrid_ghosts[i], grid_size[i] + ogrid_ghosts[i]) for i in xrange(dim-1,-1,-1) ]
+
+        print '::Alloc:: dtype={} dim={}'.format(dtype, dim)
+        print '  *INPUT:  base={} ghosts={} size={}'.format(grid_size, igrid_ghosts, igrid_size)
+        print '  *OUTPUT: base={} ghosts={} size={}'.format(grid_size, ogrid_ghosts, ogrid_size)
+        print '  allocating...'
+
+        mf = cl.mem_flags
+
+        host_buffers_init = {
+                'no_ghosts': {
+                    'Tin':  np.random.rand(*grid_shape).astype(dtype),
+                    'Tout': -1 * np.ones(shape=grid_shape, dtype=dtype),
+                    'dbg0': -1 * np.ones(shape=grid_shape, dtype=np.int32),
+                    'dbg1': -1 * np.ones(shape=grid_shape, dtype=np.int32),
+                },
+                'with_ghosts': {
+                    'Tin':  np.random.rand(*igrid_shape).astype(dtype),
+                    'Tout': -1 * np.ones(shape=ogrid_shape, dtype=dtype),
+                    'dbg0': -1 * np.ones(shape=grid_shape, dtype=np.int32),
+                    'dbg1': -1 * np.ones(shape=grid_shape, dtype=np.int32),
+                }
+        }
+
+        device_buffers = {
+                'no_ghosts': {
+                    'Tin':  cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['Tin']),
+                    'Tout': cl.Buffer(ctx, flags=mf.WRITE_ONLY | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['Tout']),
+                    'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['dbg0']),
+                    'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['no_ghosts']['dbg1']),
+                },
+                'with_ghosts': {
+                    'Tin':  cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['Tin']),
+                    'Tout': cl.Buffer(ctx, flags=mf.WRITE_ONLY | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['Tout']),
+                    'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['dbg0']),
+                    'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR,
+                                hostbuf=host_buffers_init['with_ghosts']['dbg1']),
+                }
+        }
+
+        host_buffers_reference = copy.deepcopy(host_buffers_init)
+        host_buffers_gpu       = copy.deepcopy(host_buffers_init)
+
+        self.dtype = dtype
+        self.dim   = dim
+
+        self.host_buffers_init      = host_buffers_init
+        self.host_buffers_reference = host_buffers_reference
+        self.host_buffers_gpu       = host_buffers_gpu
+        self.device_buffers         = device_buffers
+
+        self.grid_size  = grid_size
+        self.igrid_size = igrid_size
+        self.ogrid_size = ogrid_size
+
+        self.grid_shape  = grid_shape
+        self.igrid_shape = igrid_shape
+        self.ogrid_shape = ogrid_shape
+
+        self.grid_bytes  = grid_bytes
+        self.igrid_bytes = igrid_bytes
+        self.ogrid_bytes = ogrid_bytes
+
+        self.grid_view  = grid_view
+        self.igrid_view = igrid_view
+        self.ogrid_view = ogrid_view
+
+        self.igrid_ghosts = igrid_ghosts
+        self.ogrid_ghosts = ogrid_ghosts
+
+    @classmethod
+    def teardown_class(cls):
+        pass
+
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        pass
+
+    def _do_transpose_cpu(self, is_inplace, axes):
+
+        out_target = 'Tin' if is_inplace else 'Tout'
+
+        for target in ['no_ghosts', 'with_ghosts']:
+            host_init_buffers      = self.host_buffers_init[target]
+            host_buffers_reference = self.host_buffers_reference[target]
+
+            if target == 'no_ghosts':
+                in_view  = self.grid_view
+                out_view = self.grid_view
+            else:
+                in_view  = self.igrid_view
+                out_view = self.igrid_view if is_inplace else self.ogrid_view
+
+            Tin  = host_init_buffers['Tin']
+            Tout = host_buffers_reference[out_target]
+
+            print axes, Tin[in_view].shape, Tout[out_view].shape
+            Tout[out_view] = np.transpose(Tin[in_view].copy(), axes=axes)
+
+    def _do_compute_gpu_and_check(self, dim, axes, is_inplace, vectorization):
+
+        msg = '\nTesting {}d {} transposition with {} axes and {} vectorization.'
+        msg = msg.format(dim,
+                'inplace' if is_inplace else 'out of place',
+                axes, vectorization)
+        print msg
+
+        return
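+        # NOTE: everything below is scaffolding copied from the directional
+        # advection kernel test; the early return above skips it until the
+        # TransposeKernel execution path is wired in.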
+
+        host_init_buffers      = self.host_buffers_init[target]
+        host_buffers_reference = self.host_buffers_reference[target]
+        host_buffers_gpu       = self.host_buffers_gpu[target]
+        device_buffers         = self.device_buffers[target]
+
+        dak = DirectionalAdvectionKernel(
+            typegen=self.typegen,
+            ftype=self.typegen.fbtype,
+            work_dim=work_dim,
+            rk_scheme=rk_scheme,
+            is_cached=cached,
+            vboundary=(boundary, boundary),
+            nparticles = nparticles,
+            min_ghosts=min_ghosts,
+            symbolic_mode=False,
+            debug_mode=True,
+            known_vars=known_vars)
+        # dak.edit()
+
+        global_work_size = dak.get_global_size(work_size,local_work_size,work_load)
+        (static_shared_bytes, dynamic_shared_bytes, total_bytes) = \
+                dak.required_workgroup_cache_size(local_work_size)
+
+        variables = ['u','pos']
+        debug = ['dbg0', 'dbg1']
+        for varname in variables:
+            kernel_args.append(device_buffers[varname])
+            kernel_args.append(np.uint64(0))
+        for varname in debug:
+            kernel_args.append(device_buffers[varname])
+        if (dynamic_shared_bytes != 0):
+            shared_buffer = cl.LocalMemory(dynamic_shared_bytes)
+            kernel_args.append(shared_buffer)
+
+        print '\tGenerating and compiling Kernel...'
+        source = dak.__str__()
+        prg = cl.Program(self.typegen.context, source)
+        prg.build(devices=[self.typegen.device])
+        kernel = prg.all_kernels()[0]
+        kernel.set_args(*kernel_args)
+
+        print '\tCPU => GPU'
+        for buf in variables+debug:
+            src = host_init_buffers[buf]
+            dst = device_buffers[buf]
+            cl.enqueue_copy(queue,dst,src)
+
+        print '\tKernel execution <<<{},{}>>>'.format(global_work_size,local_work_size)
+        evt = cl.enqueue_nd_range_kernel(queue, kernel,
+                global_work_size.tolist(), local_work_size.tolist())
+        evt.wait()
+
+        print '\tGPU => CPU'
+        for buf in variables+debug:
+            src = device_buffers[buf]
+            dst = host_buffers_gpu[buf]
+            cl.enqueue_copy(queue,dst,src)
+
+        print '\tSynchronize queue'
+        queue.flush()
+        queue.finish()
+
+        buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname])
+                        for varname in variables]
+        self._cmp_buffers(buffers,view,dak,work_dim)
+
+    def _cmp_buffers(self,buffers,view,dak,work_dim):
+        good = True
+        err_buffers = []
+
+        for (name,host,dev) in buffers:
+            (l1,l2,linf) = self._distances(host,dev,view)
+            print '\t{} -> l1={} l2={} linf={}'.format(name,l1,l2,linf)
+            if linf>1e-12:
+                err_buffers.append(name)
+                good = False
+        if not good:
+            msg = '\n[FAIL] Buffer comparison failed for buffers {}.\n'.format(err_buffers)
+            print msg
+            dak.edit()
+            if self.enable_error_plots:
+                from matplotlib import pyplot as plt
+                for (name,host,dev) in buffers:
+                    if name in err_buffers:
+
+                        host = host[view]
+                        dev  = dev[view]
+
+                        d = (dev-host)*(dev-host)
+                        d -= np.mean(d)
+
+                        if work_dim==3:
+                            fig,axes = plt.subplots(2,2)
+                            axes[0][0].imshow(np.sum(d,axis=0),interpolation='nearest')
+                            axes[0][1].imshow(np.sum(d,axis=1),interpolation='nearest')
+                            axes[1][0].imshow(np.sum(d,axis=2),interpolation='nearest')
+                            axes[1][1].imshow(np.sum(d,axis=(0,1))[np.newaxis,:],interpolation='nearest')
+                            plt.title(name)
+                            fig.show()
+                            raw_input('Press enter to close all windows.')
+                            break
+                        elif work_dim==2:
+                            fig,axe = plt.subplots()
+                            axe.imshow(d,interpolation='nearest')
+                            plt.title(name)
+                            fig.show()
+                            raw_input('Press enter to close all windows.')
+                            break
+                        else:
+                            pass
+            raise RuntimeError(msg)
+
+    def _distances(self,lhs,rhs,view):
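+        # l1: mean absolute error, l2: size-normalized euclidean error,
+        # linf: maximum absolute error; all computed on the compared view.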
+        d  = rhs[view]-lhs[view]
+        da = np.abs(d)
+
+        l1   = np.sum(da)/d.size
+        l2   = np.sqrt(np.sum(d*d))/d.size
+        linf = np.max(da)
+        return (l1,l2,linf)
+
+
+    def _test_transpose(self, dim, axes, is_inplace):
+        check_instance(axes, tuple, values=int)
+        assert dim > 1
+        assert 2 <= len(axes) <= dim
+
+        if self.enable_extra_tests:
+            vectorizations = [1,2,4,8,16]
+            dtypes = [np.float32, np.float64,
+                      np.int8,  np.int16,  np.int32,  np.int64,
+                      np.uint8, np.uint16, np.uint32, np.uint64]
+        else:
+            vectorizations = [1,2]
+            dtypes = [np.float32, np.int32, np.uint32]
+
+        for dtype in dtypes:
+            self._alloc_cpu_gpu(dtype=dtype, dim=dim)
+            self._do_transpose_cpu(axes=axes, is_inplace=is_inplace)
+            for vectorization in vectorizations:
+                self._do_compute_gpu_and_check(dim=dim, axes=axes,
+                        vectorization=vectorization, is_inplace=is_inplace)
+
+
+    def test_transpose_XY_out_of_place(self):
+        self._test_transpose(dim=2, axes=(1,0), is_inplace=False)
+        self._test_transpose(dim=3, axes=(1,0), is_inplace=False)
+
+    def test_transpose_XZ_out_of_place(self):
+        self._test_transpose(dim=3, axes=(2,1,0), is_inplace=False)
+
+    def test_transpose_YZ_out_of_place(self):
+        self._test_transpose(dim=3, axes=(0,2,1), is_inplace=False)
+
+
+    def test_transpose_XY_inplace(self):
+        self._test_transpose(dim=2, axes=(1,0), is_inplace=True)
+        self._test_transpose(dim=3, axes=(1,0), is_inplace=True)
+
+    def test_transpose_XZ_inplace(self):
+        self._test_transpose(dim=3, axes=(2,1,0), is_inplace=True)
+
+    def test_transpose_YZ_inplace(self):
+        self._test_transpose(dim=3, axes=(0,2,1), is_inplace=True)
+
+if __name__ == '__main__':
+    TestDirectionalAdvection.setup_class(enable_extra_tests=False, enable_error_plots=True)
+
+    test = TestDirectionalAdvection()
+    test.test_transpose_XY_out_of_place()
+
+    TestDirectionalAdvection.teardown_class()
+
diff --git a/hysop/backend/device/codegen/kernels/transpose.py b/hysop/backend/device/codegen/kernels/transpose.py
index 4d974f35cc616bfac4347e599656c53a3c865ae3..06e3d63b73f0abc4241458caee6fc16204259965 100644
--- a/hysop/backend/device/codegen/kernels/transpose.py
+++ b/hysop/backend/device/codegen/kernels/transpose.py
@@ -1,9 +1,12 @@
 
 import operator
 import numpy as np
+from contextlib import contextmanager, nested
 
+from hysop.tools.misc import upper_pow2
 from hysop.backend.device.codegen.base.opencl_codegen import OpenClCodeGenerator
 from hysop.backend.device.codegen.base.kernel_codegen import KernelCodeGenerator
-from hysop.backend.device.codegen.base.variables import CodegenVariable, CodegenVectorClBuiltin, CodegenArray
+from hysop.backend.device.codegen.base.variables import CodegenVariable, \
+        CodegenVectorClBuiltin, CodegenArray
 
 from hysop.backend.device.opencl.opencl_types import OpenClTypeGen
 from hysop.backend.device.codegen.base.utils import WriteOnceDict, ArgDict
@@ -12,34 +15,80 @@ from hysop.backend.device.codegen.functions.compute_index import ComputeIndexFun
 
 class TransposeKernel(KernelCodeGenerator):
 
     @staticmethod
-    def codegen_name(components,dtype,work_dim):
-        return 'transpose_{}_{}_{}d'.format(components,dtype,work_dim)
+    def codegen_name(is_inplace, components, ctype,
+            tile_dim, tile_size, tile_padding, vectorization):
+        components = [ str(j) if i!=j else 'X' for i,j in enumerate(components) ]
+        return 'transpose_{}_{}_{}d__N{}__T{}__P{}__{}'.format(
+                'inplace' if is_inplace else 'out_of_place',
+                ctype, tile_dim, vectorization, tile_size, tile_padding,
+                '_'.join(components))
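+    # e.g. (illustrative) codegen_name(False, (0,2,1), 'float', 3, 8, 1, 2)
+    #      -> 'transpose_out_of_place_float_3d__N2__T8__P1__X_2_1'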
 
-    def __init__(self, typegen, work_dim, dtype,
-            tile_size=(8,8), tile_padding=1,
+    def __init__(self, typegen, ctype, vectorization,
+            components, tile_size, tile_padding,
+            is_inplace = False,
+            known_vars = None,
             **kargs):
 
-        if len(tile_size)==1:
-            tile_size = (tile_size,tile_size,)
-        elif len(tile_size)!=2:
-            raise ValueError('Tile size should be a 2d tuple.')
-
-        name = TransposeKernel.codegen_name('xy',dtype, work_dim)
-
-        kernel_args = self.gen_kernel_arguments(typegen, dtype, work_dim)
+        components = np.asarray(components)
+        dim  = components.size
+        pdim = upper_pow2(dim)
+        assert dim <= 16, 'Maximal permutation dimension is 16.'
+        assert pdim in [1,2,4,8,16]
+        assert vectorization in [1,2,4,8,16]
+
+        # check permutation components
+        msg='Invalid permutation {} for dimension {}.'
+        msg=msg.format(components, dim)
+        assert components.size == dim, msg
+        assert (components<dim).all(), msg
+        axes = set(components.tolist())
+        if len(axes) != dim:
+            raise ValueError(msg)
+
+        permutation = (components != range(dim))
+        naxes = sum(permutation)
+
+        if (naxes == 0):
+            msg='There is nothing to transpose with given permutation {}.'
+            msg=msg.format(components)
+            raise ValueError(msg)
+        assert naxes >= 2, msg
+
+        # we need the first axis for contiguous reads and writes even if it is
+        # not part of the permutation scheme.
+        tile_dim = naxes + int(not permutation[0])
+        tile_shape = [tile_size,]*tile_dim
+        tile_shape[-1] += tile_padding
+        tile_shape = tuple(tile_shape)
+
+        name = TransposeKernel.codegen_name(is_inplace, components, ctype, tile_dim,
+                tile_size, tile_padding, vectorization)
+
+        work_dim = min(3,tile_dim)
+        kernel_args = self.gen_kernel_arguments(typegen, ctype, pdim)
 
         super(self.__class__,self).__init__(
                 name=name,
                 typegen=typegen,
                 work_dim=work_dim,
-                kernel_args=kernel_args,
-                vec_type_hint=dtype,
+                known_vars = known_vars,
+                kernel_args = kernel_args,
                 **kargs)
 
-        self.dtype        = dtype
-        self.tile_size    = tile_size
-        self.tile_padding = tile_padding
+        self.ctype         = ctype
+        self.permutation   = permutation
+        self.components    = components
+        self.dim           = dim
+        self.pdim          = pdim
+        self.tile_dim      = tile_dim
+        self.tile_size     = tile_size
+        self.tile_shape    = tile_shape
+        self.tile_padding  = tile_padding
+        self.is_inplace    = is_inplace
+        self.vectorization = vectorization
+        self.naxes         = naxes
 
         self.gencode()
@@ -58,14 +107,17 @@ class TransposeKernel(KernelCodeGenerator):
 
         return reqs
 
-    def gen_kernel_arguments(self, typegen, dtype, work_dim):
+    def gen_kernel_arguments(self, typegen, ctype, pdim):
 
         _global = OpenClCodeGenerator.default_keywords['global']
         tg = typegen
 
         kargs = ArgDict()
-        kargs['in']    = CodegenVariable(dtype=dtype,name='in' ,typegen=tg,storage=_global,ptr=True,const=True,restrict=True,nl=True)
-        kargs['out']   = CodegenVariable(dtype=dtype,name='out',typegen=tg,storage=_global,ptr=True,restrict=True,nl=True)
-        kargs['dsize'] = CodegenVectorClBuiltin(btype='int', dim=work_dim, name='S', typegen=tg, add_impl_const=True)
+        kargs['in']  = CodegenVariable(ctype=ctype, name='in',
+                typegen=tg, storage=_global, const=True, ptr=True, add_impl_const=True,
+                ptr_restrict=True, nl=True)
+        kargs['out'] = CodegenVariable(ctype=ctype, name='out',
+                typegen=tg, storage=_global, ptr=True, ptr_const=True, ptr_restrict=True, nl=True)
+        kargs['shape'] = CodegenVectorClBuiltin(btype='int', dim=pdim,
+                name='shape', typegen=tg, add_impl_const=True, symbolic_mode=True)
 
         return kargs
@@ -75,46 +127,102 @@ class TransposeKernel(KernelCodeGenerator):
 
         _local = OpenClCodeGenerator.default_keywords['local']
 
-        s = self
-        tg = s.typegen
-        work_dim = s.work_dim
-        dtype = s.dtype
-
-        global_id = s.vars['global_id']
-        local_id  = s.vars['local_id']
-
-        _in  = s.vars['in']
-        _out = s.vars['out']
-        S    = s.vars['dsize']
-
-        tile_size = CodegenVectorClBuiltin(typegen=tg,name='TILE_SIZE',btype='uint', dim=2,
-                const=True, value=self.tile_size)
-        tile_padding = CodegenVariable(typegen=tg,name='TILE_PADDING',dtype='uint',
-                const=True, value=self.tile_padding)
-        tile = CodegenArray(typegen=tg, name='tile', dtype=dtype, storage='__constant',
-                dim=2,
-                size=(tile_size[0](), '{}+{}'.format(tile_size[1](), tile_padding()),)
-            )
-
+        s  = self
+        tg = s.typegen
+        work_dim = s.work_dim
+        symbolic_mode = s.symbolic_mode
+
+        dim  = s.dim
+        pdim = s.pdim
+
+        ctype         = s.ctype
+        permutation   = s.permutation
+        components    = s.components
+        tile_dim      = s.tile_dim
+        tile_size     = s.tile_size
+        tile_shape    = s.tile_shape
+        tile_padding  = s.tile_padding
+        is_inplace    = s.is_inplace
+        vectorization = s.vectorization
+        naxes         = s.naxes
+
+        local_id   = s.vars['local_id']
+        local_size = s.vars['local_size']
+        group_id   = s.vars['group_id']
+        group_size = s.vars['num_groups']
+
+        _in  = s.vars['in']
+        _out = s.vars['out']
+        S    = s.vars['shape']
+
+        tile_size = CodegenVariable(typegen=tg, name='tile_size', ctype='int',
+                const=True, value=self.tile_size, symbolic_mode=symbolic_mode)
+        tile_padding = CodegenVariable(typegen=tg, name='tile_padding', ctype='int',
+                const=True, value=self.tile_padding, symbolic_mode=symbolic_mode)
+        tile_sshape = [tile_size]*tile_dim
+        tile_sshape[-1] = '{}+{}'.format(tile_size.name, tile_padding.name)
+        tile = CodegenArray(typegen=tg, name='tile', ctype=ctype, storage='__local',
+                dim=tile_dim, shape=tile_sshape)
+
+        is_tile_index = tuple( (i==0 or permutation[i]) for i in xrange(dim) )
+        is_tile_index += (False,)*(pdim-dim)
+        is_tile_index_var = CodegenVectorClBuiltin('is_tile_index', 'int', pdim, tg, const=True,
+                init=np.asarray(is_tile_index, dtype=np.int32))
+
+        ntiles = CodegenVectorClBuiltin('ntiles', 'int', pdim, tg, const=True,
+                init='{} * (({}+{}-1)/{})'.format(is_tile_index_var, S, tile_size, tile_size))
+
+        idx = CodegenVectorClBuiltin('idx', 'int', pdim, tg, const=True)
+
+        @contextmanager
+        def _work_iterate_(i):
+            try:
+                if is_tile_index[i]:
+                    loop = '{i}=0; {i}<{N}; {i}++'.format(i=idx[i], N=S[i])
+                else:
+                    loop = '{i}=0; {i}<{N}; {i}++'.format(i=idx[i], N=S[i])
+                with s._for_(loop) as ctx:
+                    yield ctx
+            except:
+                raise
+
+        nested_loops = [ _work_iterate_(i) for i,k in enumerate(permutation) if (i!=0 and not k) ][::-1]
 
         with s._kernel_():
             with s._align_() as al:
-                tile_size.define(al,align=True)
-                tile_padding.define(al,align=True)
+                tile_size.declare(al,align=True)
+                tile_padding.declare(al,align=True)
+            s.jumpline()
+
+            is_tile_index_var.declare(s)
+            ntiles.declare(s)
+            s.jumpline()
 
             with s._align_() as al:
-                global_id.define(al,align=True,const=True)
-                local_id.define(al,align=True,const=True)
-
-            #s.check_workitem_bounds(grid_size, compact=False)
-            #s.jumpline()
+                local_id.declare(al,align=True,const=True)
+                group_id.declare(al,align=True,const=True)
+                local_size.declare(al,align=True,const=True)
+                group_size.declare(al,align=True,const=True)
+
+            s.jumpline()
+            tile.declare(s)
+
+            s.jumpline()
+            s.comment('Iteration over all non-active axes')
+            idx.declare(s)
+            with nested(*nested_loops):
+                s.append(';')
 
 
 if __name__ == '__main__':
-    from hysop.backend.device.codegen.base.test import test_typegen
-    typegen = test_typegen('float')
-    ek = TransposeKernel(typegen=tg, work_dim=3, dtype='float4',
-            tile_size=(8,8,), tile_padding=1,
-            known_vars=dict())
+    from hysop.backend.device.codegen.base.test import _test_typegen
+    tg = _test_typegen('float')
+    ek = TransposeKernel(typegen=tg, ctype='float', vectorization=2,
+            components = (0,2,1,3,4),
+            tile_size=8, tile_padding=1,
+            is_inplace=False,
+            symbolic_mode=True,
+            known_vars={
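+                # 'shape' is zero-padded up to pdim = upper_pow2(5) = 8 components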
+                'shape' : (100,200,300,400,500,0,0,0)
+            })
     ek.edit()
-    #ek.test_compile()
-    print
+    ek.test_compile()
diff --git a/hysop/constants.py b/hysop/constants.py
index 1cca17bde937c3eacc9efb87a79bc85409a16323..2494bced9cc26549e1aa7570be250e1e8378ea7f 100644
--- a/hysop/constants.py
+++ b/hysop/constants.py
@@ -197,6 +197,9 @@ TranspositionState2D = transposition_states[2]
 TranspositionState3D = transposition_states[3]
 """3D memory layout (transposition state) enum"""
 
+TransposeConfiguration = EnumFactory.create('TranspositionConfiguration',
+        ['TRANSPOSE_XY', 'TRANSPOSE_XZ', 'TRANSPOSE_YZ'])
+
 default_order = None
 """
 Default HySoP memory ordering for array allocations.
diff --git a/hysop/constants.py.in b/hysop/constants.py.in
index b2a900d716f7eca19fa6f91955ca480868ea623c..dd00fa9b0677cb1efe659a1b930b2d60d2c0bc4d 100644
--- a/hysop/constants.py.in
+++ b/hysop/constants.py.in
@@ -197,6 +197,12 @@ TranspositionState2D = transposition_states[2]
 TranspositionState3D = transposition_states[3]
 """3D memory layout (transposition state) enum"""
 
+TransposeConfiguration = EnumFactory.create('TranspositionConfiguration',
+        ['TRANSPOSE_XY', 'TRANSPOSE_XZ', 'TRANSPOSE_YZ'])
+"""
+Possible transposition configurations used to transpose fields in the Transposition operator.
+"""
+
 default_order = None
 """
 Default HySoP memory ordering for array allocations.
diff --git a/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl b/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl
index b2197fbb12f4643f1922ab8674a22af9a0583174..7d646eca97e3c18eecfe7b1f0f7ec25fc43b7ca1 100644
--- a/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl
+++ b/hysop/old/gpu.old/cl_src/kernels/transpose_xz.cl
@@ -70,7 +70,7 @@ __kernel void transpose_xz(__global const float* in,
 	/* Fill the tile */
 	temp = vload__N__((index_in + i*NB_III + j*NB_III*NB_II)/__N__, in);
 	tile[lid_z + j][lid_y + i][lid_x*__N__+__NN__] = temp.s__NN__;
-      }
+    }
   }
 
   /* Synchronize work-group */
diff --git a/hysop/tools/misc.py b/hysop/tools/misc.py
index 341f3d37440bba10fb36d10eeb6df6f60e274897..a66bd6fdc572445681b0e1cceebb48494c6d3c8d 100644
--- a/hysop/tools/misc.py
+++ b/hysop/tools/misc.py
@@ -41,6 +41,26 @@ def kargs2args(func, kargs, remove=[]):
     argnames,_,_,_ = inspect.getargspec(func)
     return tuple([kargs[a] for a in argnames if a not in remove])
 
+def upper_pow2(x):
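+    """Return the smallest power of two greater than or equal to x (1 if x <= 1).
+
+    Works on scalars and on sequences (element-wise). Illustrative examples:
+    upper_pow2(5) -> 8, upper_pow2(16) -> 16, upper_pow2([3, 9]) -> array([4, 16]).
+    """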
+    def _upper_pow2(x):
+        if x<0:
+            raise RuntimeError('x<0')
+        i=0
+        k=2**i
+        while k<x:
+            i+=1
+            k*=2
+        return k
+
+    if np.isscalar(x):
+        return _upper_pow2(x)
+    else:
+        return np.asarray([_upper_pow2(_x) for _x in x])
 
 class Utils(object):
     """tools to handle array and slices.
diff --git a/setup.py.in b/setup.py.in
index 8aeb7c90af78cda61ed8c9da32599d1d41c7caa0..cbcd22b59ebcd0b9a826229b7bcb9367cef47ebd 100644
--- a/setup.py.in
+++ b/setup.py.in
@@ -242,16 +242,13 @@ def create_swig_extension(name, inc_dirs, src_dirs=None, sources=None):
 # ------------ Set list of packages required to build the module -------------
 # List of modules (directories) to be included
-with_test = "@WITH_TESTS@" is "ON"
-if with_test:
-    packages = find_packages(exclude=["*opencl*"], where="@CMAKE_SOURCE_DIR@")
-else:
-    packages = find_packages(exclude=["*tests*", "*opencl*"],
-                             where="@CMAKE_SOURCE_DIR@")
-
-if "@WITH_GPU@" is "ON":
-    packages.append('hysop.backend.device.opencl')
-    if with_test:
-        packages.append('hysop.backend.device.opencl.tests')
+with_test = "@WITH_TESTS@" == "ON"
+with_gpu  = "@WITH_GPU@" == "ON"
+exclude = []
+if not with_test:
+    exclude.append('*tests*')
+if not with_gpu:
+    exclude.append('*opencl*')
+packages = find_packages(exclude=exclude, where="@CMAKE_SOURCE_DIR@")
 
 # Enable this to get debug info
 DISTUTILS_DEBUG = 1