diff --git a/hysop/codegen/base/function_codegen.py b/hysop/codegen/base/function_codegen.py index 86bd915e068434de2f09c2b338df64d32e2587b4..8d5fb484694bb299eb2bfac0fdfcc98b21e57be5 100644 --- a/hysop/codegen/base/function_codegen.py +++ b/hysop/codegen/base/function_codegen.py @@ -74,7 +74,9 @@ class FunctionBase(object): if isinstance(var, str): sargs.append(var) - elif isinstance(var,arg.__class__): + elif isinstance(var,arg.__class__) \ + or (arg.__class__==CodegenVectorClBuiltin and arg.dim==1 \ + and isinstance(var, CodegenVariable)): sargs.append(var.name) else: msg = 'Matched variable \'{}\' with argument \'{}\' but there was a type mismatch(\'{}\' is not an instance of \'{}\')!'.format(var.name, arg.name, var.__class__, arg.__class__) diff --git a/hysop/codegen/functions/advection_rhs.py b/hysop/codegen/functions/advection_rhs.py index d238bd9732b8546e0aa30101d42a459b73339988..d2b29d4747db3966a354d8c92936d07bbae743df 100644 --- a/hysop/codegen/functions/advection_rhs.py +++ b/hysop/codegen/functions/advection_rhs.py @@ -69,16 +69,15 @@ class DirectionalAdvectionRhsFunction(OpenClFunctionCodeGenerator): args['velocity'] = CodegenVariable('velocity', ftype, typegen, const=True, add_impl_const=True, storage=storage, ptr=True, restrict=restrict, nl=True) - args['X'] = CodegenVectorClBuiltin('X', ftype, nparticles, typegen, const=True, nl=True) + args['X'] = CodegenVectorClBuiltin('X', ftype, nparticles, typegen, add_impl_const=True, nl=True) - if not is_cached: - args['line_index'] = CodegenVariable('line_index', itype, typegen, add_impl_const=True) if is_periodic and (not is_cached): - args['line_width'] = CodegenVariable('line_width', itype, typegen, add_impl_const=True,nl=True) + args['line_width'] = CodegenVariable('line_width', itype, typegen, add_impl_const=True) + args['line_offset'] = CodegenVariable('line_offset', itype, typegen, add_impl_const=True,nl=True) args['rk_step'] = CodegenVariable('rk_step', itype, typegen, add_impl_const=True) args['inv_dx'] = CodegenVariable('inv_dx', ftype, typegen, add_impl_const=True, nl=True) - args['active'] = CodegenVariable('active','bool',typegen) + args['active'] = CodegenVariable('active','bool',typegen, add_impl_const=True) cached = 'cached_' if is_cached else '' basename = 'advection_rhs_{}{}{}p'.format(cached, ftype[0], nparticles) @@ -107,9 +106,9 @@ class DirectionalAdvectionRhsFunction(OpenClFunctionCodeGenerator): velocity = s.args['velocity'] if not is_cached: - line_index = s.args['line_index'] - if is_periodic: - line_width = s.args['line_width'] + line_offset = s.args['line_offset'] + if is_periodic: + line_width = s.args['line_width'] inv_dx = s.args['inv_dx'] rk_step = s.args['rk_step'] @@ -141,28 +140,23 @@ class DirectionalAdvectionRhsFunction(OpenClFunctionCodeGenerator): s.jumpline() with s._if_('{} == 0'.format(rk_step())): - # if is_periodic: - # s.append('{idx} = ({idx}+{size})%{size};'.format(idx=lidx(),size=line_width())) if is_cached: reads = [velocity[lidx[i] ] for i in xrange(nparticles)] else: - s.append('{} = {}+{};'.format(idx(), line_index(),lidx())) + s.append('{} = {}+{};'.format(idx(), line_offset(),lidx())) reads = [velocity[idx[i] ] for i in xrange(nparticles)] init = '({})({})'.format(part_ftype,', '.join(reads)) s.append('{} = {};'.format(dX_dt(), init)) with s._else_(): - if is_periodic: + if is_periodic and not is_cached: s.append('{idx} = ({idx}+{size})%{size};'.format(idx=lidx(),size=line_width())) - if is_cached: - reads = [velocity[lidx[i] ] for i in xrange(nparticles)] - else: - s.append('{} = {}+{};'.format(idx(), line_index(),lidx())) - reads = [velocity[idx[i] ] for i in xrange(nparticles)] + s.append('{} = {}+{};'.format(idx(), line_offset(),lidx())) + reads = [velocity[idx[i] ] for i in xrange(nparticles)] init = '({})({})'.format(part_ftype,', '.join(reads)) Vl.declare(s,init=init) - if is_periodic: + if is_periodic and not is_cached: s.append('{idx} = ({idx}+1)%{size};'.format(idx=lidx(),size=line_width())) - s.append('{} = {}+{};'.format(idx(), line_index(),lidx())) + s.append('{} = {}+{};'.format(idx(), line_offset(),lidx())) else: s.append('{} += 1;'.format(idx())) Vr.declare(s,init=init) diff --git a/hysop/codegen/functions/cache_load.py b/hysop/codegen/functions/cache_load.py index d40d71ff159753f9982d2ffcbe9913948df6d1ab..c655967ecc7bab98aa07ac4836400497576fd91e 100644 --- a/hysop/codegen/functions/cache_load.py +++ b/hysop/codegen/functions/cache_load.py @@ -10,6 +10,7 @@ from hysop.codegen.base.utils import ArgDict from hysop.codegen.base.statistics import WorkStatistics from hysop.codegen.functions.compute_index import ComputeIndexFunction +from hysop.constants import BoundaryCondition class CacheLoadFunction(OpenClFunctionCodeGenerator): @@ -22,22 +23,25 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): 'data_size':'size of source data' } - def __init__(self, typegen, ftype, dim, + def __init__(self, typegen, ftype, work_dim, boundary, components=1, src_vectorize=True, dst_vectorize=True, - boundary=None, itype='int', + itype='int', with_gid_ghost_offset=True, known_args=None,force_symbolic=False): - assert dim>0 + assert work_dim>0 - if boundary not in [None, 'periodic']: - raise NotImplemented('Boundary \'{}\' not implmented yet!'.format(boundary)) + assert boundary in BoundaryCondition.entries() + boundary = BoundaryCondition.value(boundary) + sboundary = BoundaryCondition.svalue(boundary) + if boundary not in [BoundaryCondition.NONE, BoundaryCondition.PERIODIC]: + raise NotImplemented('Boundary \'{}\' not implemented yet!'.format(sboundary.lower())) tg = typegen fs = force_symbolic vtype = tg.vtype(ftype,components) - name = 'cache_load_{}d'.format(dim) - if boundary is not None: - name+='_{}'.format(boundary) + name = 'cache_load_{}d'.format(work_dim) + if boundary != BoundaryCondition.NONE: + name+='_{}'.format(sboundary.lower()) output = 'void' args = ArgDict() @@ -58,19 +62,21 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): args[dst] = CodegenVariable(dst, ftype, typegen=tg, ptr=True, const=False, storage='__local' , nl=True, restrict=True) - args['global_id'] = CodegenVectorClBuiltin('gid',itype,dim,typegen, + args['global_id'] = CodegenVectorClBuiltin('gid',itype,work_dim,typegen, add_impl_const=False) - args['local_id'] = CodegenVectorClBuiltin('lid',itype,dim,typegen, + args['local_id'] = CodegenVectorClBuiltin('lid',itype,work_dim,typegen, add_impl_const=False, nl=True) - args['local_size'] = CodegenVectorClBuiltin('L',itype,dim,tg, + args['local_size'] = CodegenVectorClBuiltin('L',itype,work_dim,tg, add_impl_const=True, symbolic_mode=fs) - args['src_size'] = CodegenVectorClBuiltin('N',itype,dim,tg, + args['src_size'] = CodegenVectorClBuiltin('N',itype,work_dim,tg, add_impl_const=True, symbolic_mode=fs, nl=True) - args['lghosts'] = CodegenVectorClBuiltin('lghosts',itype,dim,tg, + args['multiplier'] = CodegenVectorClBuiltin('multiplier',itype,work_dim,tg, add_impl_const=True, symbolic_mode=fs) - args['rghosts'] = CodegenVectorClBuiltin('rghosts',itype,dim,tg, + args['lghosts'] = CodegenVectorClBuiltin('lghosts',itype,work_dim,tg, + add_impl_const=True, symbolic_mode=fs) + args['rghosts'] = CodegenVectorClBuiltin('rghosts',itype,work_dim,tg, add_impl_const=True, symbolic_mode=fs) super(CacheLoadFunction,self).__init__(basename=name, output=output, @@ -80,10 +86,11 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): self.itype = itype self.ftype = ftype self.vtype = vtype - self.dim = dim + self.work_dim = work_dim self.components = components self.src_vectorize = src_vectorize self.dst_vectorize = dst_vectorize + self.with_gid_ghost_offset = with_gid_ghost_offset self.boundary = boundary self.declare_code() @@ -91,7 +98,7 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): def declare_code(self): s = self tg = s.typegen - dim = s.dim + work_dim = s.work_dim itype = s.itype ftype = s.ftype vtype = s.vtype @@ -99,6 +106,7 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): src_vectorize = s.src_vectorize dst_vectorize = s.dst_vectorize boundary=s.boundary + with_gid_ghost_offset = self.with_gid_ghost_offset global_id = s.args['global_id']; local_id = s.args['local_id']; @@ -108,29 +116,34 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): lghosts = s.args['lghosts']; rghosts = s.args['rghosts']; + multiplier = s.args['multiplier']; - cache_size = CodegenVectorClBuiltin('S' ,itype,dim,tg,const=True) - local_pos = CodegenVectorClBuiltin('lidx',itype,dim,tg) - global_pos = CodegenVectorClBuiltin('gidx',itype,dim,tg) + cache_size = CodegenVectorClBuiltin('S' ,itype,work_dim,tg,const=True) + local_pos = CodegenVectorClBuiltin('lidx',itype,work_dim,tg) + global_pos = CodegenVectorClBuiltin('gidx',itype,work_dim,tg) LID = CodegenVariable('LID',itype, tg) GID = CodegenVariable('GID',itype, tg) tmp = CodegenVectorClBuiltin('tmp', ftype, components, tg) - compute_index = ComputeIndexFunction(tg,dim,wrap=False) + compute_index = ComputeIndexFunction(tg,work_dim,wrap=False) s.require('compute_index',compute_index) with s._function_(): s.jumpline() - s.append(cache_size.declare(init='{}+{}+{}'.format(lghosts(),local_size(),rghosts()))) + s.append(cache_size.declare(init='{}+{}*{}+{}'.format(lghosts(),multiplier(), + local_size(),rghosts()))) s.jumpline() s.append(global_pos.declare()) s.append(local_pos.declare()) s.jumpline() - if boundary == 'periodic': - s.append('{} += {}-{};'.format(global_id(),src_size(),lghosts())) + if boundary == BoundaryCondition.PERIODIC: + if with_gid_ghost_offset: + s.append('{} += {}-{};'.format(global_id(),src_size(),lghosts())) + else: + s.append('{} += {};'.format(global_id(),src_size())) @contextmanager def _cache_iterate_(i): @@ -142,7 +155,7 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): s.append('{} = ({}+k{i}*{});'.format(local_pos[i],local_id[i], local_size[i],i=i)); s.append('if ({}>={}) continue;'.format(local_pos[i],cache_size[i])) - if boundary=='periodic': + if boundary==BoundaryCondition.PERIODIC: s.append('{} = ({}+k{i}*{})%{};'.format(global_pos[i],global_id[i], local_size[i],src_size[i],i=i)); else: @@ -152,15 +165,15 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): except: pass - s.mem_fence(read=True,_local=True) + s.barrier(_local=True) with s._block_(): - nested_loops = [_cache_iterate_(i) for i in xrange(dim-1,-1,-1)] + nested_loops = [_cache_iterate_(i) for i in xrange(work_dim-1,-1,-1)] with contextlib.nested(*nested_loops): with s._block_(): s.append(LID.declare(const=True,init=compute_index(idx=local_pos, - size=cache_size[:dim]))) + size=cache_size[:work_dim]))) s.append(GID.declare(const=True,init=compute_index(idx=global_pos, - size=src_size[:dim]))) + size=src_size[:work_dim]))) if src_vectorize: load=s.vars['src'][GID()] @@ -178,14 +191,14 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): s.append(store) - s.mem_fence(write=True,_local=True) + s.barrier(_local=True) def per_work_statistics(self): typegen = self.typegen itype = self.itype ftype = self.ftype - dim = self.dim + work_dim = self.work_dim components = self.components size = typegen.FLT_BYTES[ftype] @@ -193,8 +206,8 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): writes = reads ops = {} - ops[itype] = 2*dim - ops[itype] += 10*dim + ops[itype] = 2*work_dim + ops[itype] += 10*work_dim stats = WorkStatistics() stats.ops_per_type = ops @@ -209,12 +222,12 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator): if __name__ == '__main__': - from hysop.codegen.base.test import test_typegen + from hysop.codegen.base.test import _test_typegen - tg = test_typegen('float') - cg = OpenClCodeGenerator(name='base',typegen=tg) + tg = _test_typegen('float') + cg = OpenClCodeGenerator(name='base', typegen=tg) - cache_load0 = CacheLoadFunction(typegen=tg,ftype=tg.fbtype,dim=3, + cache_load0 = CacheLoadFunction(typegen=tg,ftype=tg.fbtype,work_dim=3, components=4, src_vectorize=False, dst_vectorize=True) print cache_load0.per_work_statistics() diff --git a/hysop/codegen/kernels/directional_advection.py b/hysop/codegen/kernels/directional_advection.py index c2f36e6fdfcd0c3b1e2ea7b8eb0ac768bd25bb73..f5a9fae29398722c39c9618998a48e2efe831a3d 100644 --- a/hysop/codegen/kernels/directional_advection.py +++ b/hysop/codegen/kernels/directional_advection.py @@ -201,6 +201,11 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): mesh_info_struct = MeshInfoStruct(typegen=typegen, typedef='MeshInfo_s') reqs['MeshInfoStruct'] = mesh_info_struct + + cache_load = CacheLoadFunction(typegen=tg,ftype=ftype,work_dim=1, + components=1, src_vectorize=False, dst_vectorize=False, + boundary=boundary, with_gid_ghost_offset=False) + reqs['cache_load'] = cache_load advection_rhs = DirectionalAdvectionRhsFunction(typegen=typegen, work_dim=work_dim, ftype=ftype, is_cached=is_cached, @@ -259,6 +264,7 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): boundary = s.boundary storage = s.storage nparticles = s.nparticles + min_ghosts = s.min_ghosts symbolic_mode = s.symbolic_mode @@ -296,21 +302,28 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): s.update_vars(grid_size=grid_size, inv_dx=inv_dx, compute_grid_ghosts=compute_grid_ghosts, compute_grid_size=compute_grid_size) - # compute_index = self.reqs['compute_id'] - runge_kutta = self.reqs['runge_kutta'] + cache_load = self.reqs['cache_load'] + runge_kutta = self.reqs['runge_kutta'] + + line_index = CodegenVariable(name='line_index', ctype=itype, typegen=tg) + line_offset = CodegenVariable(name='line_offset', ctype=itype, typegen=tg,const=True) + line_velocity = CodegenVariable(name='line_velocity', ctype=ftype, ptr=True, + storage='__global', restrict=True, const=True, add_impl_const=True, typegen=tg) - line_index = CodegenVariable(name='line_index', ctype=itype, typegen=tg) - X = CodegenVectorClBuiltin('X', ftype, nparticles, typegen=tg) - pid = CodegenVectorClBuiltin('pid', itype, nparticles, typegen=tg) - poffset = CodegenVectorClBuiltin('poffset', itype, nparticles, typegen=tg) + X = CodegenVectorClBuiltin('X', ftype, nparticles, typegen=tg) + pid = CodegenVectorClBuiltin('pid', itype, nparticles, typegen=tg, const=True) + poffset = CodegenVectorClBuiltin('poffset', itype, nparticles, typegen=tg) + npart = CodegenVariable(name='npart', ctype=itype, typegen=tg, init=nparticles) if is_cached: - cache_ghosts = CodegenVariable('cache_ghosts', itype, typegen=tg, value=min_ghosts) - cache_width = CodegenVariable('cache_width', itype, typegen=tg) + cache_ghosts = CodegenVariable('cache_ghosts', itype, typegen=tg, value=min_ghosts, + symbolic_mode=symbolic_mode) + cache_width = CodegenVariable('cache_width', itype, typegen=tg, + init='{}*{} + 2*{}'.format(npart(),local_size[0],cache_ghosts())) if local_size_known: - local_size = s.known_vars['local_size'] + L = s.known_vars['local_size'] Vc = CodegenArray(name='Vc',dim=1,ctype=ftype,typegen=tg, - shape=(local_size.value[0]+2*min_ghosts,), storage='__local') + shape=(nparticles*L[0]+2*min_ghosts,), storage='__local') else: Vc = s.vars['Vc'] @@ -326,7 +339,7 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): fval = 0 gsize = 1 N = '({Sx}+{npart}*{Lx}-1)/({npart}*{Lx})'.format(Sx=compute_grid_size[i], - npart=nparticles,Lx=local_size[0]) + npart=npart(),Lx=local_size[0]) else: fval = global_id.fval(i) gsize = global_size[i] @@ -348,6 +361,8 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): if work_dim>1: init = '({}+{})*{}'.format(init, global_id[1],grid_size[0]) line_index.declare(s, const=True, init=init) + line_velocity.declare(s, const=True, + init='{}+{}'.format(velocity(),line_index())) yield ctx except: @@ -372,16 +387,54 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): dx.declare(al,align=True) s.jumpline() + with s._align_() as al: + npart.declare(al, const=True, align=True) + if is_cached: + cache_ghosts.declare(al,const=True, align=True) + cache_width.declare(al,const=True, align=True) + s.jumpline() + + if is_cached and local_size_known: + Vc.declare(s); + s.jumpline() + poffset.declare(s,init=range(nparticles),const=True) global_id.declare(s,init=False) s.jumpline() with contextlib.nested(*nested_loops): - active.declare(s, init='({}*{}+{} < {})'.format(nparticles,global_id[0], nparticles-1, + active.declare(s, init='({}*{}+{}-1 < {})'.format(npart(),global_id[0], npart(), compute_grid_size[0])) s.jumpline() - pid.declare(s, init='{}*{}+{}+{}'.format(nparticles,global_id[0],poffset(),compute_grid_ghosts[0])) + if is_cached: + line_offset.declare(s,init='{}*{}*{}-{}'.format('k',local_size[0],npart(), + cache_ghosts())) + else: + line_offset.declare(s,init=0) + + pid.declare(s, init='{}*{}+{}+{}'.format(npart(),global_id[0],poffset(), + compute_grid_ghosts[0])) + s.jumpline() + + if is_cached: + cid = CodegenVariable(name='cid',ctype=itype,typegen=tg,const=True) + cid.declare(s,init='{} + {};'.format(line_offset(), local_id[0])) + kargs = { + 'src0': line_velocity, + 'dst0': Vc, + 'src_size': global_size[0], + 'local_size': grid_size[0], + 'local_id': local_id[0], + 'global_id': cid, + 'lghosts': cache_ghosts, + 'rghosts': cache_ghosts, + 'multiplier': npart, + } + call = cache_load(**kargs) + s.append(call+';') + s.jumpline() + X.declare(s, init='convert_{}({})*{}'.format(part_ftype,pid(),dx())); rk_args = { @@ -389,10 +442,12 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): 'dt': dt, 'active': active, 'inv_dx': inv_dx, - 'velocity': velocity + 'line_offset': line_offset } - if not is_cached: - rk_args['line_index'] = line_index + if is_cached: + rk_args['velocity'] = Vc + else: + rk_args['velocity'] = line_velocity if is_periodic: rk_args['line_width'] = grid_size[0] call = runge_kutta(**rk_args) @@ -400,6 +455,7 @@ class DirectionalAdvectionKernel(KernelCodeGenerator): s.append(code) s.append('{} += {};'.format(X(), xmin())) + s.jumpline() with s._if_(active()): if nparticles>1: @@ -432,10 +488,10 @@ if __name__ == '__main__': work_dim=work_dim, rk_scheme=ExplicitRungeKutta('RK2'), boundary=BoundaryCondition.PERIODIC, - is_cached=False, - min_ghosts=0, + is_cached=True, + min_ghosts=10, nparticles=4, - symbolic_mode=False, + symbolic_mode=True, known_vars=dict( mesh_info=mesh_info, local_size=local_size[:work_dim] diff --git a/hysop/codegen/kernels/tests/test_directional_advection.py b/hysop/codegen/kernels/tests/test_directional_advection.py new file mode 100644 index 0000000000000000000000000000000000000000..36daaf04eccb393b95db3773b0aa42397ff8cd5b --- /dev/null +++ b/hysop/codegen/kernels/tests/test_directional_advection.py @@ -0,0 +1,419 @@ + +import copy, math + +from hysop.gpu import cl +from hysop.constants import np, BoundaryCondition +from hysop.codegen.base.test import _test_mesh_info , _test_typegen +from hysop.methods_keys import ExplicitRungeKutta +from hysop.codegen.kernels.directional_advection import DirectionalAdvectionKernel + +class TestDirectionalAdvection(object): + + @classmethod + def setup_class(cls,enable_error_plots=False): + typegen = _test_typegen('double','dec') + dtype = np.float64 + + queue = cl.CommandQueue(typegen.context) + ctx = typegen.context + + grid_size = np.asarray([64,64,64]) + (A,grid_mesh_info) = _test_mesh_info(typegen,3,0,grid_size) + + dx = A['dx'][0][0] + inv_dx = A['inv_dx'][0][0] + dt = 0.1 + + umax = 10.0 + min_ghosts = int(math.ceil(dt*umax/float(dx))) + ghosts = min_ghosts + 0 + + print 'min ghosts are: {}'.format(min_ghosts) + print 'actual ghosts are: {}'.format(ghosts) + + compute_grid_ghosts = np.asarray([ghosts,0,0]) + compute_grid_size = grid_size + 2*compute_grid_ghosts + (B,compute_grid_mesh_info) = _test_mesh_info(typegen,3,compute_grid_ghosts,compute_grid_size) + + grid_shape = grid_size[::-1] + compute_grid_shape = compute_grid_size[::-1] + + assert A['dx'][0] == B['dx'][0] + + grid_bytes = grid_size.size * typegen.FLT_BYTES[typegen.fbtype] + compute_grid_bytes = compute_grid_size.size * typegen.FLT_BYTES[typegen.fbtype] + + mf = cl.mem_flags + + host_buffers_init = { + 'no_ghosts': { + 'u': umax * np.random.rand(*grid_shape).astype(dtype), + 'pos': -1 * np.ones(shape=grid_shape, dtype=dtype), + 'dbg0': -1 * np.ones(shape=grid_shape, dtype=np.int32), + 'dbg1': -1 * np.ones(shape=grid_shape, dtype=np.int32), + }, + 'with_ghosts': { + 'u': umax * np.random.rand(*compute_grid_shape).astype(dtype), + 'pos': -1 * np.ones(shape=grid_shape, dtype=dtype), + 'dbg0': -1 * np.ones(shape=compute_grid_shape, dtype=np.int32), + 'dbg1': -1 * np.ones(shape=compute_grid_shape, dtype=np.int32), + } + } + + device_buffers = { + 'no_ghosts': { + 'u': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['u']), + 'pos': cl.Buffer(ctx, flags=mf.WRITE_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['pos']), + 'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['dbg0']), + 'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['dbg1']), + }, + 'with_ghosts': { + 'u': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['u']), + 'pos': cl.Buffer(ctx, flags=mf.WRITE_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['pos']), + 'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['dbg0']), + 'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['dbg1']), + } + } + + host_buffers_reference = copy.deepcopy(host_buffers_init) + host_buffers_gpu = copy.deepcopy(host_buffers_init) + + cls.typegen = typegen + cls.queue = queue + + cls.grid_size = grid_size + cls.grid_shape = grid_shape + cls.grid_bytes = grid_bytes + + cls.grid_mesh_info = grid_mesh_info + cls.compute_grid_mesh_info = compute_grid_mesh_info + + cls.compute_grid_ghosts = compute_grid_ghosts + cls.compute_grid_size = compute_grid_size + cls.compute_grid_shape = compute_grid_shape + cls.compute_grid_bytes = compute_grid_bytes + + cls.host_buffers_init = host_buffers_init + cls.host_buffers_reference = host_buffers_reference + cls.host_buffers_gpu = host_buffers_gpu + cls.device_buffers = device_buffers + + Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size) + Lx = min(Lx, grid_size[0]) + + cls.local_work_size = np.asarray([Lx,1,1]) + cls.work_load = np.asarray([1,1,1]) + + cls.dx = dx + cls.inv_dx = inv_dx + cls.dt = dtype(dt) + + cls.enable_error_plots = enable_error_plots + + + @classmethod + def teardown_class(cls): + pass + + def setup_method(self, method): + pass + + def teardown_method(self, method): + pass + + def _do_compute_cpu(self, rk_scheme, boundary): + + dt = self.dt + dx = self.dx + ghosts = self.compute_grid_ghosts + grid_size = self.grid_size + grid_ghosts = self.compute_grid_ghosts + compute_grid_size = self.compute_grid_size + + is_periodic = False + if boundary == BoundaryCondition.PERIODIC: + is_periodic = True + target = 'no_ghosts' + grid_ghosts*=0 + view = [slice(0,grid_size[2]), + slice(0,grid_size[1]), + slice(0,grid_size[0])] + elif boundary == BoundaryCondition.NONE: + target = 'with_ghosts' + view = [slice(ghosts[2],grid_size[2]+ghosts[2]), + slice(ghosts[1],grid_size[1]+ghosts[1]), + slice(ghosts[0],grid_size[0]+ghosts[0])] + else: + raise ValueError() + + host_init_buffers = self.host_buffers_init[target] + host_buffers_reference = self.host_buffers_reference[target] + + velocity = host_init_buffers['u'] + position = host_init_buffers['pos'] + pos = np.zeros_like(position) + pos[:,:,:] = np.arange(grid_size[0]+grid_ghosts[0])[None,None,:] + pos*=dx + + def interp_velocity(X): + Gx = grid_size[0] + X = X.copy() / dx + lidx = np.floor(X).astype(np.int32) + alpha = X-lidx + if is_periodic: + lidx = (lidx+Gx)%Gx + ridx = (lidx+1)%Gx + else: + ridx = lidx+1 + Vl = np.empty_like(X) + Vr = np.empty_like(X) + for i in xrange(*view[0].indices(grid_size[2])): + for j in xrange(*view[1].indices(grid_size[1])): + Vl[i,j,:] = velocity[i,j,lidx[i,j,:]] + Vr[i,j,:] = velocity[i,j,ridx[i,j,:]] + return Vl + alpha*(Vr-Vl) + + if rk_scheme.name() == 'Euler': + pos += velocity[view]*dt + elif rk_scheme.name() == 'RK2': + X0 = pos.copy() + K0 = velocity[view] + X1 = X0 + 0.5*K0*dt; + K1 = interp_velocity(X1) + K = K1; + pos = X0+K*dt + elif rk_scheme.name() == 'RK4': + X0 = pos.copy() + K0 = velocity[view] + X1 = X0 + 0.5*K0*dt; + K1 = interp_velocity(X1) + X2 = X0 + 0.5*K1*dt; + K2 = interp_velocity(X2) + X3 = X0 + 1.0*K2*dt; + K3 = interp_velocity(X3) + K = 1./6*K0 + 2./6*K1 + 2./6*K2 + 1./6*K3 + pos = X0+K*dt + else: + msg = 'Unknown Runge-Kutta scheme {}.'.format(rk_scheme) + raise ValueError(msg) + + host_buffers_reference['pos'] = pos + + def _do_compute_gpu_and_check(self, rk_scheme, boundary, cached, nparticles, work_dim): + + msg = '\nTesting {}directional {}d advection with {} scheme and {} boundaries, {} particles at a time.'\ + .format('cached ' if cached else '', + work_dim, + rk_scheme.name(), + BoundaryCondition.svalue(boundary).lower(), + nparticles) + print msg + + dt = self.dt + work_size = self.grid_size + work_load = self.work_load + local_work_size = self.local_work_size + queue = self.queue + + grid_size = self.grid_size + grid_shape = self.grid_shape + ghosts = self.compute_grid_ghosts + + assert grid_size[0] % nparticles == 0 + + kernel_args = [dt] + if boundary == BoundaryCondition.PERIODIC: + target = 'no_ghosts' + mesh_info = self.grid_mesh_info + view = [slice(0,grid_size[2]), + slice(0,grid_size[1]), + slice(0,grid_size[0])] + elif boundary == BoundaryCondition.NONE: + target = 'with_ghosts' + mesh_info = self.compute_grid_mesh_info + view = [slice(ghosts[2],grid_size[2]+ghosts[2]), + slice(ghosts[1],grid_size[1]+ghosts[1]), + slice(ghosts[0],grid_size[0]+ghosts[0])] + else: + raise ValueError() + + for i in xrange(work_dim,3): + view[2-i] = 0 + + grid_size = grid_size[:work_dim] + grid_shape = grid_shape[:work_dim] + ghosts = ghosts[:work_dim] + + work_size = work_size[:work_dim] + work_load = work_load[:work_dim] + local_work_size = local_work_size[:work_dim] + + known_vars = { + 'local_size': local_work_size, + 'mesh_info': mesh_info + } + + host_init_buffers = self.host_buffers_init[target] + host_buffers_reference = self.host_buffers_reference[target] + host_buffers_gpu = self.host_buffers_gpu[target] + device_buffers = self.device_buffers[target] + + dak = DirectionalAdvectionKernel( + typegen=self.typegen, + ftype=self.typegen.fbtype, + work_dim=work_dim, + rk_scheme=rk_scheme, + is_cached=cached, + boundary=boundary, + nparticles = nparticles, + symbolic_mode=False, + debug_mode=True, + known_vars=known_vars) + + global_work_size = dak.get_global_size(work_size,local_work_size,work_load) + (static_shared_bytes, dynamic_shared_bytes, total_bytes) = \ + dak.required_workgroup_cache_size(local_work_size) + + variables = ['u','pos'] + debug = ['dbg0', 'dbg1'] + for varname in variables+debug: + kernel_args.append(device_buffers[varname]) + if (dynamic_shared_bytes != 0): + shared_buffer = cl.LocalMemory(dynamic_shared_bytes) + kernel_args.append(shared_buffer) + + print '\tGenerating and compiling Kernel...' + source = dak.__str__() + prg = cl.Program(self.typegen.context, source) + prg.build(devices=[self.typegen.device]) + kernel = prg.all_kernels()[0] + kernel.set_args(*kernel_args) + + print '\tCPU => GPU' + for buf in variables+debug: + src = host_init_buffers[buf] + dst = device_buffers[buf] + cl.enqueue_copy(queue,dst,src) + + print '\tKernel execution <<<{},{}>>>'.format(global_work_size,local_work_size) + evt = cl.enqueue_nd_range_kernel(queue, kernel, + global_work_size.tolist(), local_work_size.tolist()) + evt.wait() + + print '\tGPU => CPU' + for buf in variables+debug: + src = device_buffers[buf] + dst = host_buffers_gpu[buf] + cl.enqueue_copy(queue,dst,src) + + print '\tSynchronize queue' + queue.flush() + queue.finish() + + buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname]) + for varname in variables] + self._cmp_buffers(buffers,view,dak,work_dim) + + def _cmp_buffers(self,buffers,view,dak,work_dim): + good = True + err_buffers = [] + + for (name,host,dev) in buffers: + (l1,l2,linf) = self._distances(host,dev,view) + print '\t{} -> l1={} l2={} linf={}'.format(name,l1,l2,linf) + if l2>1e-8: + err_buffers.append(name) + good = False + if not good: + msg = '\n[FAIL] Buffer comparisson failed for buffers {}.\n'.format(err_buffers) + print msg + dak.edit() + if self.enable_error_plots: + from matplotlib import pyplot as plt + for (name,host,dev) in buffers: + if name in err_buffers: + + host = host[view] + dev = dev[view] + + d = (dev-host)*(dev-host) + d -= np.mean(d) + + if work_dim==3: + fig,axes = plt.subplots(2,2) + axes[0][0].imshow(np.sum(d,axis=0),interpolation='nearest') + axes[0][1].imshow(np.sum(d,axis=1),interpolation='nearest') + axes[1][0].imshow(np.sum(d,axis=2),interpolation='nearest') + axes[1][1].imshow(np.sum(d,axis=(0,1))[np.newaxis,:],interpolation='nearest') + plt.title(name) + fig.show() + raw_input('Press enter to close all windows.') + break + elif work_dim==2: + fig,axe = plt.subplots() + axe.imshow(d,interpolation='nearest') + plt.title(name) + fig.show() + raw_input('Press enter to close all windows.') + break + else: + pass + raise RuntimeError(msg) + + def _distances(self,lhs,rhs,view): + d = rhs[view]-lhs[view] + da = np.abs(d) + + l1 = np.sum(da)/d.size + l2 = np.sqrt(np.sum(d*d))/d.size + linf = np.max(da) + return (l1,l2,linf) + + + def _check_kernels(self, rk_scheme): + assert isinstance(rk_scheme, ExplicitRungeKutta) + + cached=[False] + boundaries=[BoundaryCondition.PERIODIC] + nparticles=[1,2,4,8,16] + work_dims=[1,2,3] + + for boundary in boundaries: + self._do_compute_cpu(boundary=boundary, rk_scheme=rk_scheme) + for work_dim in work_dims: + for cache in cached: + for nparticle in nparticles: + self._do_compute_gpu_and_check(boundary=boundary,rk_scheme=rk_scheme, + work_dim=work_dim, nparticles=nparticle, cached=cache) + + + def test_advection_Euler(self): + rk_scheme=ExplicitRungeKutta('Euler') + self._check_kernels(rk_scheme=rk_scheme) + + def test_advection_RK2(self): + rk_scheme=ExplicitRungeKutta('RK2') + self._check_kernels(rk_scheme=rk_scheme) + + def test_advection_RK4(self): + rk_scheme=ExplicitRungeKutta('RK4') + self._check_kernels(rk_scheme=rk_scheme) + +if __name__ == '__main__': + TestDirectionalAdvection.setup_class(enable_error_plots=True) + test = TestDirectionalAdvection() + + test.test_advection_Euler() + test.test_advection_RK2() + test.test_advection_RK4() + + TestDirectionalAdvection.teardown_class() + diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py index 3b043e0a15348d41a39115faeb3efc123c7f45fa..85f3c89059af9eecd73bf8d37c0ec22a084e7b5b 100644 --- a/hysop/codegen/kernels/tests/test_directional_stretching.py +++ b/hysop/codegen/kernels/tests/test_directional_stretching.py @@ -487,66 +487,66 @@ class TestDirectionalStretching(object): - def test_gradUW_Euler(self): + def test_stretching_gradUW_Euler(self): formulation=StretchingFormulation.GRAD_UW rk_scheme=ExplicitRungeKutta('Euler') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_gradUW_T_Euler(self): + def test_stretching_gradUW_T_Euler(self): formulation=StretchingFormulation.GRAD_UW_T rk_scheme=ExplicitRungeKutta('Euler') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_mixed_gradUW_Euler(self): + def test_stretching_mixed_gradUW_Euler(self): formulation=StretchingFormulation.MIXED_GRAD_UW rk_scheme=ExplicitRungeKutta('Euler') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_conservative_Euler(self): + def test_stretching_conservative_Euler(self): formulation=StretchingFormulation.CONSERVATIVE rk_scheme=ExplicitRungeKutta('Euler') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_gradUW_RK2(self): + def test_stretching_gradUW_RK2(self): formulation=StretchingFormulation.GRAD_UW rk_scheme=ExplicitRungeKutta('RK2') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_gradUW_T_RK2(self): + def test_stretching_gradUW_T_RK2(self): formulation=StretchingFormulation.GRAD_UW_T rk_scheme=ExplicitRungeKutta('RK2') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_mixed_gradUW_RK2(self): + def test_stretching_mixed_gradUW_RK2(self): formulation=StretchingFormulation.MIXED_GRAD_UW rk_scheme=ExplicitRungeKutta('RK2') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_conservative_RK2(self): + def test_stretching_conservative_RK2(self): formulation=StretchingFormulation.CONSERVATIVE rk_scheme=ExplicitRungeKutta('RK2') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_gradUW_RK4(self): + def test_stretching_gradUW_RK4(self): formulation=StretchingFormulation.GRAD_UW rk_scheme=ExplicitRungeKutta('RK4') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_gradUW_T_RK4(self): + def test_stretching_gradUW_T_RK4(self): formulation=StretchingFormulation.GRAD_UW_T rk_scheme=ExplicitRungeKutta('RK4') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_mixed_gradUW_RK4(self): + def test_stretching_mixed_gradUW_RK4(self): formulation=StretchingFormulation.MIXED_GRAD_UW rk_scheme=ExplicitRungeKutta('RK4') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) - def test_conservative_RK4(self): + def test_stretching_conservative_RK4(self): formulation=StretchingFormulation.CONSERVATIVE rk_scheme=ExplicitRungeKutta('RK4') self._check_kernels(formulation=formulation, rk_scheme=rk_scheme) @@ -556,20 +556,20 @@ if __name__ == '__main__': TestDirectionalStretching.setup_class(do_extra_tests=False, enable_error_plots=True) test = TestDirectionalStretching() - test.test_gradUW_Euler() - test.test_gradUW_T_Euler() - test.test_mixed_gradUW_Euler() - test.test_conservative_Euler() + test.test_stretching_gradUW_Euler() + test.test_stretching_gradUW_T_Euler() + test.test_stretching_mixed_gradUW_Euler() + test.test_stretching_conservative_Euler() - test.test_gradUW_RK2() - test.test_gradUW_T_RK2() - test.test_mixed_gradUW_RK2() - test.test_conservative_RK2() + test.test_stretching_gradUW_RK2() + test.test_stretching_gradUW_T_RK2() + test.test_stretching_mixed_gradUW_RK2() + test.test_stretching_conservative_RK2() - test.test_gradUW_RK4() - test.test_gradUW_T_RK4() - test.test_mixed_gradUW_RK4() - test.test_conservative_RK4() + test.test_stretching_gradUW_RK4() + test.test_stretching_gradUW_T_RK4() + test.test_stretching_mixed_gradUW_RK4() + test.test_stretching_conservative_RK4() TestDirectionalStretching.teardown_class()