From 4664a75feb55bd734fe942b300ad294d5b6bdad4 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Keck <Jean-Baptiste.Keck@imag.fr> Date: Mon, 12 Dec 2016 12:20:53 +0100 Subject: [PATCH] stretching requirements --- hysop/codegen/base/kernel_codegen.py | 34 +++++++-- .../codegen/kernels/directional_stretching.py | 70 +++++++++++++++-- .../tests/test_directional_stretching.py | 75 +++++++++++++++---- 3 files changed, 147 insertions(+), 32 deletions(-) diff --git a/hysop/codegen/base/kernel_codegen.py b/hysop/codegen/base/kernel_codegen.py index f9d2791ae..64f40d794 100644 --- a/hysop/codegen/base/kernel_codegen.py +++ b/hysop/codegen/base/kernel_codegen.py @@ -61,8 +61,20 @@ class KernelCodeGenerator(KernelBase, OpenClCodeGenerator): self.gen_kernel_attributes() - def required_workgroup_cache_size(self): - return (0,0) # static & dynamic cache + #return global_work_size from effective work_size and given local_work_size + # /!\ it should be guaranteed that global_work_size is a multiple of local_work_size + def get_global_work_size(self, work_size, local_work_size): + work_size = np.asarray(work_size) + local_work_size = np.asarray(local_work_size) + return ((work_size+local_work_size-1)/local_work_size) * local_work_size + + def min_ghosts(self): + ghosts = (0,)*self.work_dim + return np.asarray(ghosts) + + #return a tuple of required (static,dynamic) cache bytes per workgroup + def required_workgroup_cache_size(self, local_work_size): + return (0,0) def gen_kernel_variables(self): tg = self.typegen @@ -73,12 +85,18 @@ class KernelCodeGenerator(KernelBase, OpenClCodeGenerator): kvars['work_dim'] = CodegenVariable('work_dim','uint', tg, symbolic_mode=sm) kvars['global_index'] = CodegenVariable('GID', 'int', tg) kvars['local_index'] = CodegenVariable('LID', 'int', tg) - kvars['global_size'] = CodegenVectorClBuiltinFunc('global_size', 'G', 'int',work_dim,tg,symbolic_mode=sm) - kvars['local_size'] = CodegenVectorClBuiltinFunc('local_size', 'L', 
'int',work_dim,tg,symbolic_mode=sm) - kvars['global_id'] = CodegenVectorClBuiltinFunc('global_id', 'gid', 'int',work_dim,tg) - kvars['local_id'] = CodegenVectorClBuiltinFunc('local_id', 'lid', 'int',work_dim,tg) - kvars['num_groups'] = CodegenVectorClBuiltinFunc('num_groups', 'ngroups', 'int',work_dim,tg,symbolic_mode=sm) - kvars['group_id'] = CodegenVectorClBuiltinFunc('group_id', 'group_id','int',work_dim,tg) + kvars['global_size'] = CodegenVectorClBuiltinFunc('global_size', 'G', + 'int',work_dim,tg,symbolic_mode=sm) + kvars['local_size'] = CodegenVectorClBuiltinFunc('local_size', 'L', + 'int',work_dim,tg,symbolic_mode=sm) + kvars['global_id'] = CodegenVectorClBuiltinFunc('global_id', 'gid', + 'int',work_dim,tg) + kvars['local_id'] = CodegenVectorClBuiltinFunc('local_id', 'lid', + 'int',work_dim,tg) + kvars['num_groups'] = CodegenVectorClBuiltinFunc('num_groups', 'ngroups', + 'int',work_dim,tg,symbolic_mode=sm) + kvars['group_id'] = CodegenVectorClBuiltinFunc('group_id', 'group_id', + 'int',work_dim,tg) self.update_vars(kvars) diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py index 56e0aa695..b3ddac11f 100644 --- a/hysop/codegen/kernels/directional_stretching.py +++ b/hysop/codegen/kernels/directional_stretching.py @@ -113,12 +113,59 @@ class DirectionalStretchingKernel(KernelCodeGenerator): self.gencode() def min_ghosts(self): - stencil_ghost = self.order/2 - if self.is_conservative: - ghosts = self.rk_scheme.stages * stencil_ghost - else: - ghosts = stencil_ghost - return ghosts + direction = self.direction + ghosts = [0]*self.dim + if self.boundary == BoundaryCondition.PERIODIC: + pass + elif self.boundary == BoundaryCondition.NONE: + stencil_ghost = self.order/2 + if self.is_conservative: + ghosts[direction] = self.rk_scheme.stages * stencil_ghost + else: + ghosts[direction] = stencil_ghost + return np.asarray(ghosts) + + #return global_work_size from effective work_size and given local_work_size + 
# /!\ it should be guaranteed that global_work_size is a multiple of local_work_size + def get_global_work_size(self, work_size, local_work_size): + work_size = np.asarray(work_size) + local_work_size = np.asarray(local_work_size) + + cache_ghosts = self.cache_ghosts() + local_work = local_work_size - 2*cache_ghosts + + return ((work_size+local_work-1)/local_work) * local_work_size + + #return a tuple of required (static,dynamic) cache bytes per workgroup + def required_workgroup_cache_size(self, local_work_size): + dim = self.work_dim + ftype = self.ftype + cached = self.cached + direction = self.direction + cache_ghosts = self.cache_ghosts() + is_conservative = self.is_conservative + flt_bytes = self.typegen.FLT_BYTES[ftype] + + sc,dc = 0,0 + if cached: + count = dim*local_work_size[0] + if is_conservative: + count += local_work_size[0] + + if 'local_size' in self.known_vars: + assert (self.known_vars['local_size'] == local_work_size) + sc += count + else: + dc += count + + sc += 2*dim*(2*cache_ghosts) + if self.boundary == BoundaryCondition.PERIODIC: + sc += 2*dim*(1*cache_ghosts) + + sc *= flt_bytes + dc *= flt_bytes + + return (sc,dc) def build_requirements(self,typegen,work_dim,ftype,order,cached,rk_scheme,direction, boundary,force_symbolic,formulation,storage): @@ -189,6 +236,13 @@ class DirectionalStretchingKernel(KernelCodeGenerator): self.xyz = xyz return kargs + + def cache_ghosts(self): + stencil_ghost = self.order/2 + if self.is_conservative: + return self.rk_scheme.stages * stencil_ghost + else: + return stencil_ghost def gencode(self): s = self @@ -242,7 +296,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator): U = CodegenVectorClBuiltin('U',ftype,dim,tg) cache_ghosts = CodegenVariable('cache_ghosts','int',tg, - const=True,value=self.min_ghosts()) + const=True,value=self.cache_ghosts()) local_work = CodegenVariable('lwork','int',tg,const=True) cached_vars = ArgDict() @@ -460,7 +514,7 @@ if __name__ == '__main__': order=4, dim=dim, direction=0, 
formulation=StretchingFormulation.GRAD_UW, rk_scheme=ExplicitRungeKutta('RK2'), - cached=False, + cached=True, symbolic_mode=True, boundary=BoundaryCondition.NONE, known_vars=dict( diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py index fe3bc6a10..9eaf7062f 100644 --- a/hysop/codegen/kernels/tests/test_directional_stretching.py +++ b/hysop/codegen/kernels/tests/test_directional_stretching.py @@ -45,20 +45,32 @@ class TestDirectionalStretching(object): device_buffers = { 'no_ghosts': { - 'ux': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['ux']), - 'uy': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['uy']), - 'uz': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['uz']), - 'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['wx']), - 'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['wy']), - 'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['wz']) + 'ux': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['ux']), + 'uy': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['uy']), + 'uz': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['uz']), + 'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['wx']), + 'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['wy']), + 'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['no_ghosts']['wz']) }, 'with_ghosts': { - 'ux': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, 
hostbuf=host_buffers_init['with_ghosts']['ux']), - 'uy': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['uy']), - 'uz': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['uz']), - 'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['wx']), - 'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['wy']), - 'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['wz']) + 'ux': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['ux']), + 'uy': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['uy']), + 'uz': cl.Buffer(ctx, flags=mf.READ_ONLY | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['uz']), + 'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['wx']), + 'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['wy']), + 'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, + hostbuf=host_buffers_init['with_ghosts']['wz']) } } @@ -81,6 +93,7 @@ class TestDirectionalStretching(object): cls.device_buffers = device_buffers cls.dt = 0.5 + cls.local_work_size = np.asarray([16,1,1]) @classmethod def teardown_class(cls): @@ -112,14 +125,30 @@ class TestDirectionalStretching(object): pass def _cmp_buffers(self): - self.to_cpu() + pass def _do_compute_cpu(self,order,direction,boundary): pass def _do_compute_gpu(self, formulation, rk_scheme, order, direction, boundary, cached): - known_vars = {} + dt = self.dt + local_work_size = self.local_work_size + + known_vars = { + 'local_size': local_work_size, + 'dt': dt + } + + kernel_args = [] + if boundary == BoundaryCondition.PERIODIC: + work_size = self.grid_size + gpu_buffers = 
self.gpu_buffers['no_ghosts'] + elif boundary == BoundaryCondition.NONE: + work_size = self.compute_grid_size + gpu_buffers = self.gpu_buffers['with_ghosts'] + else: + raise ValueError() dsk = DirectionalStretchingKernel( typegen=self.typegen, @@ -134,12 +163,26 @@ class TestDirectionalStretching(object): boundary=boundary, known_vars=known_vars) + global_work_size = dsk.get_global_work_size(work_size,local_work_size) + (static_shared_bytes, dynamic_shared_bytes) = \ + dsk.required_workgroup_cache_size(local_work_size) + + for varname in ['vx','vy','vz','wx','wy','wz']: + kernel_args.append(device_buffers[varname]) + if (dynamic_shared_bytes != 0): + shared_buffer = cl.LocalMemory(dynamic_shared_bytes) + kernel_args.append(shared_buffer) + src = dsk.__str__() - prg = cl.Program(self.typegen.ctx, src) + prg = cl.Program(self.typegen.context, src) kernel = prg.all_kernels()[0] - + kernel.set_args(*kernel_args) self.to_gpu() + evt = cl.enqueue_nd_range_kernel(self.queue, kernel, + list(global_work_size), list(local_work_size)) + evt.wait() + self.to_cpu() def check_kernels(self, formulation, rk_scheme): -- GitLab