From 4664a75feb55bd734fe942b300ad294d5b6bdad4 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Keck <Jean-Baptiste.Keck@imag.fr>
Date: Mon, 12 Dec 2016 12:20:53 +0100
Subject: [PATCH] stretching requirements

---
 hysop/codegen/base/kernel_codegen.py          | 34 +++++++--
 .../codegen/kernels/directional_stretching.py | 70 +++++++++++++++--
 .../tests/test_directional_stretching.py      | 75 +++++++++++++++----
 3 files changed, 147 insertions(+), 32 deletions(-)

diff --git a/hysop/codegen/base/kernel_codegen.py b/hysop/codegen/base/kernel_codegen.py
index f9d2791ae..64f40d794 100644
--- a/hysop/codegen/base/kernel_codegen.py
+++ b/hysop/codegen/base/kernel_codegen.py
@@ -61,8 +61,20 @@ class KernelCodeGenerator(KernelBase, OpenClCodeGenerator):
         self.gen_kernel_attributes()
     
     
-    def required_workgroup_cache_size(self):
-        return (0,0) # static & dynamic cache
+    #return global_work_size from effective work_size and given local_work_size
+    # /!\ it should be guaranteed that global_work_size is a multiple of local_work_size
+    def get_global_work_size(self, work_size, local_work_size):
+        work_size       = np.asarray(work_size)
+        local_work_size = np.asarray(local_work_size)
+        return ((work_size+local_work_size-1)/local_work_size) * local_work_size
+    
+    def min_ghosts(self):
+        ghosts = (0,)*self.work_dim
+        return np.asarray(ghosts)
+
+    #return a tuple of required (static,dynamic) cache bytes per workgroup
+    def required_workgroup_cache_size(self, local_work_size):
+        return (0,0)
     
     def gen_kernel_variables(self):
         tg = self.typegen
@@ -73,12 +85,18 @@ class KernelCodeGenerator(KernelBase, OpenClCodeGenerator):
         kvars['work_dim']  = CodegenVariable('work_dim','uint', tg, symbolic_mode=sm)
         kvars['global_index'] = CodegenVariable('GID', 'int', tg)
         kvars['local_index']  = CodegenVariable('LID', 'int', tg)
-        kvars['global_size']  = CodegenVectorClBuiltinFunc('global_size', 'G',       'int',work_dim,tg,symbolic_mode=sm)
-        kvars['local_size']   = CodegenVectorClBuiltinFunc('local_size',  'L',       'int',work_dim,tg,symbolic_mode=sm)
-        kvars['global_id']    = CodegenVectorClBuiltinFunc('global_id',   'gid',     'int',work_dim,tg)
-        kvars['local_id']     = CodegenVectorClBuiltinFunc('local_id',    'lid',     'int',work_dim,tg)
-        kvars['num_groups']   = CodegenVectorClBuiltinFunc('num_groups',  'ngroups', 'int',work_dim,tg,symbolic_mode=sm) 
-        kvars['group_id']     = CodegenVectorClBuiltinFunc('group_id',    'group_id','int',work_dim,tg)         
+        kvars['global_size']  = CodegenVectorClBuiltinFunc('global_size', 'G',       
+								'int',work_dim,tg,symbolic_mode=sm)
+        kvars['local_size']   = CodegenVectorClBuiltinFunc('local_size',  'L',       
+								'int',work_dim,tg,symbolic_mode=sm)
+        kvars['global_id']    = CodegenVectorClBuiltinFunc('global_id',   'gid',     
+								'int',work_dim,tg)
+        kvars['local_id']     = CodegenVectorClBuiltinFunc('local_id',    'lid',     
+								'int',work_dim,tg)
+        kvars['num_groups']   = CodegenVectorClBuiltinFunc('num_groups',  'ngroups', 
+								'int',work_dim,tg,symbolic_mode=sm) 
+        kvars['group_id']     = CodegenVectorClBuiltinFunc('group_id',    'group_id',
+								'int',work_dim,tg)         
 
         self.update_vars(kvars)
     
diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py
index 56e0aa695..b3ddac11f 100644
--- a/hysop/codegen/kernels/directional_stretching.py
+++ b/hysop/codegen/kernels/directional_stretching.py
@@ -113,12 +113,59 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
         self.gencode()
 
     def min_ghosts(self):
-        stencil_ghost = self.order/2
-        if self.is_conservative:
-            ghosts = self.rk_scheme.stages * stencil_ghost
-        else:
-            ghosts = stencil_ghost
-        return ghosts
+        direction = self.direction
+        ghosts = [0]*self.dim
+        if self.boundary == BoundaryCondition.PERIODIC:
+            pass
+        elif self.boundary == BoundaryCondition.NONE:
+            stencil_ghost = self.order/2
+            if self.is_conservative:
+                ghosts[direction] = self.rk_scheme.stages * stencil_ghost
+            else:
+                ghosts[direction] = stencil_ghost
+        return np.asarray(ghosts)
+
+    #return global_work_size from effective work_size and given local_work_size
+    # /!\ it should be guaranteed that global_work_size is a multiple of local_work_size
+    def get_global_work_size(self, work_size, local_work_size):
+        work_size       = np.asarray(work_size)
+        local_work_size = np.asarray(local_work_size)
+        
+        cache_ghosts = self.cache_ghosts()
+        local_work = local_work_size - 2*cache_ghosts
+
+        return ((work_size+local_work-1)/local_work) * local_work_size
+    
+    #return a tuple of required (static,dynamic) cache bytes per workgroup
+    def required_workgroup_cache_size(self, local_work_size):
+        dim             = self.work_dim
+        ftype           = self.ftype
+        cached          = self.cached
+        direction       = self.direction
+        cache_ghosts    = self.cache_ghosts()
+        is_conservative = self.is_conservative
+        flt_bytes       = self.typegen.FLT_BYTES[ftype]
+       
+        sc,dc = 0,0
+        if cached: 
+            count = dim*local_work_size[0]
+            if is_conservative:
+                count += local_work_size[0]
+
+            if 'local_size' in self.known_vars:
+                assert (self.known_vars['local_size'] == local_work_size)
+                sc += count
+            else:
+                dc += count
+
+        sc += 2*dim*(2*cache_ghosts)
+        if self.boundary == BoundaryCondition.PERIODIC:
+            sc += 2*dim*(1*cache_ghosts)
+        
+        sc *= flt_bytes
+        dc *= flt_bytes
+        
+        return (sc,dc)
 
     def build_requirements(self,typegen,work_dim,ftype,order,cached,rk_scheme,direction,
             boundary,force_symbolic,formulation,storage):
@@ -189,6 +236,13 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
         self.xyz = xyz
         
         return kargs
+    
+    def cache_ghosts(self):
+        stencil_ghost = self.order/2
+        if self.is_conservative:
+            return self.rk_scheme.stages * stencil_ghost
+        else:
+            return stencil_ghost
 
     def gencode(self):
         s = self
@@ -242,7 +296,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
         U          = CodegenVectorClBuiltin('U',ftype,dim,tg)
 
         cache_ghosts = CodegenVariable('cache_ghosts','int',tg,
-                const=True,value=self.min_ghosts())
+                const=True,value=self.cache_ghosts())
         local_work = CodegenVariable('lwork','int',tg,const=True)
         
         cached_vars = ArgDict()
@@ -460,7 +514,7 @@ if __name__ == '__main__':
         order=4, dim=dim, direction=0, 
         formulation=StretchingFormulation.GRAD_UW,
         rk_scheme=ExplicitRungeKutta('RK2'),
-        cached=False,
+        cached=True,
         symbolic_mode=True,
         boundary=BoundaryCondition.NONE,
         known_vars=dict(
diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py
index fe3bc6a10..9eaf7062f 100644
--- a/hysop/codegen/kernels/tests/test_directional_stretching.py
+++ b/hysop/codegen/kernels/tests/test_directional_stretching.py
@@ -45,20 +45,32 @@ class TestDirectionalStretching(object):
 
         device_buffers = {
                 'no_ghosts': {
-                    'ux': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['ux']),
-                    'uy': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['uy']),
-                    'uz': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['uz']),
-                    'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['wx']),
-                    'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['wy']),
-                    'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['no_ghosts']['wz'])
+                    'ux': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['ux']),
+                    'uy': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['uy']),
+                    'uz': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['uz']),
+                    'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['wx']),
+                    'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['wy']),
+                    'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['wz'])
                 },
                 'with_ghosts': {
-                    'ux': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['ux']),
-                    'uy': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['uy']),
-                    'uz': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['uz']),
-                    'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['wx']),
-                    'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['wy']),
-                    'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_buffers_init['with_ghosts']['wz'])
+                    'ux': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['ux']),
+                    'uy': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['uy']),
+                    'uz': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['uz']),
+                    'wx': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['wx']),
+                    'wy': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['wy']),
+                    'wz': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['wz'])
                 }
         }
         
@@ -81,6 +93,7 @@ class TestDirectionalStretching(object):
         cls.device_buffers         = device_buffers
 
         cls.dt = 0.5
+        cls.local_work_size = np.asarray([16,1,1])
 
     @classmethod
     def teardown_class(cls):
@@ -112,14 +125,30 @@ class TestDirectionalStretching(object):
         pass
 
     def _cmp_buffers(self):
-        self.to_cpu()
+        pass
 
     def _do_compute_cpu(self,order,direction,boundary):
         pass
     
     def _do_compute_gpu(self, formulation, rk_scheme, order, direction, boundary, cached):
 
-        known_vars = {}
+        dt = self.dt
+        local_work_size = self.local_work_size
+
+        known_vars = {
+                'local_size': local_work_size,
+                'dt': dt
+        }
+        
+        kernel_args = []
+        if boundary   == BoundaryCondition.PERIODIC:
+            work_size   = self.grid_size
+            gpu_buffers = self.gpu_buffers['no_ghosts']
+        elif boundary == BoundaryCondition.NONE:
+            work_size = self.compute_grid_size
+            gpu_buffers = self.gpu_buffers['with_ghosts']
+        else:
+            raise ValueError()
 
         dsk = DirectionalStretchingKernel(
             typegen=self.typegen, 
@@ -134,12 +163,26 @@ class TestDirectionalStretching(object):
             boundary=boundary,
             known_vars=known_vars)
         
+        global_work_size = dsk.get_global_work_size(work_size,local_work_size)
+        (static_shared_bytes, dynamic_shared_bytes) = \
+                dsk.required_workgroup_cache_size(local_work_size)
+        
+        for varname in ['vx','vy','vz','wx','wy','wz']:
+            kernel_args.append(device_buffers[varname])
+        if (dynamic_shared_bytes != 0):
+            shared_buffer = cl.LocalMemory(dynamic_shared_bytes)
+            kernel_args.append(shared_buffer)
+
         src    = dsk.__str__()
-        prg    = cl.Program(self.typegen.ctx, src)
+        prg    = cl.Program(self.typegen.context, src)
         kernel = prg.all_kernels()[0]
-
+        kernel.set_args(*kernel_args)
         
         self.to_gpu()
+        evt = cl.enqueue_nd_range_kernel(self.queue, kernel, 
+                list(global_work_size), list(local_work_size))
+        evt.wait()
+        self.to_cpu()
     
     
     def check_kernels(self, formulation, rk_scheme):
-- 
GitLab