From 3dc9fd9d2d92c7211630b8b5b5b7a1c348d76711 Mon Sep 17 00:00:00 2001
From: Keck Jean-Baptiste <jbkeck@hotmail.com>
Date: Wed, 6 Jul 2016 11:33:29 +0200
Subject: [PATCH] began kernel autotuner

---
 hysop/gpu/gpu_stretching.py |  35 +++++------
 hysop/gpu/tools.py          | 121 ++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 19 deletions(-)

diff --git a/hysop/gpu/gpu_stretching.py b/hysop/gpu/gpu_stretching.py
index e6aa7cc41..c2b1bcbf3 100644
--- a/hysop/gpu/gpu_stretching.py
+++ b/hysop/gpu/gpu_stretching.py
@@ -58,7 +58,6 @@ class GPUStretching(DiscreteOperator, GPUOperator):
         self.order = 2 if self.method[SpaceDiscretisation] is FDC2 else 4
         
         # Worksize handling
-        #TODO
         self._cl_work_size = 0
 
         ## GPU allocations
@@ -112,21 +111,22 @@ class GPUStretching(DiscreteOperator, GPUOperator):
             raise NotImplementedError(msg)
 
     def _gen_cl_src(self):
-        topo   = self.velocity.topology
-        mesh   = topo.mesh
-        dim    = self.dim
+        typegen = self.cl_env.typegen
+        topo    = self.velocity.topology
+        dim     = self.dim
+        mesh    = topo.mesh
 
-        gwi = (256,256,256)
-        lwi = (4,4,4)
+        # Global work size: assumed here to span the local mesh resolution.
+        gwi = tuple(mesh.resolution)
+        lwi = (8,8,8)
 
         codegen, prg = self._gen_and_build_kernel(lwi, dump_src=True)
-        
-        cache_bytes    = codegen.cache_alloc_bytes(local_size=lwi)
+        cache_bytes = codegen.cache_alloc_bytes(local_size=lwi)
+
         self.local_mem = cl.LocalMemory(cache_bytes)
         self.size_local_alloc += cache_bytes
 
         from hysop.codegen.structs.mesh_info import MeshInfoStruct
-        mesh_info = MeshInfoStruct.build_instance_from_mesh(self.cl_env.typegen, mesh)
+        mesh_info = MeshInfoStruct.build_instance_from_mesh(typegen, mesh)
         mesh_info_buffer = cl.Buffer(self.cl_env.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                 hostbuf=mesh_info)
         self.mesh_info_buffer = mesh_info_buffer
@@ -165,20 +165,17 @@ class GPUStretching(DiscreteOperator, GPUOperator):
 
         return codegen, prg
 
-    def _compute_stretching(self, simulation, to_gpu=True, to_host=True):
-        if to_gpu:
-            for field in self.input:
-                field.toDevice()
-        
-        input_events  = [evt for input in self.input for evt in input.events]
-        
+    def _compute_stretching(self, simulation):
         dt = self.cl_env.typegen.make_floatn(simulation.time_step,1)
         kernel_args    = [dt] + self.velocity.gpu_data + self.vorticity.gpu_data \
-                            + [self.mesh_info_buffer] + [self.local_mem]
+                            + [self.mesh_info_buffer, self.local_mem]
+
+        input_events  = [evt for input in self.input for evt in input.events]
         stretching_evt = self.kernels['stretching'](*kernel_args, wait_for=input_events)
+        output_events = [stretching_evt]
         
-        if to_host:
-            self.vorticity.toHost()
+        self.vorticity.events += output_events
+
 
     def apply(self, simulation):
         self._compute(simulation)
diff --git a/hysop/gpu/tools.py b/hysop/gpu/tools.py
index 7fc2d7e7d..b37ed79ac 100644
--- a/hysop/gpu/tools.py
+++ b/hysop/gpu/tools.py
@@ -13,6 +13,127 @@ FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64
 __cl_env = None
 
 
+import itertools
+
+
+class KernelError(Exception):
+    def __init__(self, msg, err):
+        super(KernelError,self).__init__(msg)
+        self.msg = msg
+        self.err = err
+
+    def __str__(self):
+        return '{}: {}'.format(self.err, self.msg)
+
+class OpenClKernelStatistics(object):
+    def __init__(self, events=None):
+        if events is not None:
+            p0 = events[0].profile
+            t0 = p0.end - p0.start
+            total = 0
+            maxi = t0 
+            mini = t0
+            for evt in events:
+                dt = evt.profile.end - evt.profile.start
+                total += dt
+                if dt<mini: 
+                    mini = dt
+                if dt>maxi:
+                    maxi = dt
+            
+            self.tot = total
+            self.min = mini
+            self.max = maxi
+            self.mean = total/len(events)
+        else:
+            self.tot  = 0
+            self.min  = 0
+            self.max  = 0
+            self.mean = 0
+
+    def __str__(self):
+        mini  = self.min   * 1e-6
+        maxi  = self.max   * 1e-6
+        total = self.tot   * 1e-6
+        mean  = self.mean  * 1e-6
+        return 'min={:.2f}ms, max={:.2f}ms, mean={:.2f}ms, total={:.2f}ms'.format(mini,maxi,mean,total)
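+
+# Usage sketch (illustration only, 'some_kernel' and its arguments are placeholders):
+# statistics are built from events enqueued on a profiling-enabled queue.
+#   queue = cl.CommandQueue(ctx, device,
+#           properties=cl.command_queue_properties.PROFILING_ENABLE)
+#   evts  = [some_kernel(queue, gwi, lwi, *args) for _ in xrange(10)]
+#   queue.finish()
+#   print OpenClKernelStatistics(evts)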
+
+class KernelAutotuner(object):
+    """OpenCl kernel work group size autotuner.
+    """
+    def __init__(self, work_dim, runs=10):
+        """Initialize a KernelAutotuner.
+
+        Parameters
+        ----------
+        work_dim: int
+            Work dimension used in targeted OpenCL kernels.
+        runs: int
+            Number of profiled kernel launches per work group size candidate.
+        """
+        self.work_dim = work_dim
+        self.nruns    = runs
+        self._load_default_filters()
+
+    def add_filter(self, fname, f):
+        self.filters[fname] = f
+        return self
+
+    def bench(self,ctx,device,global_size,args,kernel=None,kernel_generator=None,**kargs):
+        assert isinstance(args, list)
+        if not ((kernel is None) ^ (kernel_generator is None)):
+            raise ValueError('Exactly one of kernel and kernel_generator has to be given!')
+        if (kernel_generator is None):
+            kernel_generator = lambda *fargs, **fkargs: (kernel, args)
+
+        for local_size in self._get_wi_candidates(ctx,device,global_size,**kargs):
+            kernel, args = kernel_generator(ctx,device,global_size,**kargs)
+            stats = self._bench_one(ctx,device,global_size,local_size,kernel,args)
+            print '{}\t{}'.format(local_size,stats)
+    
+    def _bench_one(self,ctx,device,global_size,local_size,kernel,kargs):
+        evts = []
+        queue = cl.CommandQueue(ctx, device,
+                properties=cl.command_queue_properties.PROFILING_ENABLE)
+        for i in xrange(self.nruns):
+            evt = kernel(queue, global_size, local_size, *kargs)
+            evts.append(evt)
+        queue.finish()
+        stats = OpenClKernelStatistics(evts)
+        return stats
+
+    def _get_wi_candidates(self,ctx,device,global_size,**kargs):
+        # candidate sizes per dimension: powers of two down from the device maximum
+        pows = []
+        size = device.max_work_group_size
+        while(size>0):
+            pows.append(size)
+            size >>= 1
+
+        candidates = itertools.product(pows,pows,pows)
+        for f in self.filters.values():
+            F = f(ctx=ctx,device=device,global_size=global_size,**kargs)
+            candidates = itertools.ifilter(F, candidates)
+        return candidates
+
+    def _load_default_filters(self):
+        self.filters = {}
+        self.add_filter('dim_reqs',self._dim_filter)
+        self.add_filter('ordering',self._ordering_filter)
+        self.add_filter('minmax_wi',self._minmax_workitems_filter)
+
+
+    # filters
+    def _dim_filter(self, device, **kargs):
+        work_dim   = self.work_dim
+        max_wi_dim = device.max_work_item_dimensions
+        # keep candidates that only use the first work_dim dimensions
+        return lambda local_size: (work_dim<=max_wi_dim) \
+                and all(ls==1 for ls in local_size[work_dim:])
+
+    def _ordering_filter(self, **kargs):
+        return lambda local_size: (local_size[2]<=local_size[1]) and (local_size[1]<=local_size[0])
+
+    def _global_size_filter(self, global_size, **kargs):
+        return lambda local_size: (local_size[0]<=global_size[0]) \
+                and (local_size[1]<=global_size[1]) \
+                and (local_size[2]<=global_size[2])
+
+    def _minmax_workitems_filter(self, device, **kargs):
+        def wi_filter(local_size):
+            max_wg_size = device.max_work_group_size
+            wi = 1
+            for i in xrange(3):
+                wi *= local_size[i]
+            # keep workgroups using between 1/8 and all of the device capacity
+            return (wi>=max_wg_size/8) and (wi<=max_wg_size)
+        return wi_filter
+
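+# Usage sketch (illustration only, 'stretching_kernel' and 'kernel_args' are
+# placeholders for a prebuilt pyopencl kernel and its argument list):
+#   tuner = KernelAutotuner(work_dim=3, runs=10)
+#   tuner.bench(ctx, device, global_size=(256,256,256),
+#               args=kernel_args, kernel=stretching_kernel)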
+
 class OpenCLEnvironment(object):
     """OpenCL environment informations and useful functions.
     """
-- 
GitLab