From a393d2c115bb1e38f78ba505d1b29759f72104fd Mon Sep 17 00:00:00 2001
From: Keck Jean-Baptiste <jean-baptiste.keck@imag.fr>
Date: Thu, 8 Dec 2016 00:26:29 +0100
Subject: [PATCH] bandwidth

---
 examples/advection_gpu.py                     |  19 ++--
 hysop/codegen/base/kernel_codegen.py          |  13 +--
 hysop/codegen/kernels/bandwidth.py            | 102 ++++++++++++++++++
 .../codegen/kernels/directional_stretching.py |   2 +-
 hysop/constants.py                            |   2 +-
 hysop/gpu/kernel_autotuner.py                 |   4 +-
 .../gpu/static_gpu_particle_advection_dir.py  |   4 +-
 hysop/tools/callback.py                       |  29 ++++-
 8 files changed, 154 insertions(+), 21 deletions(-)
 create mode 100644 hysop/codegen/kernels/bandwidth.py

diff --git a/examples/advection_gpu.py b/examples/advection_gpu.py
index 6c58095b7..9454306e3 100644
--- a/examples/advection_gpu.py
+++ b/examples/advection_gpu.py
@@ -25,13 +25,20 @@ from hysop.numerics.remeshing import L8_4
 
 if __name__=='__main__':
 
-    DIM       = 3
     GHOSTS    = 0 
     NSCALARS  = 0
 
-    f_resolution = (513,513,513)[:DIM]
+    #DIM       = 3
+    #f_resolution = (129,129,129)[:DIM]
+    #f_resolution = (257,257,257)[:DIM]
+    #f_resolution = (513,513,257)[:DIM]
+
+    #DIM       = 2
+    #f_resolution = (8193,8193,8193)[:DIM]
+    #f_resolution = (2049,2049,2049)[:DIM]
+    #f_resolution = (4097,4097,4097)[:DIM]
+
     v_resolution = f_resolution
-    #v_resolution = (33,33,33)[:DIM]
     ghosts       = (GHOSTS,)*DIM
 
     def initVel(res, *args):
@@ -140,9 +147,9 @@ if __name__=='__main__':
                 'autotuner_config':autotuner_config,
                 'use_builtin_copy':True,
                 'stretching': {
-                    'rk_scheme': ExplicitRungeKutta('RK4_38'),
-                    'formulation': StretchingFormulation.MIXED_GRAD_UW,
-                    'order':8
+                    'rk_scheme': ExplicitRungeKutta('Euler'),
+                    'formulation': StretchingFormulation.GRAD_UW,
+                    'order':4
                 }
             }
         }
diff --git a/hysop/codegen/base/kernel_codegen.py b/hysop/codegen/base/kernel_codegen.py
index 18ca84aff..c0957a7c8 100644
--- a/hysop/codegen/base/kernel_codegen.py
+++ b/hysop/codegen/base/kernel_codegen.py
@@ -30,12 +30,13 @@ class KernelCodeGenerator(KernelBase, OpenClCodeGenerator):
         assert isinstance(typegen, OpenClTypeGen)
         assert isinstance(kernel_args, ArgDict)
         assert work_dim>0 and work_dim<=3
-        
-        if (vec_type_hint not in typegen.builtin_types):
-            msg = 'Invalid vec_type_hint \'{}\'.'.format(vec_type_hint)
-            raise ValueError(msg)
-        if typegen.components(vec_type_hint) == 1:
-            vec_type_hint = None
+
+        if (vec_type_hint is not None):
+            if (vec_type_hint not in typegen.builtin_types):
+                msg = 'Invalid vec_type_hint \'{}\'.'.format(vec_type_hint)
+                raise ValueError(msg)
+            if typegen.components(vec_type_hint) == 1:
+                    vec_type_hint = None
 
         self.vec_type_hint = vec_type_hint
         self.work_dim = work_dim
diff --git a/hysop/codegen/kernels/bandwidth.py b/hysop/codegen/kernels/bandwidth.py
new file mode 100644
index 000000000..faa59e665
--- /dev/null
+++ b/hysop/codegen/kernels/bandwidth.py
@@ -0,0 +1,102 @@
+
+import contextlib
+from contextlib import contextmanager
+
+import operator, hashlib
+
+from hysop import __VERBOSE__, __KERNEL_DEBUG__
+
+from hysop.tools.misc import Utils
+from hysop.constants import np
+
+from hysop.codegen.base.opencl_codegen import OpenClCodeGenerator
+from hysop.codegen.base.kernel_codegen import KernelCodeGenerator
+from hysop.codegen.base.variables      import CodegenVariable, CodegenVectorClBuiltin
+from hysop.codegen.base.types          import OpenClTypeGen
+from hysop.codegen.base.utils          import WriteOnceDict, ArgDict
+
+from hysop.gpu import cl, clCharacterize
+from hysop.gpu.tools import OpenCLEnvironment
+from hysop.gpu.gpu_kernel import KernelLauncher
+from hysop.gpu.kernel_autotuner import KernelAutotuner, AutotunerConfig
+
+class BandwidthKernel(KernelCodeGenerator):
+
+    @staticmethod
+    def codegen_name(vtype,nreads,nwrites):
+        vtype = vtype.replace(' ','_')
+        name = 'bandwidth_{}_{}_{}'.format(vtype,nreads,nwrites)
+        return name
+    
+    def __init__(self, typegen, vtype, nreads, nwrites, known_vars=None):
+
+        assert nreads>0 and nwrites>0
+        
+        kernel_args = self.gen_kernel_arguments(typegen, vtype)
+        
+        name = BandwidthKernel.codegen_name(vtype, nreads, nwrites)
+
+        super(BandwidthKernel,self).__init__(
+                name=name,
+                typegen=typegen,
+                work_dim=1, 
+                kernel_args=kernel_args,
+                known_vars=known_vars)
+        
+        self.vtype   = vtype
+        self.nreads  = nreads
+        self.nwrites = nwrites
+        self.gencode()
+    
+
+    def gen_kernel_arguments(self, typegen, vtype):
+        kargs = ArgDict()
+        kargs['dst'] = CodegenVariable(ctype=vtype,name='dst',ptr=True,typegen=typegen,
+                nl=True,restrict=True,storage='__global')
+        kargs['src'] = CodegenVariable(ctype=vtype,name='src',ptr=True,typegen=typegen,
+                nl=True,restrict=True,storage='__global',const=True)
+        return kargs
+
+    def gencode(self):
+        s = self
+        vtype = s.vtype
+
+        global_id    = s.vars['global_id']
+        global_size  = s.vars['global_size']
+
+        dst   = s.vars['dst']
+        src   = s.vars['src']
+
+        buf = CodegenVariable(ctype=vtype,name='buffer',typegen=self.typegen,init=0)
+
+        with s._kernel_():
+            global_size.declare(s,const=True)
+            global_id.declare(s,const=True)
+            s.jumpline()
+            buf.declare(s)
+            s.jumpline()
+            
+            if global_size.known():
+                s.pragma('unroll')
+            with s._for_('int {i}=0; {i}<{N}; {i}++'.format(i='i',N=self.nreads)):
+                offset = '{}+i*{}'.format(global_id(),global_size())
+                code = '{} += {};'.format(buf(), src[offset])
+                s.append(code)
+
+            s.jumpline()
+            
+            if global_size.known():
+                s.pragma('unroll')
+            with s._for_('int {i}=0; {i}<{N}; {i}++'.format(i='i',N=self.nwrites)):
+                offset = '{}+i*{}'.format(global_id(),global_size())
+                code = '{} = {};'.format(dst[offset], buf())
+                s.append(code)
+
+if __name__ == '__main__':
+    from hysop.gpu import default_typegen
+    
+    tg = default_typegen('float')
+    vtype = 'float'
+    
+    ck = BandwidthKernel(tg, vtype, nreads=99, nwrites=1, known_vars={'global_size':1024})
+    ck.edit()
diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py
index 823fa3596..1250c0294 100644
--- a/hysop/codegen/kernels/directional_stretching.py
+++ b/hysop/codegen/kernels/directional_stretching.py
@@ -188,7 +188,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                                           [clCharacterize.get_simd_group_size(device,1),1,1])
             max_local_size  = np.maximum(min_local_size,
                               np.minimum(Utils.upper_pow2(work_size), 
-                     [clCharacterize.usable_local_mem_size(device)//cl_env.prec_size,1,1])
+                     [clCharacterize.usable_local_mem_size(device)/(dim*cl_env.prec_size),1,1])
                                 )
             autotuner_flag = autotuner_config.autotuner_flag
             if (autotuner_flag == AutotunerFlags.ESTIMATE):
diff --git a/hysop/constants.py b/hysop/constants.py
index 6c909fed5..f85a459e9 100755
--- a/hysop/constants.py
+++ b/hysop/constants.py
@@ -13,7 +13,7 @@ import math
 PI = math.pi
 
 # Set default type for real and integer numbers
-HYSOP_REAL = np.float32
+HYSOP_REAL = np.float64
 SIZEOF_HYSOP_REAL = int(HYSOP_REAL(1.).nbytes)
 
 # type for array indices
diff --git a/hysop/gpu/kernel_autotuner.py b/hysop/gpu/kernel_autotuner.py
index cebf91af9..b8d33873e 100644
--- a/hysop/gpu/kernel_autotuner.py
+++ b/hysop/gpu/kernel_autotuner.py
@@ -479,7 +479,7 @@ class KernelAutotuner(object):
             min_local_size = [min_local_size]
         else:
             min_local_size = list(min_local_size)
-        min_local_size += [1]*(3-self.work_dim)
+        #min_local_size += [1]*(3-self.work_dim)
         min_local_size = np.asarray(min_local_size)
         return lambda local_size,**kargs: (local_size>=min_local_size).all()
     def max_workitems_per_direction(self, max_local_size, **kargs):
@@ -487,7 +487,7 @@ class KernelAutotuner(object):
             max_local_size = [max_local_size]
         else:
             max_local_size = list(max_local_size)
-        max_local_size+=[1]*(3-self.work_dim)
+        #max_local_size+=[1]*(3-self.work_dim)
         max_local_size = np.asarray(max_local_size)
         return lambda local_size,**kargs: (local_size<=max_local_size).all()
         
diff --git a/hysop/gpu/static_gpu_particle_advection_dir.py b/hysop/gpu/static_gpu_particle_advection_dir.py
index 146831b40..bda9b17df 100644
--- a/hysop/gpu/static_gpu_particle_advection_dir.py
+++ b/hysop/gpu/static_gpu_particle_advection_dir.py
@@ -448,6 +448,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
 
         self.vorticity = self.fields_on_grid[0]
 
+        ftype = self.cl_env.typegen.fbtype
         compute_resolution = self.fields_topo.mesh.compute_resolution
         boundary='periodic'
         order=self._stretching['order']
@@ -459,7 +460,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
 
         (kernel_launcher, kernel_args, kernel_args_mapping, 
                 total_work, per_work_statistic, cached_bytes) = \
-                DirectionalStretchingKernel.autotune(self.cl_env, self.cl_env.typegen.fbtype, 
+                DirectionalStretchingKernel.autotune(self.cl_env, ftype,
                         self.dim,self.direction,
                         compute_resolution,
                         boundary,order,
@@ -470,6 +471,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
                         autotuner_config=self._autotuner_config)
 
         callback_profiler.register_tasks('stretching', 
+                ftype=ftype,
                 total_work=total_work, 
                 per_work_statistic=per_work_statistic)
 
diff --git a/hysop/tools/callback.py b/hysop/tools/callback.py
index dd91f8538..a89cbb16c 100644
--- a/hysop/tools/callback.py
+++ b/hysop/tools/callback.py
@@ -121,7 +121,7 @@ class MemcpyInterface(MemInterface):
         return TimerInterface.__str__(self) + s
 
 class ComputeInterface(MemInterface):
-    def __init__(self,total_work,per_work_statistic,**kargs):
+    def __init__(self,total_work,per_work_statistic,ftype='float',**kargs):
         if not isinstance(per_work_statistic, WorkStatistics):
             raise ValueError('per_work_statistic is not a WorkStatistics')
         if total_work<1:
@@ -129,7 +129,8 @@ class ComputeInterface(MemInterface):
         
         membytes = total_work*per_work_statistic.global_mem_transactions()
         super(ComputeInterface, self).__init__(membytes=membytes,**kargs)
-
+        
+        self.ftype = ftype
         self.total_work           = total_work
         self.per_work_statistic   = per_work_statistic
         self.total_work_statistic = WorkStatistics()
@@ -150,13 +151,33 @@ class ComputeInterface(MemInterface):
         
         timed_stats = self.stats_per_second()
         if (timed_stats is not None):
+
+            if self.ftype=='float':
+                float_op_category = 'FLOPS (FP32)'
+                float_op_factor=1.0
+            elif self.ftype == 'double':
+                float_op_category = 'FLOPS (FP64)'
+                float_op_factor=0.5
+            elif self.ftype == 'half':
+                float_op_category = 'FLOPS (FP16)'
+                float_op_factor=2.0
+            else:
+                raise ValueError()
+            flops = timed_stats.ops_per_category()['FLOPS']
+            flops *= float_op_factor
+
+            opi = flops/timed_stats.global_mem_transactions()
+        
             if timed_stats.global_mem_throughput()>0:
                 s += '  throughput={}'.format(bdw2str(timed_stats.global_mem_throughput()))
                 if timed_stats.global_mem_throughput() < timed_stats.total_mem_throughput():
                     s+= ' (tot={})'.format(bdw2str(timed_stats.total_mem_throughput()))
-                s += ' OPI={}'.format(unit2str(float(timed_stats.ops_per_category()['FLOPS'])/timed_stats.global_mem_transactions(), 'FLOP/B',decimal=True,rounded=2))
+                s += ' OPI={}'.format(unit2str(opi,'FLOP/B',decimal=True,rounded=2))
             for (op_category, ops_per_second) in timed_stats.ops_per_second().iteritems():
-                s += '  {}'.format(unit2str(ops_per_second,op_category,decimal=True,rounded=2))
+                if op_category!='FLOPS':
+                    s += '  {}'.format(unit2str(ops_per_second,op_category,decimal=True,rounded=2))
+                else:
+                    s += '  {}'.format(unit2str(ops_per_second*float_op_factor,float_op_category,decimal=True,rounded=2))
 
         return TimerInterface.__str__(self) + s
 
-- 
GitLab