From a393d2c115bb1e38f78ba505d1b29759f72104fd Mon Sep 17 00:00:00 2001
From: Keck Jean-Baptiste <jean-baptiste.keck@imag.fr>
Date: Thu, 8 Dec 2016 00:26:29 +0100
Subject: [PATCH] bandwidth

---
 examples/advection_gpu.py                     |  19 ++--
 hysop/codegen/base/kernel_codegen.py          |  13 +--
 hysop/codegen/kernels/bandwidth.py            | 109 ++++++++++++++++++
 .../codegen/kernels/directional_stretching.py |   2 +-
 hysop/constants.py                            |   2 +-
 hysop/gpu/kernel_autotuner.py                 |   4 +-
 .../gpu/static_gpu_particle_advection_dir.py  |   4 +-
 hysop/tools/callback.py                       |  29 ++++-
 8 files changed, 161 insertions(+), 21 deletions(-)
 create mode 100644 hysop/codegen/kernels/bandwidth.py

diff --git a/examples/advection_gpu.py b/examples/advection_gpu.py
index 6c58095b7..9454306e3 100644
--- a/examples/advection_gpu.py
+++ b/examples/advection_gpu.py
@@ -25,13 +25,20 @@ from hysop.numerics.remeshing import L8_4
 
 if __name__=='__main__':
 
-    DIM = 3
     GHOSTS    = 0
     NSCALARS  = 0
-    f_resolution = (513,513,513)[:DIM]
+
+    DIM = 3
+    f_resolution = (257,257,257)[:DIM]  # active setup, pick one of the alternatives below
+    #f_resolution = (129,129,129)[:DIM]
+    #f_resolution = (513,513,257)[:DIM]
+
+    #DIM = 2
+    #f_resolution = (2049,2049,2049)[:DIM]
+    #f_resolution = (4097,4097,4097)[:DIM]
+    #f_resolution = (8193,8193,8193)[:DIM]
 
     v_resolution = f_resolution
-    #v_resolution = (33,33,33)[:DIM]
     ghosts = (GHOSTS,)*DIM
 
     def initVel(res, *args):
@@ -140,9 +147,9 @@ if __name__=='__main__':
             'autotuner_config':autotuner_config,
             'use_builtin_copy':True,
             'stretching': {
-                'rk_scheme': ExplicitRungeKutta('RK4_38'),
-                'formulation': StretchingFormulation.MIXED_GRAD_UW,
-                'order':8
+                'rk_scheme': ExplicitRungeKutta('Euler'),
+                'formulation': StretchingFormulation.GRAD_UW,
+                'order':4
             }
         }
     }
diff --git a/hysop/codegen/base/kernel_codegen.py b/hysop/codegen/base/kernel_codegen.py
index 18ca84aff..c0957a7c8 100644
--- a/hysop/codegen/base/kernel_codegen.py
+++ b/hysop/codegen/base/kernel_codegen.py
@@ -30,12 +30,13 @@ class KernelCodeGenerator(KernelBase, OpenClCodeGenerator):
         assert isinstance(typegen, OpenClTypeGen)
         assert isinstance(kernel_args, ArgDict)
         assert work_dim>0 and work_dim<=3
-
-        if (vec_type_hint not in typegen.builtin_types):
-            msg = 'Invalid vec_type_hint \'{}\'.'.format(vec_type_hint)
-            raise ValueError(msg)
-        if typegen.components(vec_type_hint) == 1:
-            vec_type_hint = None
+
+        if (vec_type_hint is not None):
+            if (vec_type_hint not in typegen.builtin_types):
+                msg = 'Invalid vec_type_hint \'{}\'.'.format(vec_type_hint)
+                raise ValueError(msg)
+            if typegen.components(vec_type_hint) == 1:
+                vec_type_hint = None
 
         self.vec_type_hint = vec_type_hint
         self.work_dim = work_dim
diff --git a/hysop/codegen/kernels/bandwidth.py b/hysop/codegen/kernels/bandwidth.py
new file mode 100644
index 000000000..faa59e665
--- /dev/null
+++ b/hysop/codegen/kernels/bandwidth.py
@@ -0,0 +1,109 @@
+
+import contextlib
+from contextlib import contextmanager
+
+import operator, hashlib
+
+from hysop import __VERBOSE__, __KERNEL_DEBUG__
+
+from hysop.tools.misc import Utils
+from hysop.constants import np
+
+from hysop.codegen.base.opencl_codegen import OpenClCodeGenerator
+from hysop.codegen.base.kernel_codegen import KernelCodeGenerator
+from hysop.codegen.base.variables import CodegenVariable, CodegenVectorClBuiltin
+from hysop.codegen.base.types import OpenClTypeGen
+from hysop.codegen.base.utils import WriteOnceDict, ArgDict
+
+from hysop.gpu import cl, clCharacterize
+from hysop.gpu.tools import OpenCLEnvironment
+from hysop.gpu.gpu_kernel import KernelLauncher
+from hysop.gpu.kernel_autotuner import KernelAutotuner, AutotunerConfig
+
+class BandwidthKernel(KernelCodeGenerator):
+    """
+    Generate a global memory bandwidth micro-benchmark kernel:
+    each work-item accumulates 'nreads' strided loads from 'src'
+    and issues 'nwrites' strided stores of the accumulator to 'dst'.
+    """
+
+    @staticmethod
+    def codegen_name(vtype,nreads,nwrites):
+        vtype = vtype.replace(' ','_')
+        name = 'bandwidth_{}_{}_{}'.format(vtype,nreads,nwrites)
+        return name
+
+    def __init__(self, typegen, vtype, nreads, nwrites, known_vars=None):
+
+        assert nreads>0 and nwrites>0
+
+        kernel_args = self.gen_kernel_arguments(typegen, vtype)
+
+        name = BandwidthKernel.codegen_name(vtype, nreads, nwrites)
+
+        super(BandwidthKernel,self).__init__(
+                name=name,
+                typegen=typegen,
+                work_dim=1,
+                kernel_args=kernel_args,
+                known_vars=known_vars)
+
+        self.vtype = vtype
+        self.nreads = nreads
+        self.nwrites = nwrites
+        self.gencode()
+
+    def gen_kernel_arguments(self, typegen, vtype):
+        kargs = ArgDict()
+        kargs['dst'] = CodegenVariable(ctype=vtype,name='dst',ptr=True,typegen=typegen,
+                nl=True,restrict=True,storage='__global')
+        kargs['src'] = CodegenVariable(ctype=vtype,name='src',ptr=True,typegen=typegen,
+                nl=True,restrict=True,storage='__global',const=True)
+        return kargs
+
+    def gencode(self):
+        s = self
+        vtype = s.vtype
+
+        global_id = s.vars['global_id']
+        global_size = s.vars['global_size']
+
+        dst = s.vars['dst']
+        src = s.vars['src']
+
+        buf = CodegenVariable(ctype=vtype,name='buffer',typegen=self.typegen,init=0)
+
+        with s._kernel_():
+            global_size.declare(s,const=True)
+            global_id.declare(s,const=True)
+            s.jumpline()
+            buf.declare(s)
+            s.jumpline()
+
+            # accumulate nreads strided loads (kept live in 'buffer'
+            # so that the reads cannot be optimized out)
+            if global_size.known():
+                s.pragma('unroll')
+            with s._for_('int {i}=0; {i}<{N}; {i}++'.format(i='i',N=self.nreads)):
+                offset = '{}+i*{}'.format(global_id(),global_size())
+                code = '{} += {};'.format(buf(), src[offset])
+                s.append(code)
+
+            s.jumpline()
+
+            # issue nwrites strided stores of the accumulator
+            if global_size.known():
+                s.pragma('unroll')
+            with s._for_('int {i}=0; {i}<{N}; {i}++'.format(i='i',N=self.nwrites)):
+                offset = '{}+i*{}'.format(global_id(),global_size())
+                code = '{} = {};'.format(dst[offset], buf())
+                s.append(code)
+
+if __name__ == '__main__':
+    from hysop.gpu import default_typegen
+
+    tg = default_typegen('float')
+    vtype = 'float'
+
+    ck = BandwidthKernel(tg, vtype, nreads=99, nwrites=1, known_vars={'global_size':1024})
+    ck.edit()
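
Aside (editor's sketch, not part of the patch): a generated kernel like the one above can be timed with a plain pyopencl harness to turn the event duration into an effective bandwidth figure. The OpenCL source below is written by hand to mirror what BandwidthKernel emits for vtype='float', nreads=4, nwrites=1 when global_size is not a compile-time constant (the name follows codegen_name); the buffer sizes and the timing code are illustrative assumptions, not hysop API.

import numpy as np
import pyopencl as cl

N, NREADS, NWRITES = 1024*1024, 4, 1

# Hand-written equivalent of BandwidthKernel(tg, 'float', nreads=4, nwrites=1).
KERNEL_SRC = """
__kernel void bandwidth_float_4_1(__global       float* restrict dst,
                                  __global const float* restrict src)
{
    const int gsize = get_global_size(0);
    const int gid   = get_global_id(0);

    float buffer = 0;

    for (int i=0; i<4; i++)
        buffer += src[gid+i*gsize];

    for (int i=0; i<1; i++)
        dst[gid+i*gsize] = buffer;
}
"""

ctx   = cl.create_some_context()
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
mf    = cl.mem_flags

h_src = np.random.rand(N*NREADS).astype(np.float32)
d_src = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_src)
d_dst = cl.Buffer(ctx, mf.WRITE_ONLY, size=N*NWRITES*4)

kernel = cl.Program(ctx, KERNEL_SRC).build().bandwidth_float_4_1
evt    = kernel(queue, (N,), None, d_dst, d_src)
evt.wait()

seconds = (evt.profile.end - evt.profile.start)*1e-9
nbytes  = N*(NREADS+NWRITES)*4             # bytes moved (float = 4 bytes)
print('effective bandwidth: {:.2f} GB/s'.format(nbytes/seconds/1.0e9))

In practice one would warm the device up and average over several launches; a single profiled event only gives a rough estimate.
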
diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py
index 823fa3596..1250c0294 100644
--- a/hysop/codegen/kernels/directional_stretching.py
+++ b/hysop/codegen/kernels/directional_stretching.py
@@ -188,7 +188,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                 [clCharacterize.get_simd_group_size(device,1),1,1])
         max_local_size = np.maximum(min_local_size,
                 np.minimum(Utils.upper_pow2(work_size),
-                    [clCharacterize.usable_local_mem_size(device)//cl_env.prec_size,1,1])
+                    [clCharacterize.usable_local_mem_size(device)/(dim*cl_env.prec_size),1,1])
                 )
         autotuner_flag = autotuner_config.autotuner_flag
         if (autotuner_flag == AutotunerFlags.ESTIMATE):
diff --git a/hysop/constants.py b/hysop/constants.py
index 6c909fed5..f85a459e9 100755
--- a/hysop/constants.py
+++ b/hysop/constants.py
@@ -13,7 +13,7 @@ import math
 PI = math.pi
 
 # Set default type for real and integer numbers
-HYSOP_REAL = np.float32
+HYSOP_REAL = np.float64
 SIZEOF_HYSOP_REAL = int(HYSOP_REAL(1.).nbytes)
 
 # type for array indices
diff --git a/hysop/gpu/kernel_autotuner.py b/hysop/gpu/kernel_autotuner.py
index cebf91af9..b8d33873e 100644
--- a/hysop/gpu/kernel_autotuner.py
+++ b/hysop/gpu/kernel_autotuner.py
@@ -479,7 +479,7 @@ class KernelAutotuner(object):
             min_local_size = [min_local_size]
         else:
             min_local_size = list(min_local_size)
-        min_local_size += [1]*(3-self.work_dim)
+        #min_local_size += [1]*(3-self.work_dim)
         min_local_size = np.asarray(min_local_size)
         return lambda local_size,**kargs: (local_size>=min_local_size).all()
     def max_workitems_per_direction(self, max_local_size, **kargs):
@@ -487,7 +487,7 @@ class KernelAutotuner(object):
             max_local_size = [max_local_size]
         else:
             max_local_size = list(max_local_size)
-        max_local_size+=[1]*(3-self.work_dim)
+        #max_local_size+=[1]*(3-self.work_dim)
         max_local_size = np.asarray(max_local_size)
         return lambda local_size,**kargs: (local_size<=max_local_size).all()
 
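
Aside (editor's sketch, not part of the patch): with the [1]*(3-work_dim) padding commented out, the closures returned by min_workitems_per_direction and max_workitems_per_direction compare candidate local sizes element-wise over the actual work dimensions only, so callers are expected to pass work_dim-length candidates. A toy reproduction of the min-side predicate, with illustrative names:

import numpy as np

def min_workitems_per_direction(min_local_size):
    if np.isscalar(min_local_size):
        min_local_size = [min_local_size]
    else:
        min_local_size = list(min_local_size)
    # no padding to length 3 anymore: stay in work_dim dimensions
    min_local_size = np.asarray(min_local_size)
    return lambda local_size: (np.asarray(local_size) >= min_local_size).all()

ok = min_workitems_per_direction(8)   # candidates for work_dim == 1
print(ok([16]))   # True
print(ok([4]))    # False
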
diff --git a/hysop/gpu/static_gpu_particle_advection_dir.py b/hysop/gpu/static_gpu_particle_advection_dir.py
index 146831b40..bda9b17df 100644
--- a/hysop/gpu/static_gpu_particle_advection_dir.py
+++ b/hysop/gpu/static_gpu_particle_advection_dir.py
@@ -448,6 +448,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
 
         self.vorticity = self.fields_on_grid[0]
 
+        ftype = self.cl_env.typegen.fbtype
         compute_resolution = self.fields_topo.mesh.compute_resolution
         boundary='periodic'
         order=self._stretching['order']
@@ -459,7 +460,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
 
         (kernel_launcher, kernel_args, kernel_args_mapping,
                 total_work, per_work_statistic, cached_bytes) = \
-            DirectionalStretchingKernel.autotune(self.cl_env, self.cl_env.typegen.fbtype,
+            DirectionalStretchingKernel.autotune(self.cl_env, ftype,
                 self.dim,self.direction,
                 compute_resolution,
                 boundary,order,
@@ -470,6 +471,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
                 autotuner_config=self._autotuner_config)
 
         callback_profiler.register_tasks('stretching',
+                ftype=ftype,
                 total_work=total_work,
                 per_work_statistic=per_work_statistic)
 
diff --git a/hysop/tools/callback.py b/hysop/tools/callback.py
index dd91f8538..a89cbb16c 100644
--- a/hysop/tools/callback.py
+++ b/hysop/tools/callback.py
@@ -121,7 +121,7 @@ class MemcpyInterface(MemInterface):
         return TimerInterface.__str__(self) + s
 
 class ComputeInterface(MemInterface):
-    def __init__(self,total_work,per_work_statistic,**kargs):
+    def __init__(self,total_work,per_work_statistic,ftype='float',**kargs):
         if not isinstance(per_work_statistic, WorkStatistics):
             raise ValueError('per_work_statistic is not a WorkStatistics')
         if total_work<1:
@@ -129,7 +129,8 @@ class ComputeInterface(MemInterface):
 
         membytes = total_work*per_work_statistic.global_mem_transactions()
         super(ComputeInterface, self).__init__(membytes=membytes,**kargs)
-
+
+        self.ftype = ftype
         self.total_work = total_work
         self.per_work_statistic = per_work_statistic
         self.total_work_statistic = WorkStatistics()
@@ -150,13 +151,33 @@ class ComputeInterface(MemInterface):
 
         timed_stats = self.stats_per_second()
         if (timed_stats is not None):
+
+            if self.ftype=='float':
+                float_op_category = 'FLOPS (FP32)'
+                float_op_factor=1.0
+            elif self.ftype == 'double':
+                float_op_category = 'FLOPS (FP64)'
+                float_op_factor=0.5
+            elif self.ftype == 'half':
+                float_op_category = 'FLOPS (FP16)'
+                float_op_factor=2.0
+            else:
+                raise ValueError('Unknown ftype \'{}\'.'.format(self.ftype))
+
+            flops = timed_stats.ops_per_category()['FLOPS']
+            flops *= float_op_factor
+
+            opi = flops/timed_stats.global_mem_transactions()
             if timed_stats.global_mem_throughput()>0:
                 s += ' throughput={}'.format(bdw2str(timed_stats.global_mem_throughput()))
                 if timed_stats.global_mem_throughput() < timed_stats.total_mem_throughput():
                     s+= ' (tot={})'.format(bdw2str(timed_stats.total_mem_throughput()))
-            s += ' OPI={}'.format(unit2str(float(timed_stats.ops_per_category()['FLOPS'])/timed_stats.global_mem_transactions(),'FLOP/B',decimal=True,rounded=2))
+            s += ' OPI={}'.format(unit2str(opi,'FLOP/B',decimal=True,rounded=2))
             for (op_category, ops_per_second) in timed_stats.ops_per_second().iteritems():
-                s += ' {}'.format(unit2str(ops_per_second,op_category,decimal=True,rounded=2))
+                if op_category!='FLOPS':
+                    s += ' {}'.format(unit2str(ops_per_second,op_category,decimal=True,rounded=2))
+                else:
+                    s += ' {}'.format(unit2str(ops_per_second*float_op_factor,float_op_category,decimal=True,rounded=2))
 
         return TimerInterface.__str__(self) + s
 
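
Aside (editor's sketch, not part of the patch): the scaling introduced above, reduced to a standalone helper. It assumes the profiler counts floating-point operations in FP32-equivalent units, so FP64 work is scaled by 0.5 and FP16 work by 2.0 before being reported, and the operational intensity (OPI) is the scaled FLOP count per byte of global memory traffic:

def normalized_flops_and_opi(flops, mem_bytes, ftype='float'):
    # (scaling factor, reported category) per floating-point type
    factors = {'float':  (1.0, 'FLOPS (FP32)'),
               'double': (0.5, 'FLOPS (FP64)'),
               'half':   (2.0, 'FLOPS (FP16)')}
    if ftype not in factors:
        raise ValueError('Unknown ftype \'{}\'.'.format(ftype))
    factor, category = factors[ftype]
    flops *= factor
    return flops, flops/mem_bytes, category

# 2 TFLOP/s of FP32-equivalent work over 400 GB/s of traffic, run in double:
flops, opi, category = normalized_flops_and_opi(2.0e12, 4.0e11, 'double')
print('{}: {:.1e}, OPI={:.2f} FLOP/B'.format(category, flops, opi))
# -> FLOPS (FP64): 1.0e+12, OPI=2.50 FLOP/B
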
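Combined with a bandwidth figure from the new micro-benchmark kernel, the reported OPI supports a roofline-style sanity check (editor's note, not from the patch): a kernel of operational intensity I on a device with sustained bandwidth B and peak compute P cannot exceed min(P, B*I) FLOP/s. With illustrative numbers:

peak_flops = 10.0e12   # P, FLOP/s (assumed device peak)
bandwidth  = 500.0e9   # B, B/s    (e.g. measured with BandwidthKernel)
opi        = 2.5       # I, FLOP/B (as reported by ComputeInterface)
print('attainable: {:.2e} FLOP/s'.format(min(peak_flops, bandwidth*opi)))
# -> attainable: 1.25e+12 FLOP/s (memory bound)
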
--
GitLab