From 3dc9fd9d2d92c7211630b8b5b5b7a1c348d76711 Mon Sep 17 00:00:00 2001
From: Keck Jean-Baptiste <jbkeck@hotmail.com>
Date: Wed, 6 Jul 2016 11:33:29 +0200
Subject: [PATCH] began kernel autotuner

---
 hysop/gpu/gpu_stretching.py |  35 +++++------
 hysop/gpu/tools.py          | 121 ++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 19 deletions(-)

diff --git a/hysop/gpu/gpu_stretching.py b/hysop/gpu/gpu_stretching.py
index e6aa7cc41..c2b1bcbf3 100644
--- a/hysop/gpu/gpu_stretching.py
+++ b/hysop/gpu/gpu_stretching.py
@@ -58,7 +58,6 @@ class GPUStretching(DiscreteOperator, GPUOperator):
         self.order = 2 if self.method[SpaceDiscretisation] is FDC2 else 4
 
         # Worksize handling
-        #TODO
         self._cl_work_size = 0
 
         ## GPU allocations
@@ -112,21 +111,22 @@ class GPUStretching(DiscreteOperator, GPUOperator):
             raise NotImplementedError(msg)
 
     def _gen_cl_src(self):
-        topo = self.velocity.topology
-        mesh = topo.mesh
-        dim = self.dim
+        typegen = self.cl_env.typegen
+        topo    = self.velocity.topology
+        dim     = self.dim
+        mesh    = topo.mesh
 
-        gwi = (256,256,256)
-        lwi = (4,4,4)
+        gwi = tuple(mesh.resolution)  # assumption: one work-item per local mesh point
+        lwi = (8,8,8)
 
         codegen, prg = self._gen_and_build_kernel(lwi, dump_src=True)
-
-        cache_bytes = codegen.cache_alloc_bytes(local_size=lwi)
+        cache_bytes = codegen.cache_alloc_bytes(local_size=lwi)
+
         self.local_mem = cl.LocalMemory(cache_bytes)
         self.size_local_alloc += cache_bytes
 
         from hysop.codegen.structs.mesh_info import MeshInfoStruct
-        mesh_info = MeshInfoStruct.build_instance_from_mesh(self.cl_env.typegen, mesh)
+        mesh_info = MeshInfoStruct.build_instance_from_mesh(typegen, mesh)
         mesh_info_buffer = cl.Buffer(self.cl_env.ctx,
             cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=mesh_info)
         self.mesh_info_buffer = mesh_info_buffer
@@ -165,20 +165,17 @@ class GPUStretching(DiscreteOperator, GPUOperator):
 
         return codegen, prg
 
-    def _compute_stretching(self, simulation, to_gpu=True, to_host=True):
-        if to_gpu:
-            for field in self.input:
-                field.toDevice()
-
-        input_events = [evt for input in self.input for evt in input.events]
-
+    def _compute_stretching(self, simulation):
         dt = self.cl_env.typegen.make_floatn(simulation.time_step,1)
         kernel_args = [dt] + self.velocity.gpu_data + self.vorticity.gpu_data \
-                + [self.mesh_info_buffer] + [self.local_mem]
+                + [self.mesh_info_buffer, self.local_mem]
+
+        input_events = [evt for input in self.input for evt in input.events]
         stretching_evt = self.kernels['stretching'](*kernel_args, wait_for=input_events)
+        output_events = [stretching_evt]
 
-        if to_host:
-            self.vorticity.toHost()
+        self.vorticity.events += output_events
+
     def apply(self, simulation):
         self._compute(simulation)
 
diff --git a/hysop/gpu/tools.py b/hysop/gpu/tools.py
index 7fc2d7e7d..b37ed79ac 100644
--- a/hysop/gpu/tools.py
+++ b/hysop/gpu/tools.py
@@ -13,6 +13,127 @@ FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64
 
 __cl_env = None
 
+import itertools  # required by KernelAutotuner below
+
+class KernelError(Exception):
+    def __init__(self, msg, err):
+        super(KernelError,self).__init__(msg)
+        self.msg = msg
+        self.err = err
+
+    def __str__(self):
+        return self.err + ': ' + self.msg
+
+class OpenClKernelStatistics(object):
+    """Min, max, mean and total execution time of a set of profiled OpenCL events."""
+    def __init__(self, events=None):
+        if events is not None:
+            p0 = events[0].profile
+            t0 = p0.end - p0.start
+            total = 0
+            maxi = t0
+            mini = t0
+            for evt in events:
+                dt = evt.profile.end - evt.profile.start
+                total += dt
+                if dt<mini:
+                    mini = dt
+                if dt>maxi:
+                    maxi = dt
+
+            self.tot  = total
+            self.min  = mini
+            self.max  = maxi
+            self.mean = total/len(events)
+        else:
+            self.tot  = 0
+            self.min  = 0
+            self.max  = 0
+            self.mean = 0
+
+    def __str__(self):
+        # OpenCL profiling timestamps are in nanoseconds; convert to milliseconds
+        mini  = self.min  * 1e-6
+        maxi  = self.max  * 1e-6
+        total = self.tot  * 1e-6
+        mean  = self.mean * 1e-6
+        return 'min={:.2f}ms, max={:.2f}ms, mean={:.2f}ms, total={:.2f}ms'.format(mini,maxi,mean,total)
+
+class KernelAutotuner(object):
+    """OpenCl kernel work group size autotuner.
+    """
+    def __init__(self, work_dim, nruns=10):
+        """Initialize a KernelAutotuner.
+
+        Parameters
+        ----------
+        work_dim: int
+            Work dimension used in targeted OpenCL kernels.
+        nruns: int
+            Number of profiled runs used to time each candidate work group size.
+        """
+        self.work_dim = work_dim
+        self.nruns = nruns
+        self._load_default_filters()
+
+    def add_filter(self, fname, f):
+        self.filters[fname] = f
+        return self
+
+    def bench(self, ctx, device, global_size, args, kernel=None, kernel_generator=None, **kargs):
+        assert isinstance(args, list)
+        if not ((kernel is None) ^ (kernel_generator is None)):
+            raise ValueError('Exactly one of kernel and kernel_generator should be given!')
+        if (kernel_generator is None):
+            kernel_generator = lambda ctx, device, global_size, **kargs: (kernel, args)
+
+        for local_size in self._get_wi_candidates(ctx, device, global_size, **kargs):
+            kernel, args = kernel_generator(ctx, device, global_size, **kargs)
+            stats = self._bench_one(ctx, device, global_size, local_size, kernel, args)
+            print '{}\t{}'.format(local_size, stats)
+
+    def _bench_one(self, ctx, device, global_size, local_size, kernel, args):
+        evts = []
+        with cl.CommandQueue(ctx, device,
+                cl.command_queue_properties.PROFILING_ENABLE) as queue:
+            for i in xrange(self.nruns):
+                evt = kernel(queue, global_size, local_size, *args)
+                evts.append(evt)
+        # the queue is finished on exit, so all profiling data is available
+        stats = OpenClKernelStatistics(evts)
+        return stats
+
+    def _get_wi_candidates(self, ctx, device, global_size, **kargs):
+        # candidates are all triples of powers of two up to the device
+        # maximal work group size, pruned by the registered filters
+        pows = []
+        size = device.max_work_group_size
+        while(size>0):
+            pows.append(size)
+            size >>= 1
+
+        candidates = itertools.product(pows,pows,pows)
+        for f in self.filters.values():
+            F = f(ctx=ctx, device=device, global_size=global_size, **kargs)
+            candidates = itertools.ifilter(F, candidates)
+        return candidates
+
+    def _load_default_filters(self):
+        self.filters = {}
+        self.add_filter('dim_reqs', self._dim_filter)
+        self.add_filter('ordering', self._ordering_filter)
+        self.add_filter('minmax_wi', self._minmax_workitems_filter)
+
+    #filters
+    def _dim_filter(self, device, **kargs):
+        work_dim   = self.work_dim
+        max_wi_dim = device.max_work_item_dimensions
+        return lambda local_size: (work_dim<=max_wi_dim) \
+                and all(ls==1 for ls in local_size[work_dim:])
+
+    def _ordering_filter(self, **kargs):
+        return lambda local_size: (local_size[2]<=local_size[1]) and (local_size[1]<=local_size[0])
+
+    def _global_size_filter(self, global_size, **kargs):
+        return lambda local_size: (local_size[0]<=global_size[0]) \
+                and (local_size[1]<=global_size[1]) and (local_size[2]<=global_size[2])
+
+    def _minmax_workitems_filter(self, device, **kargs):
+        def filter(local_size):
+            # bound the total number of work items per work group
+            max_wg_size = device.max_work_group_size
+            wi = 1
+            for i in xrange(3):
+                wi *= local_size[i]
+            return (wi>=max_wg_size/8) and (wi<=max_wg_size)
+        return filter
+
+
 class OpenCLEnvironment(object):
     """OpenCL environment informations and useful functions.
     """
-- 
GitLab
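
Usage sketch (illustration only, not part of the patch): how the new KernelAutotuner is
meant to be driven, assuming the patched hysop/gpu/tools.py is importable. The trivial
fill kernel, the buffer and all sizes below are placeholders invented for this example;
bench() prints one timing line per work group size candidate that survives the filters.

    import numpy as np
    import pyopencl as cl
    from hysop.gpu.tools import KernelAutotuner

    # placeholder kernel: any 3D kernel is enough to exercise the tuner
    src = '''
    __kernel void fill(__global float* dst) {
        size_t i = get_global_id(0)
                 + get_global_size(0)*(get_global_id(1)
                 + get_global_size(1)*get_global_id(2));
        dst[i] = 1.0f;
    }
    '''

    ctx    = cl.create_some_context()
    device = ctx.devices[0]
    kernel = cl.Program(ctx, src).build().fill

    gwi = (64,64,64)  # global work size, one work-item per grid point
    buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                    size=int(np.prod(gwi))*4)

    tuner = KernelAutotuner(work_dim=3, nruns=10)
    tuner.bench(ctx, device, gwi, [buf], kernel=kernel)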