Commit 3dc9fd9d authored by Keck Jean-Baptiste

began kernel autotuner

parent 9513cb95
@@ -58,7 +58,6 @@ class GPUStretching(DiscreteOperator, GPUOperator):
         self.order = 2 if self.method[SpaceDiscretisation] is FDC2 else 4

         # Worksize handling
-        #TODO
         self._cl_work_size = 0

         ## GPU allocations
@@ -112,21 +111,22 @@ class GPUStretching(DiscreteOperator, GPUOperator):
             raise NotImplementedError(msg)

     def _gen_cl_src(self):
-        topo = self.velocity.topology
-        mesh = topo.mesh
+        typegen = self.cl_env.typegen
+        topo = self.velocity.topology
         dim  = self.dim
+        mesh = topo.mesh

-        gwi = (256,256,256)
-        lwi = (4,4,4)
+        gwi = mesh.
+        lwi = (8,8,8)

         codegen, prg = self._gen_and_build_kernel(lwi, dump_src=True)

         cache_bytes = codegen.cache_alloc_bytes(local_size=lwi)
         self.local_mem = cl.LocalMemory(cache_bytes)
         self.size_local_alloc += cache_bytes

         from hysop.codegen.structs.mesh_info import MeshInfoStruct
-        mesh_info = MeshInfoStruct.build_instance_from_mesh(self.cl_env.typegen, mesh)
+        mesh_info = MeshInfoStruct.build_instance_from_mesh(typegen, mesh)
         mesh_info_buffer = cl.Buffer(self.cl_env.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                 hostbuf=mesh_info)
         self.mesh_info_buffer = mesh_info_buffer
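
For context, MeshInfoStruct.build_instance_from_mesh produces a host-side numpy structured array that is copied once into a read-only device buffer. A minimal standalone sketch of this pyopencl pattern, with a made-up struct layout (the real MeshInfoStruct fields are not shown in this diff):

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    mf  = cl.mem_flags

    # hypothetical mesh metadata packed as a numpy structured array
    mesh_info = np.zeros(1, dtype=[('resolution', np.int32,   3),
                                   ('dx',         np.float32, 3)])
    mesh_info['resolution'] = (256, 256, 256)
    mesh_info['dx']         = (1.0/256,)*3

    # READ_ONLY | COPY_HOST_PTR copies the host data into the device
    # buffer at creation time, as done for mesh_info_buffer above
    buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mesh_info)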
@@ -165,20 +165,17 @@ class GPUStretching(DiscreteOperator, GPUOperator):
         return codegen, prg

-    def _compute_stretching(self, simulation, to_gpu=True, to_host=True):
-        if to_gpu:
-            for field in self.input:
-                field.toDevice()
-        input_events = [evt for input in self.input for evt in input.events]
-
+    def _compute_stretching(self, simulation):
         dt = self.cl_env.typegen.make_floatn(simulation.time_step,1)
         kernel_args = [dt] + self.velocity.gpu_data + self.vorticity.gpu_data \
-                + [self.mesh_info_buffer] + [self.local_mem]
+                + [self.mesh_info_buffer, self.local_mem]
+
+        input_events = [evt for input in self.input for evt in input.events]
         stretching_evt = self.kernels['stretching'](*kernel_args, wait_for=input_events)
+
+        output_events = [stretching_evt]
+        self.vorticity.events += output_events

     def apply(self, simulation):
         self._compute(simulation)
...
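The _compute_stretching rewrite above moves to purely event-based synchronization: the kernel waits on the events attached to its input fields and appends its own event to the output field. A minimal sketch of this wait_for/event pattern in plain pyopencl (toy kernel, not the HySoP stretching kernel):

    import numpy as np
    import pyopencl as cl

    ctx   = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    host = np.arange(16, dtype=np.float32)
    buf  = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=host.nbytes)

    prg = cl.Program(ctx, """
        __kernel void scale(__global float* a) { a[get_global_id(0)] *= 2.0f; }
    """).build()

    # the copy returns an event; the kernel waits on it through wait_for,
    # and its own event can be handed to downstream operators in turn
    copy_evt   = cl.enqueue_copy(queue, buf, host, is_blocking=False)
    kernel_evt = prg.scale(queue, host.shape, None, buf, wait_for=[copy_evt])
    kernel_evt.wait()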
@@ -13,6 +13,127 @@
 FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64

 __cl_env = None

+import itertools  # needed by KernelAutotuner._get_wi_candidates
+
+class KernelError(Exception):
+    """Error raised when an OpenCL kernel fails."""
+    def __init__(self, msg, err):
+        super(KernelError,self).__init__(msg)
+        self.msg = msg
+        self.err = err
+
+    def __str__(self):
+        return self.err + ': ' + self.msg
+
+
+class OpenClKernelStatistics(object):
+    """Execution time statistics gathered from OpenCL profiling events."""
+    def __init__(self, events=None):
+        if events is not None:
+            p0 = events[0].profile
+            t0 = p0.end - p0.start
+            total = 0
+            maxi = t0
+            mini = t0
+            for evt in events:
+                dt = evt.profile.end - evt.profile.start
+                total += dt
+                if dt<mini:
+                    mini = dt
+                if dt>maxi:
+                    maxi = dt
+            self.tot  = total
+            self.min  = mini
+            self.max  = maxi
+            self.mean = total/len(events)
+        else:
+            self.tot  = 0
+            self.min  = 0
+            self.max  = 0
+            self.mean = 0
+
+    def __str__(self):
+        # profiling timestamps are in nanoseconds, report milliseconds
+        mini = self.min  * 1e-6
+        maxi = self.max  * 1e-6
+        mean = self.mean * 1e-6
+        return 'min={:.2f}ms, max={:.2f}ms, mean={:.2f}ms'.format(mini,maxi,mean)
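
OpenClKernelStatistics relies on OpenCL event profiling: the queue must be created with PROFILING_ENABLE, and evt.profile.start/end are device-side timestamps in nanoseconds, valid once the event has completed. A small usage sketch with a toy kernel:

    import pyopencl as cl

    ctx   = cl.create_some_context()
    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=1024*4)
    prg = cl.Program(ctx, """
        __kernel void fill(__global float* a) { a[get_global_id(0)] = 1.0f; }
    """).build()

    evts = [prg.fill(queue, (1024,), None, buf) for i in xrange(10)]
    queue.finish()  # profiling data is valid once events have completed
    print OpenClKernelStatistics(evts)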
+
+class KernelAutotuner(object):
+    """OpenCl kernel work group size autotuner."""
+
+    def __init__(self, work_dim, nruns=10):
+        """Initialize a KernelAutotuner.
+
+        Parameters
+        ----------
+        work_dim: int
+            Work dimension used in targetted OpenCL kernels.
+        nruns: int
+            Number of profiled runs used to time each candidate.
+        """
+        self.work_dim = work_dim
+        self.nruns = nruns
+        self._load_default_filters()
+
+    def add_filter(self, fname, f):
+        self.filters[fname] = f
+        return self
+
+    def bench(self, ctx, device, global_size, args, kernel=None, kernel_generator=None, **kargs):
+        assert isinstance(args, list)
+        if not ((kernel is None) ^ (kernel_generator is None)):
+            raise ValueError('Exactly one of kernel and kernel_generator should be given!')
+        if kernel_generator is None:
+            kernel_generator = lambda *largs, **lkargs: (kernel, args)
+        for local_size in self._get_wi_candidates(ctx, device, global_size, **kargs):
+            kernel, args = kernel_generator(ctx, device, global_size, **kargs)
+            stats = self._bench_one(ctx, device, global_size, local_size, kernel, args)
+            print '{}\t{}'.format(local_size, stats)
+
+    def _bench_one(self, ctx, device, global_size, local_size, kernel, args):
+        evts = []
+        with cl.CommandQueue(ctx, device, cl.command_queue_properties.PROFILING_ENABLE) as queue:
+            for i in xrange(self.nruns):
+                evt = kernel(queue, global_size, local_size, *args)
+                evts.append(evt)
+            # wait for all runs so that profiling data is available
+            queue.finish()
+        return OpenClKernelStatistics(evts)
+
+    def _get_wi_candidates(self, ctx, device, global_size, **kargs):
+        # candidate sizes are all powers of two up to the device limit
+        pows = []
+        size = device.max_work_group_size
+        while size > 0:
+            pows.append(size)
+            size >>= 1
+        candidates = itertools.product(pows, pows, pows)
+        # chain all registered filters to prune invalid configurations
+        for f in self.filters.values():
+            F = f(ctx=ctx, device=device, global_size=global_size, **kargs)
+            candidates = itertools.ifilter(F, candidates)
+        return candidates
+
+    def _load_default_filters(self):
+        self.filters = {}
+        self.add_filter('dim_reqs',  self._dim_filter)
+        self.add_filter('ordering',  self._ordering_filter)
+        self.add_filter('minmax_wi', self._minmax_workitems_filter)
+
+    # filters
+    def _dim_filter(self, device, **kargs):
+        work_dim   = self.work_dim
+        max_wi_dim = device.max_work_item_dimensions
+        return lambda local_size: (work_dim<=max_wi_dim) \
+                and all(ls==1 for ls in local_size[work_dim:])
+
+    def _ordering_filter(self, **kargs):
+        return lambda local_size: (local_size[2]<=local_size[1]) and (local_size[1]<=local_size[0])
+
+    def _global_size_filter(self, global_size, **kargs):
+        return lambda local_size: (local_size[0]<=global_size[0]) \
+                and (local_size[1]<=global_size[1]) and (local_size[2]<=global_size[2])
+
+    def _minmax_workitems_filter(self, device, **kargs):
+        max_wg_size = device.max_work_group_size
+        def filter(local_size):
+            wi = 1
+            for i in xrange(3):
+                wi *= local_size[i]
+            return (wi>=max_wg_size/8) and (wi<=max_wg_size)
+        return filter
 class OpenCLEnvironment(object):
     """OpenCL environment information and useful functions.
     """
...
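
A possible way to drive the new autotuner, assuming a pre-built kernel taking a single buffer argument (hypothetical usage, not part of this commit; the global size filter is defined but not registered by default, so it is added explicitly here to keep every candidate launchable):

    import pyopencl as cl

    ctx    = cl.create_some_context()
    device = ctx.devices[0]

    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=4*(64**3))
    prg = cl.Program(ctx, """
        __kernel void fill(__global float* a) {
            size_t i = get_global_id(0)
                     + get_global_size(0)*(get_global_id(1)
                     + get_global_size(1)*get_global_id(2));
            a[i] = 1.0f;
        }
    """).build()

    # prints one profiled timing line per surviving work group size
    autotuner = KernelAutotuner(work_dim=3, nruns=10)
    autotuner.add_filter('global_size', autotuner._global_size_filter)
    autotuner.bench(ctx, device, global_size=(64,64,64),
                    args=[buf], kernel=prg.fill)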