Commit a023fbd9 authored by Jean-Baptiste Keck

directional stretching autotuning ok

parent 83819765
@@ -27,10 +27,10 @@ if __name__=='__main__':
     NSCALARS = 0
     DIM = 3
-    f_resolution = (33,33,33)[:DIM]
-    #f_resolution = (65,65,65)[:DIM]
-    #f_resolution = (129,129,129)[:DIM]
-    #f_resolution = (257,257,257)[:DIM]
+    # f_resolution = (33,33,33)[:DIM]
+    # f_resolution = (65,65,65)[:DIM]
+    f_resolution = (129,129,129)[:DIM]
+    # f_resolution = (257,257,257)[:DIM]
     #f_resolution = (513,513,257)[:DIM]
     #DIM = 2
@@ -201,11 +201,12 @@ class KernelAutotuner(object):
         self.enable_variable_workload = True
         if np.isscalar(max_workitem_workload):
             workloads = _compute_pows(max_workitem_workload)
-            self.workloads = itertools.product(workloads, repeat=3)
+            workloads = itertools.product(workloads, repeat=3)
         else:
             workloads = [_compute_pows(mww) for mww in max_workitem_workload]
-            self.workloads = itertools.product(*workloads)
+            workloads = itertools.product(*workloads)
+        self.workloads = [w for w in workloads]

     def register_extra_parameter(self,name, values):
         if self.extra_parameters is None:
             self.extra_parameters = {}
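The hunk above also materializes the workload iterator: itertools.product returns a one-shot generator, so storing it directly in self.workloads meant the candidate workloads could only be enumerated once. Listing it out lets the autotuner re-enumerate the same workloads for every extra-parameter configuration. A minimal sketch of the difference, with a hypothetical _compute_pows standing in for the real helper (assumed here to return powers of two up to the given maximum):

import itertools

def _compute_pows(max_workload):
    # Hypothetical stand-in: powers of two up to max_workload (1, 2, 4, ...).
    pows, p = [], 1
    while p <= max_workload:
        pows.append(p)
        p *= 2
    return pows

# A bare product is a one-shot iterator: once consumed, a second pass sees nothing.
it = itertools.product(_compute_pows(4), repeat=3)
first_pass = list(it)
second_pass = list(it)
print(len(first_pass))     # 27
print(len(second_pass))    # 0

# Materialized form, as stored in self.workloads after this commit: it can be
# re-enumerated for every extra-parameter configuration.
workloads = [w for w in itertools.product(_compute_pows(4), repeat=3)]
print(workloads[:3])       # [(1, 1, 1), (1, 1, 2), (1, 1, 4)]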
@@ -257,7 +258,9 @@ class KernelAutotuner(object):
         platform = typegen.platform
         device = typegen.device
         ctx = typegen.context
-        if self.autotuner_config.verbose:
+
+        verbose = self.autotuner_config.verbose
+        if verbose:
             print '== Kernel {} Autotuning =='.format(self.name)
             print 'platform: {}'.format(platform.name)
             print 'device: {}'.format(device.name)
@@ -280,20 +283,33 @@ class KernelAutotuner(object):
         best_global_size = None
         best_local_size = None
         best_stats = None
-        best_extra_params = {'is_cached':True}
+        best_extra_params = None
+        dump_cache = False
+        separator = '_'*100
+        indent = lambda i: ' '*i

         for extra_parameters in self.get_extra_parameters():
             if self.extra_parameters is None:
                 extra_parameters = {}
             else:
                 extra_parameters = dict(zip(self.extra_parameters.keys(), extra_parameters))
+            params_hash = hashlib.sha256(str(hash(frozenset(sorted(extra_parameters.items()))))).hexdigest()
+            params_hash = params_hash[:8]

             best_params_workload = None
             best_params_global_size = None
             best_params_local_size = None
             best_params_stats = None

-            for workload in self.get_workloads(work_size):
+            if verbose:
+                print separator
+                print '::Current tuning parameters:: {}'.format(extra_parameters)
+
+            workloads = self.get_workloads(work_size)
+            for workload in workloads:
                 workload=np.asarray(workload)
                 max_global_size = get_max_global_size(work_size, workload)
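The two added params_hash lines give every extra-parameter combination its own cache key: an order-independent view of the parameter dict is hashed and the digest truncated to 8 hex characters. Below is a small sketch of the same recipe, assuming the parameter values are hashable; the parameter names used are made up for illustration. Since the recipe goes through the builtin hash(), the key is only stable across runs when hash randomization is disabled, which is the default under the Python 2 this code targets.

import hashlib

def params_cache_key(extra_parameters):
    # Hash a canonical, order-independent view of the parameter dict and
    # keep only a short prefix, mirroring the params_hash computed above.
    items = frozenset(sorted(extra_parameters.items()))
    digest = hashlib.sha256(str(hash(items)).encode('utf-8')).hexdigest()
    return digest[:8]

key_a = params_cache_key({'vectorization': 4, 'use_local_mem': True})
key_b = params_cache_key({'use_local_mem': True, 'vectorization': 4})
assert key_a == key_b   # insertion order does not change the key
print(key_a)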
@@ -303,11 +319,10 @@ class KernelAutotuner(object):
                 best_workload_candidate = None
                 best_workload_ids = None
-                dump_cache = False
-                print '_'*100
-                print '::Current workload {}::'.format(workload)
-                print ' -> global_size is set to {}'.format(max_global_size)
+                if verbose:
+                    print separator
+                    print indent(1)+'::Current workload {}::'.format(workload)
+                    print indent(2)+'-> global_size is set to {}'.format(max_global_size)

                 candidates = self.get_candidates(ctx=ctx,device=device,
                         max_global_size=max_global_size,**kargs)
@@ -321,8 +336,8 @@ class KernelAutotuner(object):
                     pruned_count = 0
                     unpruned_count = 0
-                    if self.autotuner_config.verbose:
-                        msg='\n Step {} :: running {} candidates over {} runs:'
+                    if verbose:
+                        msg='\n'+indent(2)+'Step {} :: running {} candidates over {} runs:'
                         print msg.format(step,candidates.shape[0],nruns)

                     for local_work_size in candidates:
@@ -338,31 +353,45 @@ class KernelAutotuner(object):
                                 extra_parameters=extra_parameters,
                                 **kargs)
                         except KernelGenerationError:
+                            pruned_count += 1
                             continue
                         global_size = np.asarray(global_size)
                         gwi = tuple(global_size)

                         update=False
-                        if (not self.autotuner_config.override_cache) \
-                                and src_hash in results.keys() \
-                                and ws in results[src_hash].keys() \
-                                and gwi in results[src_hash][ws].keys() \
-                                and lwi in results[src_hash][ws][gwi].keys():
-                            stat = self.results[config][src_hash][ws][gwi][lwi]
+                        pms = params_hash
+
+                        if (not self.autotuner_config.override_cache) \
+                                and src_hash in results.keys() \
+                                and pms in results[src_hash].keys() \
+                                and ws in results[src_hash][pms].keys() \
+                                and gwi in results[src_hash][pms][ws].keys() \
+                                and lwi in results[src_hash][pms][ws][gwi].keys():
+                            stat = self.results[config][src_hash][pms][ws][gwi][lwi]
                             if stat.nruns >= nruns:
                                 if self.autotuner_config.verbose:
-                                    print ' {} {} => {} (cached)'.format(gwi, lwi, stat)
+                                    print indent(3)+'{} {} => {} (cached)'.format(gwi, lwi, stat)
                                 unpruned_candidates[unpruned_count,:] = local_work_size
                                 unpruned_count+=1
                                 stats.append(stat)
                                 continue
                             else:
                                 update=True
+
+                        if (best_stats is not None):
+                            current_best_stats = best_stats
+                        elif (best_params_stats is not None):
+                            current_best_stats = best_params_stats
+                        elif (best_workload_stats is not None):
+                            current_best_stats = best_workload_stats
+                        else:
+                            current_best_stats = None

                         (stat,pruned) = self._bench_one(ctx,device,gwi,lwi,_kernel,_kernel_args,nruns,
-                                best_stats if (best_stats is not None) else best_workload_stats)
+                                current_best_stats)
                         if not pruned:
                             unpruned_candidates[unpruned_count,:] = local_work_size
                             unpruned_count+=1
@@ -372,17 +401,19 @@ class KernelAutotuner(object):
                                 pruned_count+=1
                                 status='pruned'
-                            if self.autotuner_config.verbose:
-                                print ' {} {} => {} ({})'.format(gwi, lwi, stat, status)
+                            if verbose:
+                                print indent(3)+'{} {} => {} ({})'.format(gwi, lwi, stat, status)

                             if not pruned:
                                 if src_hash not in results.keys():
                                     results[src_hash] = {}
-                                if ws not in results[src_hash].keys():
-                                    results[src_hash][ws] = {}
-                                if gwi not in results[src_hash][ws].keys():
-                                    results[src_hash][ws][gwi] = {}
-                                results[src_hash][ws][gwi][lwi] = stat
+                                if pms not in results[src_hash].keys():
+                                    results[src_hash][pms] = {}
+                                if ws not in results[src_hash][pms].keys():
+                                    results[src_hash][pms][ws] = {}
+                                if gwi not in results[src_hash][pms][ws].keys():
+                                    results[src_hash][pms][ws][gwi] = {}
+                                results[src_hash][pms][ws][gwi][lwi] = stat
                                 dump_cache = True

                         all_pruned = (pruned_count==candidates.shape[0])
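With this change the persistent results cache gains one level and is now keyed by kernel source hash, then parameter hash (pms), then work size, global size and local size. A sketch of that nesting with hypothetical cache_insert and cache_lookup helpers (not part of the autotuner), to make the lookup and update pattern explicit:

def cache_insert(results, src_hash, pms, ws, gwi, lwi, stat):
    # Build the nested dict levels on demand, same shape as
    # results[src_hash][pms][ws][gwi][lwi] = stat above.
    node = results
    for key in (src_hash, pms, ws, gwi):
        node = node.setdefault(key, {})
    node[lwi] = stat

def cache_lookup(results, src_hash, pms, ws, gwi, lwi):
    # Return the cached statistics, or None if any level is missing.
    node = results
    for key in (src_hash, pms, ws, gwi, lwi):
        if key not in node:
            return None
        node = node[key]
    return node

results = {}
cache_insert(results, 'deadbeef', '0a1b2c3d', (129, 129, 129),
             (128, 128, 128), (8, 8, 4), 'stat-placeholder')
print(cache_lookup(results, 'deadbeef', '0a1b2c3d', (129, 129, 129),
                   (128, 128, 128), (8, 8, 4)))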
@@ -399,10 +430,10 @@ class KernelAutotuner(object):
                     step += 1

                 if all_pruned:
-                    if self.autotuner_config.verbose:
-                        print '_'*100
-                        print ' Workload {} winner for kernel {}:'.format(workload, self.name)
-                        print ' no winner (all candidates were prunes)'
+                    if verbose:
+                        print separator
+                        print indent(1)+' Workload {} winner for kernel {}:'.format(workload, self.name)
+                        print indent(2)+'no winner (all candidates were pruned)'
                     continue

                 if (candidates.shape[0]!=1 or len(best_workload_ids)!=1):
@@ -427,22 +458,41 @@ class KernelAutotuner(object):
                         extra_parameters=extra_parameters,
                         **kargs)

-                if self.autotuner_config.verbose:
-                    print '_'*100
-                    print ' Workload {} winner for kernel {}:'.format(workload, self.name)
-                    print ' {} {} => {}'.format(best_workload_global_size, best_workload_local_size, best_workload_stats)
+                if verbose:
+                    print separator
+                    print indent(1)+' Workload {} winner for kernel {}:'.format(workload, self.name)
+                    print indent(2)+'{} {} => {}'.format(best_workload_global_size,
+                            best_workload_local_size, best_workload_stats)

-                if (best_stats is None) or (best_workload_stats<best_stats):
-                    best_workload = workload
-                    best_stats = best_workload_stats
-                    best_global_size = best_workload_global_size
-                    best_local_size = best_workload_local_size
+                if (best_params_stats is None) or (best_workload_stats<best_params_stats):
+                    best_params_workload = workload
+                    best_params_stats = best_workload_stats
+                    best_params_global_size = best_workload_global_size
+                    best_params_local_size = best_workload_local_size
+
+            if verbose:
+                print separator
+                print ' Current parameters winner for kernel {}:'.format(self.name)
+                if (best_params_stats is None):
+                    print indent(1)+'no winner (all candidates were pruned)'
+                else:
+                    print indent(1)+'{} {} => {}'.format(best_params_global_size,
+                            best_params_local_size, best_params_stats)
+
+            if (best_params_stats is not None) and \
+                    ((best_stats is None) or (best_params_stats<best_stats)):
+                best_workload = best_params_workload
+                best_stats = best_params_stats
+                best_global_size = best_params_global_size
+                best_local_size = best_params_local_size
+                best_extra_params = extra_parameters

-        if self.autotuner_config.verbose:
-            print '_'*100
+        if verbose:
+            print separator
             print ' BEST OVERALL RESULT for kernel {}:'.format(self.name)
-            print ' => WL={} G={} L={} => {}'.format(best_workload, best_global_size, best_local_size, best_stats)
-            print '_'*100
+            print ' => WL={} G={} L={} => {}'.format(best_workload, best_global_size, best_local_size,
+                    best_stats)
+            print separator
             print

         return (best_global_size, best_local_size, best_stats, best_workload, best_extra_params)
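After this hunk the selection logic works at three levels: the best local size found for the current workload (best_workload_*), the best workload for the current extra parameters (best_params_*), and the best result overall (best_*, which now also records best_extra_params). A stripped-down sketch of that bookkeeping, with a hypothetical bench callable standing in for the real OpenCL benchmarking and pruning:

def autotune(parameter_sets, workloads, candidates, bench):
    # bench(params, workload, candidate) returns a runtime, lower is better;
    # a stand-in for the real per-candidate kernel benchmarking.
    best = None  # (runtime, params, workload, candidate)
    for params in parameter_sets:
        best_params = None  # (runtime, workload, candidate)
        for workload in workloads:
            best_workload = None  # (runtime, candidate)
            for candidate in candidates:
                t = bench(params, workload, candidate)
                if (best_workload is None) or (t < best_workload[0]):
                    best_workload = (t, candidate)
            if best_workload is None:
                continue
            if (best_params is None) or (best_workload[0] < best_params[0]):
                best_params = (best_workload[0], workload, best_workload[1])
        if (best_params is not None) and \
                ((best is None) or (best_params[0] < best[0])):
            best = (best_params[0], params, best_params[1], best_params[2])
    return best

winner = autotune([{'v': 2}, {'v': 4}],
                  [(1, 1, 1), (2, 2, 1)],
                  [(8, 8, 1), (16, 4, 1)],
                  bench=lambda p, w, c: p['v'] * sum(w) * max(c))
print(winner)   # (48, {'v': 2}, (1, 1, 1), (8, 8, 1))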
@@ -523,7 +573,8 @@ class KernelAutotuner(object):
     #user available filters
     def ordering_filter(self, **kargs):
-        return lambda local_work_size: (local_work_size[2]<=local_work_size[1]) and (local_work_size[1]<=local_work_size[0])
+        return lambda local_work_size: (local_work_size[2]<=local_work_size[1]) \
+                and (local_work_size[1]<=local_work_size[0])

     def min_workitems_per_direction(self, min_local_size, **kargs):
         if np.isscalar(min_local_size):
             min_local_size = [min_local_size]
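ordering_filter returns a predicate that keeps only local work sizes with non-increasing extents (z <= y <= x), and min_workitems_per_direction bounds each axis from below. A short sketch of how such lambda filters could be combined over a candidate list (the candidate tuples below are illustrative only):

import numpy as np

# Keep only non-increasing (x, y, z) local sizes, as ordering_filter does.
ordering = lambda lws: (lws[2] <= lws[1]) and (lws[1] <= lws[0])

# Require at least 4 work items along x, as min_workitems_per_direction would.
min_size = np.asarray((4, 1, 1))
min_items = lambda lws: (np.asarray(lws) >= min_size).all()

candidates = [(16, 8, 2), (2, 8, 16), (8, 8, 8), (2, 2, 1)]
filtered = [c for c in candidates if ordering(c) and min_items(c)]
print(filtered)   # [(16, 8, 2), (8, 8, 8)]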
@@ -539,4 +590,3 @@ class KernelAutotuner(object):
         max_local_size = np.asarray(max_local_size)
         return lambda local_work_size,**kargs: (local_work_size<=max_local_size).all()
-