diff --git a/hysop/gpu/gpu_stretching.py b/hysop/gpu/gpu_stretching.py index e6aa7cc4192047e6f68dbe9b81d545df4d6a723d..2f99d245519a270f2f906badc21b0c4ab80047d4 100644 --- a/hysop/gpu/gpu_stretching.py +++ b/hysop/gpu/gpu_stretching.py @@ -58,7 +58,6 @@ class GPUStretching(DiscreteOperator, GPUOperator): self.order = 2 if self.method[SpaceDiscretisation] is FDC2 else 4 # Worksize handling - #TODO self._cl_work_size = 0 ## GPU allocations @@ -112,21 +111,22 @@ class GPUStretching(DiscreteOperator, GPUOperator): raise NotImplementedError(msg) def _gen_cl_src(self): - topo = self.velocity.topology - mesh = topo.mesh - dim = self.dim - + typegen = self.cl_env.typegen + topo = self.velocity.topology + dim = self.dim + mesh = topo.mesh + gwi = (256,256,256) - lwi = (4,4,4) + lwi = (8,8,8) codegen, prg = self._gen_and_build_kernel(lwi, dump_src=True) - - cache_bytes = codegen.cache_alloc_bytes(local_size=lwi) + cache_bytes = codegen.cache_alloc_bytes(local_size=lwi) + self.local_mem = cl.LocalMemory(cache_bytes) self.size_local_alloc += cache_bytes from hysop.codegen.structs.mesh_info import MeshInfoStruct - mesh_info = MeshInfoStruct.build_instance_from_mesh(self.cl_env.typegen, mesh) + mesh_info = MeshInfoStruct.build_instance_from_mesh(typegen, mesh) mesh_info_buffer = cl.Buffer(self.cl_env.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=mesh_info) self.mesh_info_buffer = mesh_info_buffer @@ -165,20 +165,17 @@ class GPUStretching(DiscreteOperator, GPUOperator): return codegen, prg - def _compute_stretching(self, simulation, to_gpu=True, to_host=True): - if to_gpu: - for field in self.input: - field.toDevice() - - input_events = [evt for input in self.input for evt in input.events] - + def _compute_stretching(self, simulation): dt = self.cl_env.typegen.make_floatn(simulation.time_step,1) kernel_args = [dt] + self.velocity.gpu_data + self.vorticity.gpu_data \ - + [self.mesh_info_buffer] + [self.local_mem] + + [self.mesh_info_buffer, self.local_mem] + + input_events = [evt for input in self.input for evt in input.events] stretching_evt = self.kernels['stretching'](*kernel_args, wait_for=input_events) + output_events = [stretching_evt] - if to_host: - self.vorticity.toHost() + self.vorticity.events.append(output_events) + def apply(self, simulation): self._compute(simulation) diff --git a/hysop/gpu/tools.py b/hysop/gpu/tools.py index 7fc2d7e7d5f1f2eb37b7527dd5f51a1ffe7ae3d9..132a62ed25cafe4e785ceb3c48ba52dd6796754e 100644 --- a/hysop/gpu/tools.py +++ b/hysop/gpu/tools.py @@ -13,6 +13,127 @@ FLOAT_GPU, DOUBLE_GPU = np.float32, np.float64 __cl_env = None +#class KernelError(Exception): + #def __init__(self, msg, err): + #super(KernelError,self).__init__(msg) + #self.msg = msg + #self.err = err + + #def __str__(self): + #return self.err + ': ' + self.msg + +#class OpenClKernelStatistics(object): + #def __init__(self, events=None): + #if events is not None: + #p0 = events[0].profile + #t0 = p0.end - p0.start + #total = 0 + #maxi = t0 + #mini = t0 + #for evt in events: + #dt = evt.profile.end - evt.profile.start + #total += dt + #if dt<mini: + #mini = dt + #if dt>maxi: + #maxi = dt + + #self.tot = total + #self.min = mini + #self.max = maxi + #self.mean = total/len(events) + #else: + #self.tot = 0 + #self.min = 0 + #self.max = 0 + #self.mean = 0 + + #def __str__(self): + #mini = self.min * 1e-6 + #maxi = self.max * 1e-6 + #total = self.tot * 1e-6 + #mean = self.mean * 1e-6 + #return 'min={:.2f}ms, max={:.2f}ms, mean={:.2f}ms'.format(mini,maxi,mean) + + +#class KernelAutotuner(object): + #"""OpenCl kernel work group size autotuner. + #""" + #def __init__(self,work_dim, runs=10): + #"""Initialize a KernelAutotuner. + + #Parameters + #---------- + #work_dim: int + #Work dimension used in targetted OpenCL kernels. + #""" + #self.work_dim = work_dim + #self.nruns = nruns + #self._load_default_filters() + + #def add_filter(fname, f): + #self.filters[fname] = f + #return self + + #def bench(self,ctx,device,global_size,args,kernel=None,kernel_generator=None,**kargs): + #assert isinstance(args, list) + #if (kernel is None) ^ (kernel_generator is None): + #raise ValueError('Either kernel or kernel_generator should not be None!') + #if (kernel_generator is None): + #kernel_generator = lambda **kargs: (kernel, args) + + #for local_size in _get_wi_candidates(ctx,device,global_size,**kargs): + #kernel, args = kernel_generator(ctx,device,global_size,**kargs) + #stats = self._bench_one(global_size,local_size,kernels,args) + #print '{}\t{}'.format(local_size,stats) + + #def _bench_one(global_size,local_size,kernel,kargs): + #evts = [] + #with cl.CommandQueue(ctx,device,cl.command_queue_properties.PROFILING_ENABLE) as queue: + #for i in xrange(self.nruns): + #evt = stretching_kernel(queue, global_size, local_size, *kargs) + #evts.append(evt) + #stats = OpenClKernelStatistics(evts) + #return stats + + #def _get_wi_candidates(ctx,device,global_size,**kargs): + #pows = [] + #size = device.max_work_group_size + #while(size>0): + #pows.append(size) + #size >>= 1 + + #candidates = itertools.product(pows,pows,pows) + #for f in self.filters.values: + #F = f(ctx=ctx,device=device,global_size=global_size,**kargs) + #candidates = itertools.ifilter(F, candidates) + #return candidates + + #def _load_default_filters(self): + #self.filters = {} + #self.add_filter('dim_reqs',self._dim_filter) + #self.add_filter('ordering',self._ordering_filter) + #self.add_filter('minmax_wi'self._minmax_workitems_filter) + + + #filters + #def _dim_filter(self, device,**kargs): + #work_dim = self.work_dim + #max_wi_dim = device.max_work_item_dimensions + #return lambda local_size: (work_dim<=max_wi_dim) and (local_size[work_dim:]==1).all() + #def _ordering_filter(self, **kargs): + #return lambda local_size: (local_size[2]<=local_size[1]) and (local_size[1]<=local_size[0]) + #def _global_size_filter(self, global_size, **kargs): + #return lambda local_size: (local_size[0]<=global_size[0]) and (local_size[1]<=global_size[1]) and (local_size[2]<=global_size[2]) + #def _minmax_workitems_filter(self, device,**kargs): + #def filter(local_size): + #max_wi_size = device.max_work_item_sizes + #wi=1 + #for i in xrange(3): + #wi*=local_size[i] + #return (wi>=max_wi_size/8) and (wi<=max_wi_size) + + class OpenCLEnvironment(object): """OpenCL environment informations and useful functions. """