diff --git a/Examples/mainJM.py b/Examples/mainJM.py index 8aa4df466bc6721efc411b2db16fc7f3b55b11a5..eb321bd7c924bc6837b0da10fa27cd9f5ead1be2 100644 --- a/Examples/mainJM.py +++ b/Examples/mainJM.py @@ -20,7 +20,7 @@ def scalaire(x, y, z): def run(): # Parameters - timeStep = 1. # 0.02 + timeStep = 0.02 finalTime = 1. outputFilePrefix = './res/RK2_' outputModulo = 0 @@ -31,8 +31,6 @@ def run(): box = pp.Box(3, length=[1., 1., 1.], origin=[0., 0., 0.]) ## Fields - #scal = pp.AnalyticalField(domain=box, formula=scalaire, name='Scalar') - #velo = pp.AnalyticalField(domain=box, formula=vitesse, name='Velocity', vector=True) scal = pp.ContinuousField(domain=box, name='Scalar') velo = pp.ContinuousField(domain=box, name='Velocity', vector=True) @@ -47,7 +45,6 @@ def run(): ## Setting solver to Problem pb.setSolver(finalTime, timeStep, 'gpu', src='./mainJM_kernels.cl') - #pb.setSolver(finalTime, timeStep, 'gpu') # ## Setting io to Problem pb.setIO(pp.Printer(fields=[scal, velo], frequency=outputModulo, outputPrefix=outputFilePrefix)) @@ -55,10 +52,11 @@ def run(): t1 = time.time() ## Solve problem - #pb.solve() + pb.solve() tf = time.time() + print "\n" print "Total time : ", tf - t0, "sec (CPU)" print "Init time : ", t1 - t0, "sec (CPU)" print "Solving time : ", tf - t1, "sec (CPU)" diff --git a/HySoP/hysop/__init__.py.in b/HySoP/hysop/__init__.py.in index 14ffc8da560d25f8dd3f8b892a4a793d17a5e57a..435cd62c9bbead836e7b422042c0b4abcb7a6a2f 100755 --- a/HySoP/hysop/__init__.py.in +++ b/HySoP/hysop/__init__.py.in @@ -10,6 +10,7 @@ __all__ = ['Box', 'CartesianTopology', 'ScalarField'] import os import site import mpi4py.MPI as MPI +import pyopencl as cl rank_world = MPI.COMM_WORLD.Get_rank() if(rank_world == 0): diff --git a/HySoP/hysop/constants.py b/HySoP/hysop/constants.py index ebe7f3745c3894036d95733f2bfd451ddbb034bd..434b9d86b9012fd1c4d4aae2080eac24881d0acf 100644 --- a/HySoP/hysop/constants.py +++ b/HySoP/hysop/constants.py @@ -16,6 +16,7 @@ PI = math.pi PARMES_REAL = np.float64 PARMES_INTEGER = np.uint32 PARMES_REAL_GPU = np.float32 +PARMES_INTEGER_GPU = np.uint32 ## default array layout (fortran or C convention) ORDER = 'F' DEBUG = 0 diff --git a/HySoP/hysop/fields/topology.py b/HySoP/hysop/fields/topology.py index 41448182e61b8e3129139e8852ddcab889a06927..c57cc2b188f81d79e74251a4d78719b565676c45 100644 --- a/HySoP/hysop/fields/topology.py +++ b/HySoP/hysop/fields/topology.py @@ -65,7 +65,7 @@ class CartesianTopology(object): localResolution[i] = nbpoints[i] + remainingPoints[i] start = np.zeros((domain.dimension), dtype=PARMES_INTEGER) start[:self.dim] = self.coords * nbpoints - self.mesh = LocalMesh(self.rank, resolution=localResolution, start=start, dom_size=self.domain.length/self.resolution, ghosts=self.ghosts) + self.mesh = LocalMesh(self.rank, resolution=localResolution, start=start, dom_size=self.domain.length / self.resolution, ghosts=self.ghosts) def __str__(self): """ Topology info display """ diff --git a/HySoP/hysop/operator/Transport.py b/HySoP/hysop/operator/Transport.py index 8bf58be176205a5c5f8a4ae44f3cd26a6c4ed3ab..0490eb21b22354dd80e835deca2b45ac5e2de75f 100644 --- a/HySoP/hysop/operator/Transport.py +++ b/HySoP/hysop/operator/Transport.py @@ -42,7 +42,7 @@ class Transport(ContinuousOperator): """ToString method""" s = "Advection operator (ContinuousOperator)" if self.discreteOperator is not None: - s += "with the following discretization:\n" + s += " with the following discretization:\n" s += str(self.discreteOperator) else: s += ". Not discretised" diff --git a/HySoP/hysop/operator/advection_d.py b/HySoP/hysop/operator/advection_d.py index 9cb78ff52b1a2b54b212a64c0e438b3e3c69ff6e..6f5226e054e234da7687b63016e85f6a20903b7e 100644 --- a/HySoP/hysop/operator/advection_d.py +++ b/HySoP/hysop/operator/advection_d.py @@ -3,8 +3,10 @@ @package operator Discrete advection representation """ +from ..constants import * from .discrete import DiscreteOperator import pyopencl as cl +import numpy as np import time @@ -45,12 +47,49 @@ class Advection_P(DiscreteOperator): # self.total_bytes_accessed = 0 def apply(self, t, dt, splittingDirection): - c_time = time.time() - print "Advection operator step, direction :", splittingDirection, - c_time = (time.time() - c_time) - self.compute_time += c_time - if self.numMethod is None: + c_time = 0 + #print "Advection operator step :", + if self.numMethod is not None: + dim = self.scalar.topology.dim + # Compute OpenCL Global Index Space + l = list(self.scalar.data.shape) + nb_ligne = l.pop(splittingDirection) + opencl_gis = tuple(l) + # \todo Compute OpenCL Local Index Space + l = list(opencl_gis) + l[0] = 1 + opencl_lis = tuple(l) + # Compute local domain min coord + coord_min = np.ones(4, dtype=PARMES_REAL_GPU) + coord_min[0:dim] = self.scalar.topology.mesh.start + # Compute local domain length + mesh_size = np.ones(4, dtype=PARMES_REAL_GPU) + mesh_size[0:dim] = self.scalar.topology.mesh.size + # Compute global domain resolution + domain_length = np.ones(4, dtype=PARMES_REAL_GPU) + domain_length[0:dim] = self.scalar.topology.domain.length + # Initialize particle scalar + evt_copy = cl.enqueue_copy(self.numMethod.queue, self.res_scalar.gpu_data, self.scalar.gpu_data) + # Perform advection + evt = self.numMethod.launch(opencl_gis, opencl_lis, + self.velocity.gpu_data[splittingDirection], + self.res_position.gpu_data, + PARMES_REAL_GPU(t), + PARMES_REAL_GPU(dt), + PARMES_INTEGER_GPU(splittingDirection), + PARMES_INTEGER_GPU(nb_ligne), + coord_min, + domain_length, + mesh_size) + self.numMethod.queue.finish() + c_time = (evt_copy.profile.end - evt_copy.profile.start) * 1e-9 + c_time += (evt.profile.end - evt.profile.start) * 1e-9 + else: + c_time = time.time() print "nothing to do" + c_time = (time.time() - c_time) + self.total_time += c_time + return c_time # """ # Apply advection operator. @@ -124,6 +163,10 @@ class Advection_P(DiscreteOperator): # self.total_bytes_accessed += 2 * nb_ligne * np.prod(l) * 4 # 2 floats lus + 1+DIM float écrits # #print "Advection:", self.compute_time * 1e-9 + + def printComputeTime(self): + print "Advection total time : ", self.total_time + def __str__(self): s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self) return s diff --git a/HySoP/hysop/operator/continuous.py b/HySoP/hysop/operator/continuous.py index a1083989aed3d47dc8ba3f44ee5cee395195e2db..0375e3ec60f3f6cbcebc64273666608c55bf0f1c 100644 --- a/HySoP/hysop/operator/continuous.py +++ b/HySoP/hysop/operator/continuous.py @@ -37,7 +37,19 @@ class ContinuousOperator: self.variables.append(v) def apply(self, *args): - self.discreteOperator.apply(*args) + return self.discreteOperator.apply(*args) + + def setMethod(self, method): + if self.discreteOperator is not None: + self.discreteOperator.setMethod(method) + else: + raise ValueError("Cannot set mumerical method to non discretized operator") + + def printComputeTime(self): + if self.discreteOperator is not None: + self.discreteOperator.printComputeTime() + else: + raise ValueError("Cannot print compute time of a non discretized operator") @abstractmethod def discretize(self, spec=None): diff --git a/HySoP/hysop/operator/discrete.py b/HySoP/hysop/operator/discrete.py index 301674518908a166a8484d54c6d0b8b3c1d5668f..600b1833f9a140c707422c26adc08850ee0883ae 100644 --- a/HySoP/hysop/operator/discrete.py +++ b/HySoP/hysop/operator/discrete.py @@ -28,7 +28,7 @@ class DiscreteOperator: self.gpu_kernel = None self.gpu_kernel_name = "" self.discreteOperator = self - self.compute_time = 0. + self.total_time = 0. def setMethod(self, method): """ @@ -49,6 +49,13 @@ class DiscreteOperator: if self.variables.count(v) == 0: self.variables.append(v) + @abstractmethod + def printComputeTime(self): + """ + Print total computing time + """ + raise NotImplementedError("Need to override method in a subclass of " + providedClass) + @abstractmethod def apply(self): """ diff --git a/HySoP/hysop/operator/remeshing.py b/HySoP/hysop/operator/remeshing.py index c49c0375314e93dc64c6a08438772d9095c6c27c..7ba2e597f80663a4224373e77468490376f531d7 100644 --- a/HySoP/hysop/operator/remeshing.py +++ b/HySoP/hysop/operator/remeshing.py @@ -4,6 +4,7 @@ Operator representation """ from .discrete import DiscreteOperator +from ..constants import * import pyopencl as cl import time @@ -32,13 +33,13 @@ class Remeshing(DiscreteOperator): ## Particles scalar values. self.pscal = partScalar ## Result scalar - self.resultScalar = resscal - self.addVariable([self.ppos, self.pscal, self.resultScalar]) + self.res_scalar = resscal + self.addVariable([self.ppos, self.pscal, self.res_scalar]) self.numMethod = method ## input fields self.input = [self.ppos, self.pscal] ## output fields - self.output = [self.resultScalar] + self.output = [self.res_scalar] self.needSplitting = True # self.gpu_kernel_name = "remeshing" # self.compute_time = [0., 0., 0.] @@ -51,12 +52,43 @@ class Remeshing(DiscreteOperator): """ Apply Remeshing operator. """ - c_time = time.time() - print "Remeshing operator step, direction :", splittingDirection, - c_time = (time.time() - c_time) - self.compute_time += c_time - if self.numMethod is None: + c_time = 0 + #print "Remeshing operator step :", + if self.numMethod is not None: + dim = self.res_scalar.topology.dim + # Compute OpenCL Global Index Space + l = list(self.res_scalar.data.shape) + nb_ligne = l.pop(splittingDirection) + opencl_gis = tuple(l) + # \todo Compute OpenCL Local Index Space + l = list(opencl_gis) + l[0] = 1 + opencl_lis = tuple(l) + # Compute local domain min coord + coord_min = np.ones(4, dtype=PARMES_REAL_GPU) + coord_min[0:dim] = self.res_scalar.topology.mesh.start + # Compute local domain length + mesh_size = np.ones(4, dtype=PARMES_REAL_GPU) + mesh_size[0:dim] = self.res_scalar.topology.mesh.size + # Perform remeshing + evt = self.numMethod.launch(opencl_gis, opencl_lis, + self.ppos.gpu_data, + self.pscal.gpu_data, + self.res_scalar.gpu_data, + PARMES_REAL_GPU(t), + PARMES_REAL_GPU(dt), + PARMES_INTEGER_GPU(splittingDirection), + PARMES_INTEGER_GPU(nb_ligne), + coord_min, + mesh_size) + self.numMethod.queue.finish() + c_time = (evt.profile.end - evt.profile.start) * 1e-9 + else: + c_time = time.time() print "nothing to do" + c_time = (time.time() - c_time) + self.total_time += c_time + return c_time # c_time = time.time() # if self.gpu_kernel is None: @@ -95,8 +127,11 @@ class Remeshing(DiscreteOperator): # self.total_bytes_accessed += ((2 + 1) * nb_ligne) * np.prod(l) * 4 # 2 float lus + 1 float écrits # #print "Remeshing M'6:", self.compute_time * 1e-9 + def printComputeTime(self): + print "Remeshing total time : ", self.total_time + def __str__(self): - s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self) + s = "Remeshing (DiscreteOperator). " + DiscreteOperator.__str__(self) return s if __name__ == "__main__": diff --git a/HySoP/hysop/operator/splitting.py b/HySoP/hysop/operator/splitting.py index 82b6657c1cb03f1dee67f7ce2ba923d5e4a8dbee..0eda955b58e44110889b7831b231b5f6dcfc37e6 100644 --- a/HySoP/hysop/operator/splitting.py +++ b/HySoP/hysop/operator/splitting.py @@ -26,19 +26,34 @@ class Splitting(DiscreteOperator): [self.splitting.append((i, 0.5)) for i in xrange(dim - 1)] self.splitting.append((dim - 1, 1.)) [self.splitting.append((dim - 2 - i, 0.5)) for i in xrange(dim - 1)] + self.compute_time_details = [[0. for i in xrange(dim)] for op in self.operators] def apply(self, t, dt): """ Apply Remeshing operator. """ - print "Splitting operator step :" - c_time = time.time() + #print "Splitting operator step :" + c_time = 0. for split in self.splitting: - print "direction : " + str(split[0]) + " dt*" + str(split[1]) + #print "direction : " + str(split[0]) + " dt*" + str(split[1]) + op_index = 0 for op in self.operators: - op.apply(t, split[1] * dt, split[0]) - c_time = (time.time() - c_time) - self.compute_time += c_time + temp_time = op.apply(t, split[1] * dt, split[0]) + c_time += temp_time + self.compute_time_details[op_index][split[0]] += temp_time + op_index += 1 + self.total_time += c_time + return c_time + + def printComputeTime(self): + print "Splitting total time : ", self.total_time + op_index = 0 + for op in self.operators: + print " ", + op.printComputeTime() + for d in xrange(len(self.compute_time_details[op_index])): + print " direction :", d, " : ", self.compute_time_details[op_index][d] + op_index += 1 def __str__(self): s = "Splitting (DiscreteOperator). Splitting steps : \n" diff --git a/HySoP/hysop/particular_solvers/basic.py b/HySoP/hysop/particular_solvers/basic.py index c5783fd23b3d02f118907f37741b4947abe8962e..115389e5ef75e0a5ad3e546480548915254e8fce 100644 --- a/HySoP/hysop/particular_solvers/basic.py +++ b/HySoP/hysop/particular_solvers/basic.py @@ -79,15 +79,15 @@ class ParticleSolver(Solver): else: op.discretize() ## Create remeshing operator - remesh = Remeshing(partPositions=self.p_position, - partScalar=self.p_scalar, - resscal=self.advection.scalar, - method=self.RemeshingMethod) + self.remesh = Remeshing(partPositions=self.advection.discreteOperator.res_position, + partScalar=self.advection.discreteOperator.res_scalar, + resscal=self.advection.discreteOperator.scalar, + method=self.RemeshingMethod) for op in self.problem.operators: if op.discreteOperator.needSplitting: if op is self.advection: index = self.problem.operators.index(op) - self.problem.operators[index] = Splitting([op, remesh]) + self.problem.operators[index] = Splitting([op, self.remesh]) else: self.problem.operators[index] = Splitting([op]) diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a98bd3c766e20a1c89ef68bb091b435324123f --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu.py @@ -0,0 +1,422 @@ +# -*- coding: utf-8 -*- +""" +@package Utils +GPU Particular solver description. +""" +from ..constants import * +from basic import ParticleSolver +from ..fields.continuous import ContinuousField +from ..operator.Transport import Transport +from ..operator.remeshing import Remeshing +from ..operator.splitting import Splitting +from ..tools.timer import Timer +from ..tools.printer import Printer +import pyopencl as cl +from ..tools.gpu_data_transfer import hostToDevice + + +class GPUParticleSolver(ParticleSolver): + """ + GPU Particular solver description. + Link with differents numericals methods used. Prepare GPU side (memory, kernels, ...) + """ + def __init__(self, problem, t_end, dt, + ODESolver=None, + InterpolationMethod=None, + RemeshingMethod=None, + splittingMethod='strang', + timer=None, + io=None, + platform_id=0, device_id=0, + device_type='gpu', + src=None): + """ + Constructor. + Create a solver description. + """ + ParticleSolver.__init__(self, problem, t_end, dt) + self.user_gpu_src = src + print "=== Discover OpenCL environment ===" + #Get platform. + try: + self.platform = cl.get_platforms()[platform_id] + except IndexError: + print " Incorrect platform_id :", platform_id, ".", + print " Only ", len(cl.get_platforms()), " available.", + print " Getting defalut platform. " + self.platform = cl.get_platforms()[0] + print " Platform " + print " - Name :", self.platform.name + print " - Version :", self.platform.version + #Get device. + self.device = self.platform.get_devices(eval("cl.device_type." + device_type.upper()))[device_id] + print " Device" + print " - Name :", self.device.name + print " - Type :", cl.device_type.to_string(self.device.type) + print " - C Version :", self.device.opencl_c_version + print " - Global mem size :", self.device.global_mem_size / (1024 ** 3), "GB" + print "===\n" + #Creates GPU Context + self.ctx = cl.Context([self.device]) + #Create CommandQueue on the GPU Context + self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + + def initialize(self): + """ + Solver initialisation for a given DiscreteProblem.DiscreteProblem. + + @param discreteProblem : Problem to initialize. + """ + ParticleSolver.initialize(self) + ## kernels compilation + print "=== Kernel sources compiling ===" + print "Sources files (default): ", GPU_SRC + f = open(GPU_SRC, 'r') + self.gpu_src = "".join(f.readlines()) + f.close() + if self.user_gpu_src is not None: + print "Sources files (user): ", self.user_gpu_src + f = open(self.user_gpu_src, 'r') + self.gpu_src += "".join(f.readlines()) + f.close() + build_options = "-cl-single-precision-constant -cl-opt-disable" + build_options += " -D DIM=" + str(self.problem.topology.dim) + build_options += " -D MAX_LINE_POINTS=" + str(np.max(self.problem.topology.mesh.resolution)) + self.prg = cl.Program(self.ctx, self.gpu_src).build(build_options) + print self.prg.get_build_info(self.device, cl.program_build_info.LOG) + print self.prg.get_build_info(self.device, cl.program_build_info.OPTIONS) + print self.prg.get_build_info(self.device, cl.program_build_info.STATUS) + print "===\n" + ## Convert fields to simple preceision floats (PARMES_REAL_GPU) + for f in self.problem.variables: + for df in f.discreteField: + if f.vector: + for d in xrange(len(df.data)): + df.data[d] = df.data[d].astype(PARMES_REAL_GPU) + else: + df.data = df.data.astype(PARMES_REAL_GPU) + ## Create OpenCL Buffers from fields + print "=== OpenCL Buffer allocations ===" + total_mem_used = 0 + for f in self.problem.variables: + print f.name, "allocate ", + temp_mem_used = 0 + for df in f.discreteField: + if f.vector: + for d in xrange(len(df.data)): + df.gpu_data[d] = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE, + size=df.data[d].nbytes) + temp_mem_used += df.data[d].nbytes + else: + df.gpu_data = cl.Buffer(self.ctx, + cl.mem_flags.READ_WRITE, + size=df.data.nbytes) + temp_mem_used += df.data.nbytes + total_mem_used += temp_mem_used + print temp_mem_used, "Bytes (", temp_mem_used / (1024 ** 2), "MB)" + print "Total Global Memory used : ", total_mem_used, "Bytes (", total_mem_used / (1024 ** 2), "MB)", + print "({0:.3f} %)".format(100 * total_mem_used / (self.device.global_mem_size * 1.)) + print "===\n" + print "=== OpenCL Buffer initialisation ===" + data, transfer_time, compute_time = 0, 0., 0. + for f in self.problem.variables: + initKernel = None + # Looking for initKernel + for k in self.prg.all_kernels(): + if k.get_info(cl.kernel_info.FUNCTION_NAME) == ('init' + f.name): + initKernel = cl.Kernel(self.prg, 'init' + f.name) + if initKernel is not None: + for df in f.discreteField: + print f.name, ": kernel init ... ", + global_wg = tuple(df.resolution.tolist()) + if global_wg[0] / 64 > 0: + local_wg = (64, 1, 1) + else: + local_wg = None + dim = df.topology.dim + coord_min = np.ones(4, dtype=PARMES_REAL_GPU) + mesh_size = np.ones(4, dtype=PARMES_REAL_GPU) + coord_min[0:dim] = df.topology.mesh.start + mesh_size[0:dim] = df.topology.mesh.size + if f.vector: + if dim == 3: + evt = initKernel(self.queue, + global_wg, + local_wg, + df.gpu_data[0], df.gpu_data[1], df.gpu_data[2], + coord_min, + mesh_size) + else: + evt = initKernel(self.queue, + global_wg, + local_wg, + df.gpu_data[0], df.gpu_data[1], + coord_min, + mesh_size) + else: + evt = initKernel(self.queue, + global_wg, + local_wg, + df.gpu_data, + coord_min, + mesh_size) + self.queue.finish() + temp_time = (evt.profile.end - evt.profile.start) * 1e-9 + print "Done in ", temp_time, "sec" + compute_time += temp_time + else: + for df in f.discreteField: + if df.contains_data: + print f.name, ":", + temp_data, temp_time = hostToDevice(self.queue, df) + data += temp_data + transfer_time += temp_time + if data > 0: + print "Total Transfers : ", data, "Bytes transfered at {0:.3f} GBytes/sec".format((data * 1e-9) / transfer_time) + if compute_time > 0.: + print "Total Computing : ", compute_time, "sec" + ## Setting advection and remeshing kernels: + self.advection.setMethod(KernelLauncher(self.prg.advection, self.queue)) + self.remesh.setMethod(KernelLauncher(self.prg.remeshing, self.queue)) + + # total_data, total_time = 0, 0. + # for f in self.problem.variables: + # initKernel = None + # for k in self.prg.all_kernels(): + # if k.get_info(cl.kernel_info.FUNCTION_NAME) == ('init'+f.name): + # initKernel = k + # if initKernel is not None: + # print "Initialisation kernel found", initKernel + # else: + # for df in f.discreteField: + # if df.contains_data: + # print f.name, ":", + # temp_data, temp_time = hostToDevice(self.queue, df) + # total_data += temp_data + # total_time += temp_time + # if total_data > 0: + # print "Total : ", total_data, "Bytes transfered at {0:.3f} GBytes/sec".format((total_data * 1e-9) / total_time) + # self.queue.finish() + print "===\n" + ##\todo brancher les objets opencl sur les opérateurs et les fields + + def __str__(self): + """ToString method""" + s = " Particular solver " + return s + +if __name__ == "__main__": + print __doc__ + print "- Provided class : ParticularSolver" + print ParticularSolver.__doc__ + + +# def parameters_initialization(self): +# # Modifying domains parameters to fit with float4 or int4 opencl objects +# self.gpu_shape = tuple(self.grid.elementNumber) +# padding_float = np.zeros(4 - self.grid.dimension, dtype=dtype_real_gpu) +# padding_uint = np.zeros(4 - self.grid.dimension, dtype=dtype_integer) +# self.grid.min = np.concatenate((self.grid.min, padding_float)).astype(dtype_real_gpu) +# self.grid.max = np.concatenate((self.grid.max, padding_float)).astype(dtype_real_gpu) +# self.grid.length = np.concatenate((self.grid.length, padding_float)).astype(dtype_real_gpu) +# self.grid.elementSize = np.concatenate((self.grid.elementSize, padding_float)).astype(dtype_real_gpu) +# self.grid.elementNumber = np.concatenate((self.grid.elementNumber, padding_uint + 1)).astype(dtype_integer) +# self.ppos.values.resize(self.gpu_shape, refcheck=False) + +# def buffer_allocations(self): +# self.gpu_used_mem = 0 +# for v in self.discreteProblem.variables: +# v.values = np.asarray(v.values, dtype=dtype_real_gpu) +# v.gpu_mem_object = cl.Buffer(self.ctx, +# cl.mem_flags.READ_WRITE, +# size=v.values.nbytes +# ) +# self.gpu_used_mem += v.gpu_mem_object.size +# print "Allocation " + v.name + " on gpu, size:", v.gpu_mem_object.size * 1e-6, 'MBytes' + +# if debug > 0: +# debug_size = 0 +# self.discreteProblem.advecOp.debug_integer = np.zeros(debug, dtype=dtype_integer) +# self.discreteProblem.advecOp.debug_integer_buffer = cl.Buffer( +# self.ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, +# hostbuf=self.discreteProblem.advecOp.debug_integer) +# debug_size += self.discreteProblem.advecOp.debug_integer_buffer.size +# self.discreteProblem.advecOp.debug_float = np.zeros(debug, dtype=dtype_real_gpu) +# self.discreteProblem.advecOp.debug_float_buffer = cl.Buffer( +# self.ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, +# hostbuf=self.discreteProblem.advecOp.debug_float) +# debug_size += self.discreteProblem.advecOp.debug_float_buffer.size +# self.gpu_used_mem += debug_size +# print "Allocation debug buffer on gpu, size:", debug_size, 'B' + +# def initialize(self): +# """ +# Solver initialisation for a given DiscreteProblem.DiscreteProblem. + +# @param discreteProblem : Problem to initialize. +# """ +# # OpenCL Buffer allocations +# print "\n=== Device memory allocations ===" +# self.buffer_allocations() +# p = self.gpu_used_mem * 100 / (self.device.global_mem_size * 1.) +# print "Total memory allocated on gpu :", self.gpu_used_mem * 1e-6, "MBytes (", p, " % of total available memory)" + +# # Kernels creation/compilation +# print "\n=== Kernel sources compiling ===" +# f = open(os.path.join(os.path.split(os.path.abspath(__file__))[0], "gpu_src.cl"), 'r') +# self.gpu_src = "".join(f.readlines()) +# f.close() +# if not self.user_src is None: +# f = open(self.user_src, 'r') +# self.gpu_src += "".join(f.readlines()) +# f.close() +# build_options = "-cl-single-precision-constant -cl-opt-disable" +# build_options += " -D DIM=" + str(self.dim) +# build_options += " -D DEBUG=" + str(debug) +# build_options += " -D MAX_LINE_POINTS=" + str(np.max(self.grid.elementNumber)) +# self.prg = cl.Program(self.ctx, self.gpu_src).build(build_options) +# print self.prg.get_build_info(self.device, cl.program_build_info.LOG) +# print self.prg.get_build_info(self.device, cl.program_build_info.OPTIONS) +# print self.prg.get_build_info(self.device, cl.program_build_info.STATUS) + +# for op in self.discreteProblem.operators: +# for k in self.prg.all_kernels(): +# if op.gpu_kernel_name == k.get_info(cl.kernel_info.FUNCTION_NAME): +# op.gpu_kernel = self.prg +# op.gpu_queue = self.queue +# op.gpu_shape = self.gpu_shape + +# print "\n=== Data initialization ===" +# for v in self.discreteProblem.variables: +# v_is_init = False +# print v.name, +# for i,k in enumerate(self.prg.all_kernels()): +# if ('init' + v.name) == k.get_info(cl.kernel_info.FUNCTION_NAME): +# print 'Initialization Kernel found ', +# evt = eval("self.prg."+"init"+v.name)(self.queue, +# self.gpu_shape, +# None, +# v.gpu_mem_object, +# dtype_real_gpu(0.), +# v.domain.min.astype(dtype_real_gpu), +# v.domain.elementNumber.astype(dtype_integer), +# v.domain.elementSize.astype(dtype_real_gpu) +# ) +# self.queue.finish() +# print 'compute time :', (evt.profile.end - evt.profile.start) * 1e-9, 's', +# v.contains_data, v_is_init = False, True +# if not v_is_init: +# v.init() +# print '... Done' + +# print "\n=== Data Transfers host->device ===" +# copy_evts = [] +# transfered_size = 0 +# for v in self.discreteProblem.variables: +# if v.contains_data: +# print v.name, +# if v.dimension == self.dim: +# print "Memoire arrangée", +# offset = 0 +# evt = cl.enqueue_copy(self.queue, v.gpu_mem_object, +# v.values[..., 0].ravel(), +# device_offset=np.long(offset)) +# copy_evts.append(evt) +# offset += v.values[..., 0].nbytes +# evt = cl.enqueue_copy(self.queue, v.gpu_mem_object, +# v.values[..., 1].swapaxes(1, 0).ravel(), +# device_offset=np.long(offset)) +# copy_evts.append(evt) +# if self.dim == 3: +# offset += v.values[..., 1].nbytes +# evt = cl.enqueue_copy(self.queue, v.gpu_mem_object, +# v.values[..., 2].swapaxes(1, 0).swapaxes(2, 0).ravel(), +# device_offset=np.long(offset)) +# copy_evts.append(evt) +# else: +# print "Normal layout", +# evt = cl.enqueue_copy(self.queue, v.gpu_mem_object, v.values) +# copy_evts.append(evt) +# print "Transfering", v.values.nbytes * 1e-6, "MBytes ..." +# transfered_size += v.values.nbytes +# self.queue.finish() +# if len(copy_evts)>0: +# t = 0. +# for evt in copy_evts: +# t += (evt.profile.end - evt.profile.start) +# print "Transfering host -> device :", t * 1e-9, "s \t", transfered_size / t, "GB/s" +# else: +# print "No transfers" + +# def collect_data(self): +# self.queue.finish() +# print "\n=== Data Transfers device->host ===" +# copy_evts = [] +# transfered_size = 0 +# for v in self.discreteProblem.variables: +# if v.contains_data: +# print v.name, +# if v.dimension == self.dim: +# print "Memoire arrangée", +# temp = np.empty(self.gpu_shape, dtype=dtype_real_gpu) +# offset = 0 +# evt = cl.enqueue_copy(self.queue, temp, v.gpu_mem_object, device_offset=np.long(offset)) +# copy_evts.append(evt) +# v.values[..., 0] = temp.reshape(self.gpu_shape) +# offset += temp.nbytes +# evt = cl.enqueue_copy(self.queue, temp, v.gpu_mem_object, device_offset=np.long(offset)) +# copy_evts.append(evt) +# v.values[..., 1] = temp.reshape(self.gpu_shape).swapaxes(1, 0) +# if self.dim == 3: +# offset += temp.nbytes +# evt = cl.enqueue_copy(self.queue, temp, v.gpu_mem_object, device_offset=np.long(offset)) +# copy_evts.append(evt) +# v.values[..., 2] = temp.reshape(self.gpu_shape).swapaxes(2, 0).swapaxes(1, 0) +# else: +# print "Normal layout", +# evt = cl.enqueue_copy(self.queue, v.values, v.gpu_mem_object) +# copy_evts.append(evt) +# print "Transfering", v.values.nbytes * 1e-6, "MBytes ..." +# transfered_size += v.values.nbytes +# self.queue.finish() +# t = 0. +# for evt in copy_evts: +# t += (evt.profile.end - evt.profile.start) +# print "Transfering device -> host :", t * 1e-9, "s \t", transfered_size / t, "GB/s" + +# def terminate(self): +# self.queue.finish() + +# def __str__(self): +# """ToString method""" +# s = "GPU Particular solver " +# s += "\n - ODESolver : " + str(self.ODESolver) +# s += "\n - Interpolation : " + str(self.InterpolationMethod) +# s += "\n - Remeshing : " + str(self.RemeshingMethod) +# s += "\n - GPU informations : " +# s += "\n - platform : " + self.platform.name +# s += " OpenCL version: " + self.platform.version +# s += "\n - device : " + self.device.name +# s += " Global Memory available : " + str(self.device.global_mem_size // 1024 // 1024) + " MB ( " + str(self.device.global_mem_size) + " B)" +# s += "\n Space index max : " + str(self.device.max_work_item_sizes) +# s += " Max workgroup size : " + str(self.device.max_work_group_size) +# return s + +# if __name__ == "__main__": +# print __doc__ +# print "- Provided class : GPUParticularSolver" +# print GPUParticularSolver.__doc__ + +class KernelLauncher: + """ + kernel launcher + """ + def __init__(self, kernel, queue): + self.kernel = kernel + self.queue = queue + self.name = "Kernel Launcher for : " + self.kernel.function_name + + def launch(self, *args): + evt = self.kernel(self.queue, *args) + return evt diff --git a/HySoP/hysop/particular_solvers/gpu_src.cl b/HySoP/hysop/particular_solvers/gpu_src.cl index fb7fa0265abb9c288c14f2b2a7fad1b403df7e00..c161a82344e0be8622f5a8ae27b43d9454454363 100644 --- a/HySoP/hysop/particular_solvers/gpu_src.cl +++ b/HySoP/hysop/particular_solvers/gpu_src.cl @@ -139,16 +139,13 @@ void advection_init_2_1(__private float4 size, __private float *line_c) // Integration Kernel (RK2 method) __kernel void advection(__global const float* gvelo, - __global float* gscal, __global float* ppos, - __global float* pscal, __private const float t, __private const float dt, __private const uint dir, __private const uint line_nb_pts, __private const float4 min_v, __private const float4 length_v, - __private const uint4 nb, __private const float4 size ) { @@ -163,47 +160,63 @@ __kernel void advection(__global const float* gvelo, __private float line_c[DIM]; // Particle final position __private float gvelo_buffer_loc[MAX_LINE_POINTS]__attribute__((aligned)); // Local buffer to store gvelo for the current line __private uint ind_m, stride_m, stride_component; + uint nbx, nby, nbz; // Compute strides and index. Depend on array order (C order here) dx_dir = size[dir]; if(DIM == 3) { if (dir == 0) { - ind_m = get_global_id(0)*nb.z+get_global_id(1); - stride_component = (nb.x*nb.y*nb.z)*dir; - stride_m = nb.y*nb.z; + nbx = line_nb_pts; + nby = get_global_size(0); + nbz = get_global_size(1); + ind_m = get_global_id(0)*nbz+get_global_id(1); + stride_component = (nbx*nby*nbz)*dir; + stride_m = nby*nbz; } else if (dir == 1) { - ind_m = get_global_id(0)*nb.z+get_global_id(1); - stride_component = (nb.x*nb.y*nb.z)*dir; - stride_m = nb.x*nb.z; + nbx = get_global_size(0); + nby = line_nb_pts; + nbz = get_global_size(1); + ind_m = get_global_id(0)*nbz+get_global_id(1); + stride_component = (nbx*nby*nbz)*dir; + stride_m = nbx*nbz; } else { - ind_m = get_global_id(0)*nb.y+get_global_id(1); - stride_component = (nb.x*nb.y*nb.z)*dir; - stride_m = nb.x*nb.y; + nbx = get_global_size(0); + nby = get_global_size(1); + nbz = line_nb_pts; + ind_m = get_global_id(0)*nby+get_global_id(1); + stride_component = (nbx*nby*nbz)*dir; + stride_m = nbx*nby; } } else { if (dir == 0) { + nbx = line_nb_pts; + nby = get_global_size(0); + nbz = 1; ind_m = get_global_id(0); - stride_component = (nb.x*nb.y)*dir; - stride_m = nb.y; + stride_component = (nbx*nby)*dir; + stride_m = nby; } else { + nbx = get_global_size(0); + nby = line_nb_pts; + nbz = 1; ind_m = get_global_id(0); - stride_component = (nb.x*nb.y)*dir; - stride_m = nb.x; + stride_component = (nbx*nby)*dir; + stride_m = nbx; } } //Get velocity local copy for(p=0;p<line_nb_pts;p++) - gvelo_buffer_loc[p] = gvelo[ind_m + stride_component + p*stride_m]; //Read 1float + gvelo_buffer_loc[p] = gvelo[ind_m + p*stride_m]; //Read 1float // Advection algorithm along line for(p=0;p<line_nb_pts;p++) @@ -233,11 +246,9 @@ __kernel void remeshing(__global const float* part, __global float* gscal, __private const float t, __private const float dt, - __private const int dir, + __private const uint dir, __private const uint line_nb_pts, __private const float4 min_v, - __private const float4 max_v, - __private const uint4 nb, __private const float4 size ) { @@ -253,39 +264,59 @@ __kernel void remeshing(__global const float* part, __private float weights[6]; // Remeshing weights __private float gscal_buffer_loc[MAX_LINE_POINTS]; // Local buffer to store gscal for the current line __private uint ind_m, stride_m; + uint nbx, nby, nbz; + uint4 nb; // Compute strides and index. Depend on array order (C order here) dx_dir = size[dir]; if(DIM == 3){ if (dir == 0) { + nbx = line_nb_pts; + nby = get_global_size(0); + nbz = get_global_size(1); + nb = (uint4)(nbx, nby ,nbz, 1); kernel_init_3_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0)*nb.z+get_global_id(1); - stride_m = nb.y*nb.z; + ind_m = get_global_id(0)*nbz+get_global_id(1); + stride_m = nby*nbz; } else if (dir == 1) { + nbx = get_global_size(0); + nby = line_nb_pts; + nbz = get_global_size(1); + nb = (uint4)(nbx, nby ,nbz, 1); kernel_init_3_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0)*nb.z+get_global_id(1); - stride_m = nb.x*nb.z; + ind_m = get_global_id(0)*nbz+get_global_id(1); + stride_m = nbx*nbz; } else { + nbx = get_global_size(0); + nby = get_global_size(1); + nbz = line_nb_pts; + nb = (uint4)(nbx, nby ,nbz, 1); kernel_init_3_2(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); - ind_m = get_global_id(0)*nb.y+get_global_id(1); - stride_m = nb.x*nb.y; + ind_m = get_global_id(0)*nby+get_global_id(1); + stride_m = nbx*nby; }} else{ if (dir == 0) { + nbx = line_nb_pts; + nby = get_global_size(0); + nb = (uint4)(nbx, nby ,nbz, 1); kernel_init_2_0(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0); - stride_m = nb.y; + stride_m = nby; } else { + nbx = get_global_size(0); + nby = line_nb_pts; + nb = (uint4)(nbx, nby ,nbz, 1); kernel_init_2_1(nb, &stride_along_dir, &stride_along_dir_scal, &ind_line, &ind_line_scal); ind_m = get_global_id(0); - stride_m = nb.x; + stride_m = nbx; } } diff --git a/HySoP/hysop/problem/problem.py b/HySoP/hysop/problem/problem.py index fb209fcd723bf4c55f24c1e2bffc7d18c6e40faf..433b6744c17964923ca430c5961138100cffe5de 100644 --- a/HySoP/hysop/problem/problem.py +++ b/HySoP/hysop/problem/problem.py @@ -46,13 +46,18 @@ class Problem(): self.solver.initialize() def solve(self): - print"\n\n Start solving ..." + print "\n\n Start solving ..." while not self.timer.end: - print "\n==== Iteration : " + str(self.timer.ite + 1) + "\t t=" + str(self.timer.t + self.timer.dt) + " ====" + print "==== Iteration : {0:3d} t={1:6.2f} ====".format(self.timer.ite + 1, self.timer.t + self.timer.dt) for op in self.operators: op.apply(self.timer.t, self.timer.dt) self.timer.step() self.io.step() + print "\n\n End solving\n" + print "=== Timings ===" + for op in self.operators: + op.printComputeTime() + print "===\n" def addVariable(self, cVariable): """ diff --git a/HySoP/hysop/tools/gpu_data_transfer.py b/HySoP/hysop/tools/gpu_data_transfer.py new file mode 100644 index 0000000000000000000000000000000000000000..710bb902abe570498d72d6d38bf76342916834b2 --- /dev/null +++ b/HySoP/hysop/tools/gpu_data_transfer.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +""" +@package tools +GPU Memory transfering methods +""" +from ..constants import * +import pyopencl as cl + + +def hostToDevice(queue, discreteField): + print "host->device :", + data, time = 0, 0. + if discreteField.vector: + evt = cl.enqueue_copy(queue, discreteField.gpu_data[XDIR], + discreteField.data[XDIR].ravel()) + data += discreteField.data[XDIR].nbytes + time += (evt.profile.end - evt.profile.start) * 1e-9 + evt = cl.enqueue_copy(queue, discreteField.gpu_data[YDIR], + discreteField.data[YDIR].swapaxes(1, 0).ravel()) + data += discreteField.data[YDIR].nbytes + time += (evt.profile.end - evt.profile.start) * 1e-9 + if len(discreteField.data) == 3: + evt = cl.enqueue_copy(queue, discreteField.gpu_data[ZDIR], + discreteField.data[ZDIR].swapaxes(1, 0).swapaxes(2, 0).ravel()) + data += discreteField.data[ZDIR].nbytes + time += (evt.profile.end - evt.profile.start) * 1e-9 + else: + evt = cl.enqueue_copy(queue, discreteField.gpu_data, discreteField.data) + data += discreteField.data.nbytes + time += (evt.profile.end - evt.profile.start) * 1e-9 + print data, "Bytes transfered at {0:.3f} GBytes/sec".format(data / (time * 1024 ** 3)) + return data, time + + +def deviceToHost(queue, discreteField): + print "device->host :", + data, time = 0, 0. + # if discreteField.vector: + # evt = cl.enqueue_copy(queue, discreteField.gpu_data[XDIR], + # discreteField.data[XDIR].ravel()) + # data += discreteField.data[XDIR].nbytes + # time += (evt.profile.end - evt.profile.start) * 1e-9 + # evt = cl.enqueue_copy(queue, discreteField.gpu_data[YDIR], + # discreteField.data[YDIR].swapaxes(1, 0).ravel()) + # data += discreteField.data[YDIR].nbytes + # time += (evt.profile.end - evt.profile.start) * 1e-9 + # if len(discreteField.data) == 3: + # evt = cl.enqueue_copy(queue, discreteField.gpu_data[ZDIR], + # discreteField.data[ZDIR].swapaxes(1, 0).swapaxes(2, 0).ravel()) + # data += discreteField.data[ZDIR].nbytes + # time += (evt.profile.end - evt.profile.start) * 1e-9 + # else: + # evt = cl.enqueue_copy(queue, discreteField.gpu_data, discreteField.data) + # data += discreteField.data.nbytes + # time += (evt.profile.end - evt.profile.start) * 1e-9 + print data, "Bytes transfered at {0:.3f} GBytes/sec".format(0) # (data * 1e-9) / time) + return data, time