diff --git a/HySoP/hysop/constants.py b/HySoP/hysop/constants.py index 4161890efb209d4872e71e04ae1ff2b0e06ccece..6ec26194ddabf8a6e357fc599b2d5a10a8b8d792 100644 --- a/HySoP/hysop/constants.py +++ b/HySoP/hysop/constants.py @@ -32,4 +32,4 @@ PERIODIC = 0 ## Directions string S_DIR = ["_X", "_Y", "_Z"] ## GPU deflault sources -GPU_SRC = os.path.join(solver_path[0], "gpu_src.cl") +GPU_SRC = os.path.join(solver_path[0], "gpu_src") diff --git a/HySoP/hysop/domain/topology.py b/HySoP/hysop/domain/topology.py index 908110a093d31c5f91966f92b592a0e3fe8145e5..83c589af08d8ad696ab63d7bf2034548c7f5da7c 100644 --- a/HySoP/hysop/domain/topology.py +++ b/HySoP/hysop/domain/topology.py @@ -56,8 +56,13 @@ class CartesianTopology(object): self.west, self.east = self.topo.Shift(YDIR, 1) if(self.dim > 2): self.south, self.north = self.topo.Shift(ZDIR, 1) - self.tabSort = np.array([[self.up,0] , [self.down,1] , [self.east,2], [self.west,3], - [self.north,4],[self.south,5]]) + if self.dim == 3: + self.tabSort = np.array([[self.up, 0], [self.down, 1], + [self.east, 2], [self.west, 3], + [self.north, 4], [self.south, 5]]) + else: + self.tabSort = np.array([[self.up, 0] , [self.down, 1], + [self.east, 2], [self.west, 3]]) tabSort1 = self.tabSort[self.tabSort[:,0]<=self.rank] tabSort2 = self.tabSort[self.tabSort[:,0]>self.rank] if (np.asarray(tabSort1[:,0].shape)>0): @@ -147,7 +152,7 @@ class LocalMesh(object): # Local resolution self.resolution = resolution.copy() + 2 * ghosts # index of the lower point (in each dir) in the global mesh - self.start = start # - ghosts + self.start = start # - ghosts # index of the upper point (in each dir), global mesh self.end = self.start + self.resolution - 1 # Mesh step size @@ -157,12 +162,11 @@ class LocalMesh(object): self.origin = coord_start coord_end = (self.end +1) * self.size + dom_origin - ghosts* self.size # print 'Print TOPO' -# print 'resolution', self.resolution , 'shape', (coord_end-coord_start)/self.size, 'coord_end',coord_end, 'coord_start',coord_start +# print 'resolution', self.resolution , 'shape', (coord_end-coord_start)/self.size, 'coord_end',coord_end, 'coord_start',coord_start if self.start.size == 3: self.coords = np.ogrid[coord_start[0]:coord_end[0]:self.size[0], coord_start[1]:coord_end[1]:self.size[1], coord_start[2]:coord_end[2]:self.size[2]] - self.resolution = self.coords[0][:,0,0].shape[0],self.coords[1][0,:,0].shape[0],self.coords[2][0,0,:].shape[0] elif self.start.size == 2: self.coords = np.ogrid[coord_start[0]:coord_end[0]:self.size[0], coord_start[1]:coord_end[1]:self.size[1]] diff --git a/HySoP/hysop/operator/remeshing.py b/HySoP/hysop/operator/remeshing.py index d8afc7ee28a67d2874edec8ec9805389001a2cdc..53baa23ecd583e15c300b4acca4e75c96d8cc591 100644 --- a/HySoP/hysop/operator/remeshing.py +++ b/HySoP/hysop/operator/remeshing.py @@ -37,6 +37,7 @@ class Remeshing(DiscreteOperator): ## Compute time detailed per directions self.compute_time = [0., 0., 0.] self.name = "remeshing" + self.call_number = 0 def apply(self, t, dt, splittingDirection): """ @@ -52,6 +53,7 @@ class Remeshing(DiscreteOperator): @li 2. Profile timings of OpenCL kernels. """ c_time = 0. + self.call_number += 1 if self.numMethod is not None: ## Launching kernel evt = self.numMethod.launch(self.ppos.gpu_data, @@ -72,8 +74,8 @@ class Remeshing(DiscreteOperator): self.timings_info[0] = "\"Remeshing total\" \"Remeshing x\" \"Remeshing y\" \"Remeshing z\" " self.timings_info[1] = str(self.total_time) + " " + str(self.compute_time[0]) + " " self.timings_info[1] += str(self.compute_time[1]) + " " + str(self.compute_time[2]) + " " - print "Remeshing total time : ", self.total_time - print "\t Remeshing: ", self.compute_time + print "Remeshing total time : ", self.total_time, self.call_number + print "\t Remeshing: ", self.compute_time, self.numMethod.call_number def __str__(self): s = "Remeshing (DiscreteOperator). " + DiscreteOperator.__str__(self) diff --git a/HySoP/hysop/operator/transport_d.py b/HySoP/hysop/operator/transport_d.py index 658d94cd7ff4a3b0962e0c9fcf73dcbb04847de6..1b4e6d7e31ecd25c9ce4a77f134352f44c648a11 100644 --- a/HySoP/hysop/operator/transport_d.py +++ b/HySoP/hysop/operator/transport_d.py @@ -48,6 +48,7 @@ class Transport_d(DiscreteOperator): ## Compute time for transposition detailed per directions self.compute_time_swap = [0., 0., 0.] self.name = "advection" + self.call_number = 0 def apply(self, t, dt, splittingDirection): """ @@ -66,6 +67,7 @@ class Transport_d(DiscreteOperator): @li 3. Profile timings of OpenCL kernels. """ c_time, c_time_init = 0., 0. + self.call_number += 1 if self.numMethod is not None and self.init_transpose is not None and self.init_copy is not None: # Particle init if (self.old_splitting_direction == splittingDirection) or self.old_splitting_direction is None: @@ -110,10 +112,10 @@ class Transport_d(DiscreteOperator): self.timings_info[1] = str(self.total_time) + " " + str(self.compute_time_copy[0]) + " " self.timings_info[1] += str(self.compute_time_swap[0]) + " " + str(self.compute_time_swap[1]) + " " self.timings_info[1] += str(self.compute_time[0]) + " " + str(self.compute_time[1]) + " " + str(self.compute_time[2]) + " " - print "Advection total time : ", self.total_time - print "\t Advection init copy :", self.compute_time_copy - print "\t Advection init swap :", self.compute_time_swap - print "\t Advection :", self.compute_time + print "Advection total time : ", self.total_time, self.call_number + print "\t Advection init copy :", self.compute_time_copy, self.init_copy.call_number + print "\t Advection init swap :", self.compute_time_swap, self.init_transpose.call_number + print "\t Advection :", self.compute_time, self.numMethod.call_number def __str__(self): s = "Advection_P (DiscreteOperator). " + DiscreteOperator.__str__(self) diff --git a/HySoP/hysop/particular_solvers/gpu.py b/HySoP/hysop/particular_solvers/gpu.py index d47d4ef46c70832b131ca47cec5449e941e6bf84..ff9c3272e84162416f73436c64b4cb5a5bc9233f 100644 --- a/HySoP/hysop/particular_solvers/gpu.py +++ b/HySoP/hysop/particular_solvers/gpu.py @@ -93,15 +93,12 @@ class GPUParticleSolver(ParticleSolver): Allocate device memory as OpenCL Buffers.\n Initialize memory on device. \n Link kernels ans parmepy.operators. + @todo : Take non square resolutions """ ParticleSolver.initialize(self) # kernels compilation print "=== Kernel sources compiling ===" - print "Sources files (default): ", GPU_SRC - f = open(GPU_SRC, 'r') - ## OpenCL kernel sources - self.gpu_src = "".join(f.readlines()) - f.close() + self.gpu_src = "" if self.user_gpu_src is not None: if isinstance(self.user_gpu_src, list): for user_src in self.user_gpu_src: @@ -114,36 +111,171 @@ class GPUParticleSolver(ParticleSolver): f = open(self.user_gpu_src, 'r') self.gpu_src += "".join(f.readlines()) f.close() - workItemNumber = int(min(64, min(self.problem.domains[0].discreteDomain.resolution))) - padding = 32 - # Enabling optimisations regarding problem size - float4_enable = True - if min(self.problem.domains[0].discreteDomain.resolution) < 4 * workItemNumber: - float4_enable = False - - build_options = "-cl-single-precision-constant -cl-opt-disable" - build_options += " -D WIDTH=" + str(np.max(self.problem.topology.mesh.resolution)) + resolution = self.problem.topology.mesh.resolution + ## Optimal work item number + if len(resolution) == 3: + workItemNumber = 64 + else: + workItemNumber = 256 + print " Work-Item number: ", workItemNumber + ## Precision supported + print " Precision capability ", + self.correctlyroundeddividesqrt = True + if PARMES_REAL_GPU is np.float32: + print "for Single Precision: " + for v in ['DENORM', 'INF_NAN', + 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', + 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: + try: + if eval('(self.device.single_fp_config & cl.device_fp_config.' + + v + ') == cl.device_fp_config.' + v): + print v + except AttributeError as ae: + if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': + self.correctlyroundeddividesqrt = False + print v, 'is not supported in OpenCL C 1.2.\n Exception catched : ', ae + else: + print "for Double Precision: " + if self.device.double_fp_config > 0: + for v in ['DENORM', 'INF_NAN', + 'ROUND_TO_NEAREST', 'ROUND_TO_ZERO', 'ROUND_TO_INF', + 'FMA', 'CORRECTLY_ROUNDED_DIVIDE_SQRT', 'SOFT_FLOAT']: + try: + if eval('(self.device.double_fp_config & cl.device_fp_config.' + + v + ') == cl.device_fp_config.' + v): + print v + except AttributeError as ae: + if v is 'CORRECTLY_ROUNDED_DIVIDE_SQRT': + self.correctlyroundeddividesqrt = False + print v, 'is supported in OpenCL C 1.2.\n Exception catched : ', ae + else: + raise ValueError("Double Precision is not supported by device") + ## Building options and kernel settings + build_options = '' + if self.correctlyroundeddividesqrt: + build_options += " -cl-fp32-correctly-rounded-divide-sqrt " + if PARMES_REAL_GPU is np.float32: + build_options += " -cl-single-precision-constant" + build_options += " -D WIDTH=" + str(resolution[0]) build_options += " -D WGN=" + str(workItemNumber) - build_options += " -D PADDING=" + str(padding) - if not float4_enable: - build_options += " -D BASIC=1" + + ## Collect OpenCL source code + print "\n Source code:" + ## advection + if len(resolution) == 3: + if PARMES_REAL_GPU is np.float32: + src_advec = 'advection_3D_opt4_builtin.cl' + else: + src_advec = 'advection_3D_opt2_builtin.cl' + advec_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) + advec_lwi = (int(workItemNumber), 1, 1) + else: + if PARMES_REAL_GPU is np.float32: + src_advec = 'advection_2D_opt4_builtin.cl' + else: + src_advec = 'advection_2D_buitin.cl' + advec_gwi = (int(workItemNumber), int(resolution[1])) + advec_lwi = (int(workItemNumber), 1) + f = open(os.path.join(GPU_SRC, src_advec)) + print " - advection:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() + ## remeshing + if len(resolution) == 3: + src_remesh = 'remeshing_3D_opt4_private4.cl' + src_remesh_weights = 'weights_m6prime_opt4_builtin.cl' + remesh_gwi = (int(workItemNumber), int(resolution[1]), int(resolution[2])) + remesh_lwi = (int(workItemNumber), 1, 1) + else: + src_remesh = 'remeshing_2D_opt4_noBC.cl' + src_remesh_weights = 'weights_m6prime_builtin.cl' + remesh_gwi = (int(workItemNumber), int(resolution[1])) + remesh_lwi = (int(workItemNumber), 1) + f = open(os.path.join(GPU_SRC, src_remesh_weights)) + self.gpu_src += "".join(f.readlines()) + print " - remeshing weights:", f.name + f.close() + f = open(os.path.join(GPU_SRC, src_remesh)) + print " - remeshing:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() + ## copy + if len(resolution) == 3: + if PARMES_REAL_GPU is np.float32: + src_copy = 'copy_3D_opt4.cl' + build_options += " -D TILE_DIM_COPY=16 -D BLOCK_ROWS_COPY=8" + copy_gwi = (int(resolution[0] / 4), int(resolution[1] / 2), int(resolution[2])) + copy_lwi = (4, 8, 1) + else: + src_copy = 'copy_3D_locMem.cl' + build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=4" + copy_gwi = (int(resolution[0]), int(resolution[1] / 8), int(resolution[2])) + copy_lwi = (32, 4, 1) + else: + if PARMES_REAL_GPU is np.float32: + src_copy = 'copy_2D_opt2.cl' + build_options += " -D TILE_DIM_COPY=128 -D BLOCK_ROWS_COPY=4" + copy_gwi = (int(resolution[0] / 2), int(resolution[1] / 32)) + copy_lwi = (64, 4) + else: + src_copy = 'copy_2D.cl' + build_options += " -D TILE_DIM_COPY=32 -D BLOCK_ROWS_COPY=2" + copy_gwi = (int(resolution[0]), int(resolution[1] / 16)) + copy_lwi = (32, 2) + f = open(os.path.join(GPU_SRC, src_copy)) + print " - copy:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() + ## transpose + if len(resolution) == 3: + if PARMES_REAL_GPU is np.float32: + src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl' + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=8" + transpose_xy_gwi = (int(resolution[0] / 2), int(resolution[1] / 4), int(resolution[2])) + transpose_xy_lwi = (16, 8, 1) + src_transpose_xz = 'transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl' + build_options += " -D TILE_DIM_XZ=16 -D BLOCK_ROWS_XZ=4" + transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4)) + transpose_xz_lwi = (16, 4, 4) + else: + src_transpose_xy = 'transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl' + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=4" + transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1] / 8), int(resolution[2])) + transpose_xy_lwi = (8, 4, 1) + src_transpose_xz = 'transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl' + build_options += " -D TILE_DIM_XZ=8 -D BLOCK_ROWS_XZ=2" + transpose_xz_gwi = (int(resolution[0]), int(resolution[1] / 4), int(resolution[2] / 4)) + transpose_xz_lwi = (8, 2, 2) + else: + src_transpose_xy = 'transpose_2D_xy_coalesced_locPad_Diag_opt4.cl' + build_options += " -D TILE_DIM_XY=32 -D BLOCK_ROWS_XY=2" + transpose_xy_gwi = (int(resolution[0] / 4), int(resolution[1]) / 16) + transpose_xy_lwi = (8, 2) + f = open(os.path.join(GPU_SRC, src_transpose_xy)) + print " - transpose_xy:", f.name + self.gpu_src += "".join(f.readlines()) + f.close() + if len(resolution) == 3: + f = open(os.path.join(GPU_SRC, src_transpose_xz)) + self.gpu_src += "".join(f.readlines()) + f.close() + + ## Build code + if PARMES_REAL_GPU is np.float32: + prg = cl.Program(self.ctx, self.gpu_src.replace('.0', '.0f')) + else: + prg = cl.Program(self.ctx, self.gpu_src.replace('float', 'double')) + ## OpenCL program - self.prg = cl.Program(self.ctx, self.gpu_src).build(build_options) - print self.prg.get_build_info(self.device, cl.program_build_info.LOG) + self.prg = prg.build(build_options) + print "Compiler options : ", print self.prg.get_build_info(self.device, cl.program_build_info.OPTIONS) + print "Compiler status : ", print self.prg.get_build_info(self.device, cl.program_build_info.STATUS) + print "Compiler log : ", + print self.prg.get_build_info(self.device, cl.program_build_info.LOG) print "===\n" - # Pad the particle scalar (for transposition efficiency) - if len(self.p_scalar.discreteField[0].data.shape) == 2: - new_shape = (self.p_scalar.discreteField[0].data.shape[0] + padding, - self.p_scalar.discreteField[0].data.shape[1] + padding) - self.p_scalar.discreteField[0].data = np.resize(self.p_scalar.discreteField[0].data, new_shape) - else: - new_shape = (self.p_scalar.discreteField[0].data.shape[0] + padding, - self.p_scalar.discreteField[0].data.shape[1] + padding, - self.p_scalar.discreteField[0].data.shape[2] + padding) - self.p_scalar.discreteField[0].data = np.resize(self.p_scalar.discreteField[0].data, new_shape) - # Convert fields to simple precision floats (PARMES_REAL_GPU) and ensure order + # Convert fields to recquired precision (PARMES_REAL_GPU) and ensure order for f in self.problem.variables: for df in f.discreteField: if f.vector: @@ -184,7 +316,7 @@ class GPUParticleSolver(ParticleSolver): initKernel = cl.Kernel(self.prg, 'init' + f.name) if initKernel is not None: for df in f.discreteField: - print f.name, ": kernel init ... ", + print f.name, ": kernel init", global_wg = df.resolution.tolist() if global_wg[0] / workItemNumber > 0: global_wg[0] = workItemNumber @@ -195,11 +327,13 @@ class GPUParticleSolver(ParticleSolver): else: local_wg = None global_wg = tuple(global_wg) + print global_wg, local_wg, dim = df.topology.dim coord_min = np.ones(4, dtype=PARMES_REAL_GPU) mesh_size = np.ones(4, dtype=PARMES_REAL_GPU) coord_min[0:dim] = df.topology.mesh.origin mesh_size[0:dim] = df.topology.mesh.size + print "...", if f.vector: if dim == 3: evt = initKernel(self.queue, @@ -244,57 +378,42 @@ class GPUParticleSolver(ParticleSolver): if op.discreteOperator.name == "splitting": for sop in op.operators: print sop.discreteOperator.name, - sop.setMethod(KernelLauncher(eval("self.prg." + sop.discreteOperator.name), + if sop.discreteOperator.name == 'advection': + sop.setMethod(KernelLauncher(self.prg.advection, self.queue, - global_wg, - local_wg)) + advec_gwi, + advec_lwi)) + elif sop.discreteOperator.name == 'remeshing': + sop.setMethod(KernelLauncher(self.prg.remeshing, + self.queue, + remesh_gwi, + remesh_lwi)) + else: + raise ValueError("Unknown operator : " + sop.discreteOperator.name) else: - print op.discreteOperator.name, - op.setMethod(KernelLauncher(eval("self.prg." + op.discreteOperator.name), - self.queue, - global_wg, - local_wg)) + raise ValueError("Unknown operator : " + op.discreteOperator.name) print self.advection.discreteOperator.name, - self.advection.discreteOperator.init_copy = KernelLauncher(self.prg.advec_init_copy, + self.advection.discreteOperator.init_copy = KernelLauncher(self.prg.copy, self.queue, - global_wg, - local_wg) - resolution = self.advection.scalar.discreteField[0].domain.resolution - if self.problem.topology.dim == 2: - # A work-group perform a (32,2) part of input and (0,16) times in a internal loop - # The problem size becomes (nb, nb/16) to let workGroup deal with internal loop - # workGroup size (32,2) is rearranged in (64,1) - # Problem size (nb, nb/16) is is rearranged in (nb*2,nb/(16*2)) + copy_gwi, + copy_lwi) + if len(resolution) == 3: print self.advection.discreteOperator.name, - self.advection.discreteOperator.init_transpose = KernelLauncher(self.prg.advec_init_transpose_2D, - self.queue, - (2 * int(resolution[0]), - int(resolution[1]) / 32), - (64, 1),) + k = KernelListLauncher([self.prg.transpose_xy, + self.prg.transpose_xz], + self.queue, + [transpose_xy_gwi, + transpose_xz_gwi], + [transpose_xy_lwi, + transpose_xz_lwi]) + self.advection.discreteOperator.init_transpose = k else: - # For swaping axes 0,1 in 3D, - # A work-group perform a (32,2,1) part of input and (0,16,0) times in a internal loop - # The problem size becomes (nb, nb/16, nb) to let workGroup deal with internal loop - # workGroup size (32,2,1) is rearranged in (64,1,1) - # Problem size (nb, nb/16, nb) is is rearranged in (nb*2,nb/(16*2),nb) - # For swaping axes 0,2 in 3D, - # A work-group perform a (32,1,2) part of input and (0,0,16) times in a internal loop - # The problem size becomes (nb, nb, nb/16) to let workGroup deal with internal loop - # workGroup size (32,1,2) is rearranged in (64,1,1) - # Problem size (nb, nb, nb) is is rearranged in (nb*2,nb,nb/(16*2)) print self.advection.discreteOperator.name, - self.advection.discreteOperator.init_transpose = KernelListLauncher([self.prg.advec_init_transpose_3D_01, - self.prg.advec_init_transpose_3D_02], - self.queue, - [(2 * int(resolution[0]), - int(resolution[1]) / 32, - int(resolution[2])), - (2 * int(resolution[0]), - int(resolution[1]), - int(resolution[2]) / 32) - ], - [(64, 1, 1), - (64, 1, 1)]) + k = KernelLauncher(self.prg.transpose_xy, + self.queue, + transpose_xy_gwi, + transpose_xy_lwi) + self.advection.discreteOperator.init_transpose = k self.problem.io.set_get_data_method(self.get_data_from_device) print "===\n" @@ -336,6 +455,7 @@ class KernelListLauncher: self.global_size = gsize ## OpenCL local size index. self.local_size = lsize + self.call_number = [0 for k in self.kernel] def launch(self, d, *args): """ @@ -359,7 +479,8 @@ class KernelListLauncher: Opencl global and local sizes are given in args. """ - #print self.kernel[d].function_name, args[0], args[1] + print self.kernel[d].function_name, args[0], args[1] + self.call_number[d] += 1 evt = self.kernel[d](self.queue, *args) return evt diff --git a/HySoP/hysop/particular_solvers/gpu_src.cl b/HySoP/hysop/particular_solvers/gpu_src.cl index e1a8e9c8d94136dc86d4a347fa13b6d8ce217bda..af81deb01f79c1eef55f4f61ce169d70e07cb9c0 100644 --- a/HySoP/hysop/particular_solvers/gpu_src.cl +++ b/HySoP/hysop/particular_solvers/gpu_src.cl @@ -1,237 +1,243 @@ -#define GROUP_DIMX (32) -#define LOG_GROUP_DIMX (5) -#define GROUP_DIMY (2) - -#define TYPE float -inline TYPE alpha(TYPE y, TYPE s){ +inline float alpha(float y, float s){ return (y * (y * (y * (y * (13.0f - 5.0f * y) - 9.0f) - 1.0f) + 2.0f) / 24.0f) * s;} -inline TYPE beta(TYPE y, TYPE s){ +inline float beta(float y, float s){ return (y * (y * (y * (y * (25.0f * y - 64.0f) + 39.0f) + 16.0f) - 16.0f) / 24.0f)*s;} -inline TYPE gamma(TYPE y, TYPE s){ +inline float gamma(float y, float s){ return (y * y * (y * (y * (126.0f - 50.0f * y) - 70.0f) - 30.0f) / 24.0f + 1.0f)*s;} -inline TYPE delta(TYPE y, TYPE s){ +inline float delta(float y, float s){ return (y * (y * (y * (y * (50.0f * y - 124.0f) + 66.0f) + 16.0f) + 16.0f) / 24.0f)*s;} -inline TYPE eta(TYPE y, TYPE s){ +inline float eta(float y, float s){ return (y * (y * (y * (y * (61.0f - 25.0f * y) - 33.0f) - 1.0f) - 2.0f) / 24.0f)*s;} -inline TYPE zeta(TYPE y, TYPE s){ +inline float zeta(float y, float s){ return (y * y * y * (y * (5.0f * y - 12.0f) + 7.0f) / 24.0f)*s;} - - -__kernel void advec_init_copy(__global const float* gscal, - __global float* pscal) +#if DIM==3 +__kernel void advection_3D_basic(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) { uint gidX = get_global_id(0); uint gidY = get_global_id(1); uint gidZ = get_global_id(2); + int ind, i_ind, i_ind_p; + float invdx = 1.0f/dx; uint i; - for(i=gidX; i<WIDTH; i+=WGN) + float v, y, p; + for(i=gidX; i<WIDTH; i+=NB_WI_ADVECTION) { - pscal[i+gidY*(WIDTH+PADDING)+gidZ*(WIDTH+PADDING)*(WIDTH+PADDING)] = gscal[i+gidY*WIDTH+gidZ*WIDTH*WIDTH]; + v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH]; + p = i*dx + 0.5f*dt*v; + ind = convert_int_rtn(p * invdx); + y = (p - convert_float(ind) * dx) * invdx; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((ind + 1) % WIDTH); + p=(gvelo[i_ind+gidY*WIDTH+gidZ*WIDTH*WIDTH]*(1.0f - y) + gvelo[i_ind_p+gidY*WIDTH+gidZ*WIDTH*WIDTH]*y)*dt + i*dx + min_position; + ppos[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = p; } } - -/* - Contraintes : - - Résultat avec padding de 32 (blanck conflicts) dans toutes les directions - - Domaine de taille multiple de 32 - */ -__kernel void advec_init_transpose_2D(__global const float* input, - __global float* output) +__kernel void advection_3D_float4(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) { + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + int4 ind, i_ind, i_ind_p; + float invdx = 1.0f/dx; + uint i; + float4 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; - __local float tile[GROUP_DIMX*(GROUP_DIMX+1)]; - int block_x = get_group_id(0); - int block_y = get_group_id(1); - - int local_x = get_local_id(0) & (GROUP_DIMX - 1); - int local_y = get_local_id(0) >> LOG_GROUP_DIMX; + for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + velocity_cache[i] = v.x; + velocity_cache[i+1] = v.y; + velocity_cache[i+2] = v.z; + velocity_cache[i+3] = v.w; + } - int local_input = mad24(local_y, GROUP_DIMX + 1, local_x); - int local_output = mad24(local_x, GROUP_DIMX + 1, local_y); + barrier(CLK_LOCAL_MEM_FENCE); - int in_x = mad24(block_x, GROUP_DIMX, local_x); - int in_y = mad24(block_y, GROUP_DIMX, local_y); - int input_index = mad24(in_y, WIDTH, in_x); + for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4) + { + v = vload4(i/4, velocity_cache); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = coord + 0.5f*dt*v; + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y], velocity_cache[i_ind.z], velocity_cache[i_ind.w]); + vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y], velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]); + p = (v*(1.0f - y) + vp*y)*dt + coord + (float4)(min_position); + vstore4(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, ppos); + } +} +__kernel void remeshing_3D_float4(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0f/dx; + int4 ind; + uint i, nb_part = WIDTH/NB_WI_REMESHING; + float4 p, s, y; + uint4 index4; - int out_x = mad24(block_y, GROUP_DIMX, local_x); - int out_y = mad24(block_x, GROUP_DIMX, local_y); + __local float gscal_loc[WIDTH]; - int output_index = mad24(out_y, WIDTH + PADDING, out_x); + for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4)) +{ + gscal_loc[i] = 0.0f; + gscal_loc[i+1] = 0.0f; + gscal_loc[i+2] = 0.0f; + gscal_loc[i+3] = 0.0f; +} + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gscal_loc[index4.x] += (alpha(y.x,s.x)); + gscal_loc[index4.y] += (alpha(y.y,s.y)); + gscal_loc[index4.z] += (alpha(y.z,s.z)); + gscal_loc[index4.w] += (alpha(y.w,s.w)); +barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[index4.x] += (beta(y.x,s.x)); + gscal_loc[index4.y] += (beta(y.y,s.y)); + gscal_loc[index4.z] += (beta(y.z,s.z)); + gscal_loc[index4.w] += (beta(y.w,s.w)); +barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[index4.x] += (gamma(y.x, s.x)); + gscal_loc[index4.y] += (gamma(y.y, s.y)); + gscal_loc[index4.z] += (gamma(y.z, s.z)); + gscal_loc[index4.w] += (gamma(y.w, s.w)); +barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[index4.x] += (delta(y.x, s.x)); + gscal_loc[index4.y] += (delta(y.y, s.y)); + gscal_loc[index4.z] += (delta(y.z, s.z)); + gscal_loc[index4.w] += (delta(y.w, s.w)); +barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[index4.x] += (eta(y.x, s.x)); + gscal_loc[index4.y] += (eta(y.y, s.y)); + gscal_loc[index4.z] += (eta(y.z, s.z)); + gscal_loc[index4.w] += (eta(y.w, s.w)); +barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[index4.x] += (zeta(y.x, s.x)); + gscal_loc[index4.y] += (zeta(y.y, s.y)); + gscal_loc[index4.z] += (zeta(y.z, s.z)); + gscal_loc[index4.w] += (zeta(y.w, s.w)); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal); +} - int global_input_stride = WIDTH * GROUP_DIMY; - int global_output_stride = (WIDTH + PADDING) * GROUP_DIMY; +__kernel void copyLocMem_3D(__global const float* in, + __global float* out) +{ + uint xIndex = get_group_id(0) * TILE_DIM_COPY_3D + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_COPY_3D + get_local_id(1); + uint zIndex = get_global_id(2); + uint index = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; - int local_input_stride = GROUP_DIMY * (GROUP_DIMX + 1); - int local_output_stride = GROUP_DIMY; + __local float tile[TILE_DIM_COPY_3D][TILE_DIM_COPY_3D]; - for (uint i=0; i<16; i++){ - tile[local_input] = input[input_index]; - local_input += local_input_stride; - input_index += global_input_stride; + for(uint i=0; i<TILE_DIM_COPY_3D; i+=BLOCK_ROWS_COPY_3D) + { + tile[get_local_id(1)+i][get_local_id(0)] = in[index + i*WIDTH]; } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint i=0; i<16; i++){ - output[output_index] = tile[local_output]; - local_output += local_output_stride; - output_index += global_output_stride; +barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_COPY_3D; i+=BLOCK_ROWS_COPY_3D) + { + out[index + i*WIDTH] = tile[get_local_id(1)+i][get_local_id(0)]; } } -__kernel void advec_init_transpose_3D_01(__global const float* input, - __global float* output) -{ - /* Nouvelle version avec des index : (256*2, 256/32, 256) (64, 1, 1) - Transposition xyz -> yxz dans un buffer local 32,32,1*/ - int block_x = get_group_id(0); - int block_y = get_group_id(1); - int block_z = get_group_id(2); - - int local_x = get_local_id(0) % (GROUP_DIMX); - int local_y = get_local_id(0) / (GROUP_DIMX); +__kernel void transpose_3D_xy_coalesced_locPad_Diag(__global const float* in, + __global float* out) +{ + uint group_id_x = get_group_id(0); + uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); - __local float tile[GROUP_DIMX*(GROUP_DIMX+1)]; + uint xIndex = group_id_x * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(0); + uint yIndex = group_id_y * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; - int input_index = local_x + block_x*GROUP_DIMX + (local_y + block_y*GROUP_DIMX)*WIDTH + block_z*WIDTH*WIDTH; - int local_input = local_x + local_y*(GROUP_DIMX+1); - int local_input_stride = GROUP_DIMY * (GROUP_DIMX+1); - int global_input_stride = WIDTH * GROUP_DIMY; + xIndex = group_id_y * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(0); + yIndex = group_id_x * TILE_DIM_TRANSPOSE_3D_XY + get_local_id(1); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; - int output_index = local_x + block_y*GROUP_DIMX + (local_y + block_x*GROUP_DIMX)*(WIDTH + PADDING) + block_z*(WIDTH + PADDING)*(WIDTH + PADDING); - int local_output = local_y + local_x*(GROUP_DIMX+1); - int local_output_stride = GROUP_DIMY; - int global_output_stride = (WIDTH + PADDING) * GROUP_DIMY; + __local float tile[TILE_DIM_TRANSPOSE_3D_XY][TILE_DIM_TRANSPOSE_3D_XY+1]; - for (uint i=0; i<16; i++){ - tile[local_input] = input[input_index]; - local_input += local_input_stride; - input_index += global_input_stride; + for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XY; i+=BLOCK_ROWS_TRANSPOSE_3D_XY) + { + tile[get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH]; } - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint i=0; i<16; i++){ - output[output_index] = tile[local_output]; - local_output += local_output_stride; - output_index += global_output_stride; + for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XY; i+=BLOCK_ROWS_TRANSPOSE_3D_XY) + { + out[index_out + i*WIDTH] = tile[get_local_id(0)][get_local_id(1) + i]; } - - /* Fonctionne avec des index: (64, 256, 256) (64, 1, 1) - uint gidX = get_global_id(0); - uint gidY = get_global_id(1); - uint gidZ = get_global_id(2); - for(uint i=gidX; i<WIDTH; i+=WGN) - { - output[i+gidY*(WIDTH+PADDING)+gidZ*(WIDTH+PADDING)*(WIDTH+PADDING)] = input[gidY+i*WIDTH+gidZ*WIDTH*WIDTH]; - } -*/ } - -__kernel void advec_init_transpose_3D_02(__global const float* input, - __global float* output) +__kernel void transpose_3D_xz_coalesced(__global const float* in, + __global float* out) { - /* Nouvelle version avec des index : (256*2, 256, 256/32) (64, 1, 1) - Transposition yxz -> zxy (ie. xyz->zyx) dans un buffer local 32,32,1*/ + uint xIndex = get_group_id(0) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(1); + uint zIndex = get_group_id(2) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; - int block_x = get_group_id(0); - int block_y = get_group_id(1); - int block_z = get_group_id(2); + xIndex = get_group_id(2) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(0); + zIndex = get_group_id(0) * TILE_DIM_TRANSPOSE_3D_XZ + get_local_id(2); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; - int local_x = get_local_id(0) % (GROUP_DIMX); - int local_z = get_local_id(0) / (GROUP_DIMX); + __local float tile[TILE_DIM_TRANSPOSE_3D_XZ][TILE_DIM_TRANSPOSE_3D_XZ][TILE_DIM_TRANSPOSE_3D_XZ]; - __local float tile[GROUP_DIMX*(GROUP_DIMX+1)]; - - int input_index = local_x + block_x*GROUP_DIMX + block_y*WIDTH + (local_z+block_z*GROUP_DIMX)*WIDTH*WIDTH; - int local_input = local_x + local_z*(GROUP_DIMX+1); - int local_input_stride = GROUP_DIMY*(GROUP_DIMX+1); - int global_input_stride = GROUP_DIMY*WIDTH*WIDTH; - - int output_index = local_x + block_z*GROUP_DIMX + block_y*(WIDTH + PADDING) + (local_z+block_x*GROUP_DIMX)*(WIDTH + PADDING)*(WIDTH + PADDING); - int local_output = local_z + local_x*(GROUP_DIMX+1); - int local_output_stride = GROUP_DIMY; - int global_output_stride = GROUP_DIMY*(WIDTH + PADDING)*(WIDTH + PADDING); - - for (uint i=0; i<16; i++){ - tile[local_input] = input[input_index]; - local_input += local_input_stride; - input_index += global_input_stride; +for(uint j=0; j<TILE_DIM_TRANSPOSE_3D_XZ; j+=BLOCK_ROWS_TRANSPOSE_3D_XZ) +{ + for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XZ; i+=BLOCK_ROWS_TRANSPOSE_3D_XZ) + { + tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH + j*WIDTH*WIDTH]; } - +} barrier(CLK_LOCAL_MEM_FENCE); - - for (uint i=0; i<16; i++){ - output[output_index] = tile[local_output]; - local_output += local_output_stride; - output_index += global_output_stride; +for(uint j=0; j<TILE_DIM_TRANSPOSE_3D_XZ; j+=BLOCK_ROWS_TRANSPOSE_3D_XZ) +{ + for(uint i=0; i<TILE_DIM_TRANSPOSE_3D_XZ; i+=BLOCK_ROWS_TRANSPOSE_3D_XZ) + { + out[index_out + i*WIDTH + j*WIDTH*WIDTH] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j]; } - - /* Fonctionne avec des index: (64, 256, 256) (64, 1, 1) - uint gidX = get_global_id(0); - uint gidY = get_global_id(1); - uint gidZ = get_global_id(2); - uint i; - for(i=gidX; i<WIDTH; i+=WGN) - { - output[i+gidY*(WIDTH+PADDING)+gidZ*(WIDTH+PADDING)*(WIDTH+PADDING)] = input[gidZ+gidY*WIDTH+i*WIDTH*WIDTH]; - } - */ } - - -#ifdef BASIC -/* Advection Basique */ -__kernel void advection(__global const float* gvelo, - __global float* ppos, - float dt, float min_position, float dx) -{ - uint gidX = get_global_id(0); - uint gidY = get_global_id(1); - uint gidZ = get_global_id(2); - int ind, i_ind, i_ind_p; - float invdx = 1.0f/dx; - uint i; - float v, y, p; - for(i=gidX; i<WIDTH; i+=WGN) - { - v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH]; - p = i*dx + 0.5f*dt*v; - ind = convert_int_rtn(p * invdx); - y = (p - convert_float(ind) * dx) * invdx; - i_ind = ((ind + WIDTH) % WIDTH); - i_ind_p = ((ind + 1) % WIDTH); - p=(gvelo[i_ind+gidY*WIDTH+gidZ*WIDTH*WIDTH]*(1.0f - y) + gvelo[i_ind_p+gidY*WIDTH+gidZ*WIDTH*WIDTH]*y)*dt + i*dx + min_position; - ppos[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = p; - } - } +} #else -/* Advection Opimisée */ - /*Optimisations : - - Memoire locale utilisée comme cache pour la vitesse - - IO en mémoire globale par float4 - - Calculs par float4 -Contraintes : - - Domaine de longueur WIDTH (dans la direction de splitting, ie. dans toutes les directions) - - WIDTH > 4*WGN - */ -__kernel void advection(__global const float* gvelo, - __global float* ppos, - float dt, float min_position, float dx) +__kernel void advection_2D_float4(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) { uint gidX = get_global_id(0); uint gidY = get_global_id(1); - uint gidZ = get_global_id(2); int4 ind, i_ind, i_ind_p; float invdx = 1.0f/dx; uint i; float4 v, vp, y, p, coord; __local float velocity_cache[WIDTH]; - for(i=gidX*4; i<WIDTH; i+=WGN*4) + for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4) { - v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + v = vload4((i+gidY*WIDTH)/4, gvelo); velocity_cache[i] = v.x; velocity_cache[i+1] = v.y; velocity_cache[i+2] = v.z; @@ -240,7 +246,7 @@ __kernel void advection(__global const float* gvelo, barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=WGN*4) + for(i=gidX*4; i<WIDTH; i+=NB_WI_ADVECTION*4) { v = vload4(i/4, velocity_cache); coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); @@ -252,124 +258,200 @@ __kernel void advection(__global const float* gvelo, v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y], velocity_cache[i_ind.z], velocity_cache[i_ind.w]); vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y], velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]); p = (v*(1.0f - y) + vp*y)*dt + coord + (float4)(min_position); - vstore4(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, ppos); + vstore4(p, (i+gidY*WIDTH)/4, ppos); } - } -#endif - +} -#ifdef BASIC -/* Remaillage basique */ -__kernel void remeshing(__global const float* ppos, - __global const float* pscal, - __global float* gscal, float min_position, float dx) +__kernel void advection_2D_basic(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) { uint gidX = get_global_id(0); uint gidY = get_global_id(1); - uint gidZ = get_global_id(2); + int ind, i_ind, i_ind_p; float invdx = 1.0f/dx; - int ind; - uint i, nb_part = WIDTH/WGN; - float p, s, y; - uint index; - - __local float gscal_loc[WIDTH]; - - for(i=gidX; i<WIDTH; i+=WGN) - gscal_loc[i] = 0.0f; - barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=1) + uint i; + float v, y, p; + for(i=gidX; i<WIDTH; i+=NB_WI_ADVECTION) { - p = ppos[(i + gidY*WIDTH + gidZ*WIDTH*WIDTH)] - min_position; + v = gvelo[i+gidY*WIDTH]; + p = i*dx + 0.5f*dt*v; ind = convert_int_rtn(p * invdx); y = (p - convert_float(ind) * dx) * invdx; - s = pscal[(i + gidY*(WIDTH+PADDING) + gidZ*(WIDTH+PADDING)*(WIDTH+PADDING))]; - index = convert_uint((ind - 2 + WIDTH) % WIDTH); - gscal_loc[index] += (alpha(y, s)); - index = (index + 1) % WIDTH; - gscal_loc[index] += (beta(y,s)); - index = (index + 1) % WIDTH; - gscal_loc[index] += (gamma(y,s)); - index = (index + 1) % WIDTH; - gscal_loc[index] += (delta(y,s)); - index = (index + 1) % WIDTH; - gscal_loc[index] += (eta(y,s)); - index = (index + 1) % WIDTH; - gscal_loc[index] += (zeta(y,s)); - } - barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX; i<WIDTH; i+=WGN) - gscal[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = gscal_loc[i]; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((ind + 1) % WIDTH); + p=(gvelo[i_ind+gidY*WIDTH]*(1.0f - y) + gvelo[i_ind_p+gidY*WIDTH]*y)*dt + i*dx + min_position; + ppos[i+gidY*WIDTH] = p; } -#else -/* Remaillage Optimisé*/ - /*Optimisations : - - Memoire locale utilisée comme cache pour le scalaire et la synchronisation des écritures - - IO en mémoire globale par float4 - - Calculs par float4 -Contraintes : - - Domaine de longueur WIDTH (dans la direction de splitting, ie. dans toutes les directions) - - WIDTH > 4*WGN - */ -__kernel void remeshing(__global const float* ppos, - __global const float* pscal, - __global float* gscal, float min_position, float dx) +} + + +__kernel void remeshing_2D_float4(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) { uint gidX = get_global_id(0); uint gidY = get_global_id(1); - uint gidZ = get_global_id(2); float invdx = 1.0f/dx; int4 ind; - uint i, nb_part = WIDTH/WGN; + uint i, nb_part = WIDTH/NB_WI_REMESHING; float4 p, s, y; + float gsx, gsy, gsz, gsw; uint4 index4; __local float gscal_loc[WIDTH]; - for(i=gidX; i<WIDTH; i+=WGN) + for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4)) +{ gscal_loc[i] = 0.0f; + gscal_loc[i+1] = 0.0f; + gscal_loc[i+2] = 0.0f; + gscal_loc[i+3] = 0.0f; +} barrier(CLK_LOCAL_MEM_FENCE); for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) { - p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position); - s = vload4((i + gidY*(WIDTH+PADDING) + gidZ*(WIDTH+PADDING)*(WIDTH+PADDING))/4, pscal); + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); ind = convert_int4_rtn(p * invdx); y = (p - convert_float4(ind) * dx) * invdx; index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); - gscal_loc[index4.x] += (alpha(y.x, s.x)); - gscal_loc[index4.y] += (alpha(y.y, s.y)); - gscal_loc[index4.z] += (alpha(y.z, s.z)); - gscal_loc[index4.w] += (alpha(y.w, s.w)); + gsx = gscal_loc[index4.x]; + gsy = gscal_loc[index4.y]; + gsz = gscal_loc[index4.z]; + gsw = gscal_loc[index4.w]; + gsx = gsx + (alpha(y.x, s.x)); + gsy = gsy + (alpha(y.y, s.y)); + gsz = gsz + (alpha(y.z, s.z)); + gsw = gsw + (alpha(y.w, s.w)); + gscal_loc[index4.x] = gsx; + gscal_loc[index4.y] = gsy; + gscal_loc[index4.z] = gsz; + gscal_loc[index4.w] = gsw; +barrier(CLK_LOCAL_MEM_FENCE); index4 = (index4 + 1) % WIDTH; - gscal_loc[index4.x] += (beta(y.x,s.x)); - gscal_loc[index4.y] += (beta(y.y,s.y)); - gscal_loc[index4.z] += (beta(y.z,s.z)); - gscal_loc[index4.w] += (beta(y.w,s.w)); + gsx = gscal_loc[index4.x]; + gsy = gscal_loc[index4.y]; + gsz = gscal_loc[index4.z]; + gsw = gscal_loc[index4.w]; + gsx = gsx + (beta(y.x, s.x)); + gsy = gsy + (beta(y.y, s.y)); + gsz = gsz + (beta(y.z, s.z)); + gsw = gsw + (beta(y.w, s.w)); + gscal_loc[index4.x] = gsx; + gscal_loc[index4.y] = gsy; + gscal_loc[index4.z] = gsz; + gscal_loc[index4.w] = gsw; +barrier(CLK_LOCAL_MEM_FENCE); index4 = (index4 + 1) % WIDTH; - gscal_loc[index4.x] += (gamma(y.x, s.x)); - gscal_loc[index4.y] += (gamma(y.y, s.y)); - gscal_loc[index4.z] += (gamma(y.z, s.z)); - gscal_loc[index4.w] += (gamma(y.w, s.w)); + gsx = gscal_loc[index4.x]; + gsy = gscal_loc[index4.y]; + gsz = gscal_loc[index4.z]; + gsw = gscal_loc[index4.w]; + gsx = gsx + (gamma(y.x, s.x)); + gsy = gsy + (gamma(y.y, s.y)); + gsz = gsz + (gamma(y.z, s.z)); + gsw = gsw + (gamma(y.w, s.w)); + gscal_loc[index4.x] = gsx; + gscal_loc[index4.y] = gsy; + gscal_loc[index4.z] = gsz; + gscal_loc[index4.w] = gsw; +barrier(CLK_LOCAL_MEM_FENCE); index4 = (index4 + 1) % WIDTH; - gscal_loc[index4.x] += (delta(y.x, s.x)); - gscal_loc[index4.y] += (delta(y.y, s.y)); - gscal_loc[index4.z] += (delta(y.z, s.z)); - gscal_loc[index4.w] += (delta(y.w, s.w)); + gsx = gscal_loc[index4.x]; + gsy = gscal_loc[index4.y]; + gsz = gscal_loc[index4.z]; + gsw = gscal_loc[index4.w]; + gsx = gsx + (delta(y.x, s.x)); + gsy = gsy + (delta(y.y, s.y)); + gsz = gsz + (delta(y.z, s.z)); + gsw = gsw + (delta(y.w, s.w)); + gscal_loc[index4.x] = gsx; + gscal_loc[index4.y] = gsy; + gscal_loc[index4.z] = gsz; + gscal_loc[index4.w] = gsw; +barrier(CLK_LOCAL_MEM_FENCE); index4 = (index4 + 1) % WIDTH; - gscal_loc[index4.x] += (eta(y.x, s.x)); - gscal_loc[index4.y] += (eta(y.y, s.y)); - gscal_loc[index4.z] += (eta(y.z, s.z)); - gscal_loc[index4.w] += (eta(y.w, s.w)); + gsx = gscal_loc[index4.x]; + gsy = gscal_loc[index4.y]; + gsz = gscal_loc[index4.z]; + gsw = gscal_loc[index4.w]; + gsx = gsx + (eta(y.x, s.x)); + gsy = gsy + (eta(y.y, s.y)); + gsz = gsz + (eta(y.z, s.z)); + gsw = gsw + (eta(y.w, s.w)); + gscal_loc[index4.x] = gsx; + gscal_loc[index4.y] = gsy; + gscal_loc[index4.z] = gsz; + gscal_loc[index4.w] = gsw; +barrier(CLK_LOCAL_MEM_FENCE); index4 = (index4 + 1) % WIDTH; - gscal_loc[index4.x] += (zeta(y.x, s.x)); - gscal_loc[index4.y] += (zeta(y.y, s.y)); - gscal_loc[index4.z] += (zeta(y.z, s.z)); - gscal_loc[index4.w] += (zeta(y.w, s.w)); + gsx = gscal_loc[index4.x]; + gsy = gscal_loc[index4.y]; + gsz = gscal_loc[index4.z]; + gsw = gscal_loc[index4.w]; + gsx = gsx + (zeta(y.x, s.x)); + gsy = gsy + (zeta(y.y, s.y)); + gsz = gsz + (zeta(y.z, s.z)); + gsw = gsw + (zeta(y.w, s.w)); + gscal_loc[index4.x] = gsx; + gscal_loc[index4.y] = gsy; + gscal_loc[index4.z] = gsz; + gscal_loc[index4.w] = gsw; } barrier(CLK_LOCAL_MEM_FENCE); - for(i=gidX*4; i<WIDTH; i+=(WGN*4)) - vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal); + for(i=gidX*4; i<WIDTH; i+=(NB_WI_REMESHING*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH)/4, gscal); +} + +__kernel void copyLocMem_2D(__global const float* in, + __global float* out) +{ + uint xIndex = get_group_id(0) * TILE_DIM_COPY_2D + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_COPY_2D + get_local_id(1); + uint index = xIndex + yIndex * WIDTH; + + __local float tile[TILE_DIM_COPY_2D][TILE_DIM_COPY_2D]; + + for(uint i=0; i<TILE_DIM_COPY_2D; i+=BLOCK_ROWS_COPY_2D) + { + tile[get_local_id(1)+i][get_local_id(0)] = in[index + i*WIDTH]; + } +barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_COPY_2D; i+=BLOCK_ROWS_COPY_2D) + { + out[index + i*WIDTH] = tile[get_local_id(1)+i][get_local_id(0)]; + } +} + +__kernel void transpose_2D_coalesced_locPad_Diag(__global const float* in, + __global float* out) +{ + uint group_id_x = get_group_id(0); + uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_TRANSPOSE_2D + get_local_id(0); + uint yIndex = group_id_y * TILE_DIM_TRANSPOSE_2D + get_local_id(1); + uint index_in = xIndex + yIndex * WIDTH; + + xIndex = group_id_y * TILE_DIM_TRANSPOSE_2D + get_local_id(0); + yIndex = group_id_x * TILE_DIM_TRANSPOSE_2D + get_local_id(1); + uint index_out = xIndex + yIndex * WIDTH; + + __local float tile[TILE_DIM_TRANSPOSE_2D][TILE_DIM_TRANSPOSE_2D+1]; + + for(uint i=0; i<TILE_DIM_TRANSPOSE_2D; i+=BLOCK_ROWS_TRANSPOSE_2D) + { + tile[get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH]; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_TRANSPOSE_2D; i+=BLOCK_ROWS_TRANSPOSE_2D) + { + out[index_out + i*WIDTH] = tile[get_local_id(0)][get_local_id(1) + i]; + } } + + #endif @@ -379,6 +461,10 @@ __kernel void remeshing(__global const float* ppos, + + + + /* // volume computing diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..008be61a924d04004e424cf66c0dadd509ac59c2 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_builtin.cl @@ -0,0 +1,34 @@ +/** + * 2D advection kernel. + * + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + int ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float v, y, p; + for(i=gidX; i<WIDTH; i+=WGN) + { + v = gvelo[i+gidY*WIDTH]; + p = fma((float)(0.5*dt),v,i*dx); + ind = convert_int_rtn(p * invdx); + y = (fma(- convert_float(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((ind + 1) % WIDTH); + p=fma(mix(gvelo[i_ind+gidY*WIDTH],gvelo[i_ind_p+gidY*WIDTH],y),dt,i*dx + min_position); + ppos[i+gidY*WIDTH] = p; + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt2_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt2_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..d99fe53429fe274cca78682038c06150eda981b9 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt2_builtin.cl @@ -0,0 +1,49 @@ +/** + * 2D advection kernel. + * + * Memory reads and writes are performed using 2 element OpenCL vectors (float2 or double2). + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + int2 ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float2 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; + + for(i=gidX*2; i<WIDTH; i+=WGN*2) + { + v = vload2((i+gidY*WIDTH)/2, gvelo); + velocity_cache[i] = v.x; + velocity_cache[i+1] = v.y; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*2; i<WIDTH; i+=WGN*2) + { + v = vload2(i/2, velocity_cache); + coord = (float2)(i*dx, (i+1)*dx); + p = fma((float2)(0.5*dt),v,coord); + ind = convert_int2_rtn(p * invdx); + y = (fma(- convert_float2(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float2)(velocity_cache[i_ind.x], velocity_cache[i_ind.y]); + vp = (float2)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y]); + p=fma(mix(v,vp,y),dt,coord + (float2)(min_position)); + vstore2(p, (i+gidY*WIDTH)/2, ppos); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt4_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..c9d9637ad6728e8b5346532d706618100e647324 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt4_builtin.cl @@ -0,0 +1,53 @@ +/** + * 2D advection kernel. + * + * Memory reads and writes are performed using 4 element OpenCL vectors (float4 or double4). + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + int4 ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float4 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=WGN*4) + { + v = vload4((i+gidY*WIDTH)/4, gvelo); + velocity_cache[i] = v.x; + velocity_cache[i+1] = v.y; + velocity_cache[i+2] = v.z; + velocity_cache[i+3] = v.w; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*4; i<WIDTH; i+=WGN*4) + { + v = vload4(i/4, velocity_cache); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y], + velocity_cache[i_ind.z], velocity_cache[i_ind.w]); + vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y], + velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]); + p=fma(mix(v,vp,y),dt,coord + (float4)(min_position)); + vstore4(p, (i+gidY*WIDTH)/4, ppos); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt8_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt8_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..cab7c1dd42d87d26fb21c864a80211d430d2a460 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_2D_opt8_builtin.cl @@ -0,0 +1,76 @@ +/** + * 2D advection kernel. + * + * Memory reads and writes are performed using 8 element OpenCL vectors (float8 or double8). + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + int8 ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float8 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; + + for(i=gidX*8; i<WIDTH; i+=WGN*8) + { + v = vload8((i+gidY*WIDTH)/8, gvelo); + velocity_cache[i] = v.s0; + velocity_cache[i+1] = v.s1; + velocity_cache[i+2] = v.s2; + velocity_cache[i+3] = v.s3; + velocity_cache[i+4] = v.s4; + velocity_cache[i+5] = v.s5; + velocity_cache[i+6] = v.s6; + velocity_cache[i+7] = v.s7; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*8; i<WIDTH; i+=WGN*8) + { + v = vload8(i/8, velocity_cache); + coord = (float8)(i*dx, + (i+1)*dx, + (i+2)*dx, + (i+3)*dx, + (i+4)*dx, + (i+5)*dx, + (i+6)*dx, + (i+7)*dx); + p = fma((float8)(0.5*dt),v,coord); + ind = convert_int8_rtn(p * invdx); + y = (fma(- convert_float8(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float8)(velocity_cache[i_ind.s0], + velocity_cache[i_ind.s1], + velocity_cache[i_ind.s2], + velocity_cache[i_ind.s3], + velocity_cache[i_ind.s4], + velocity_cache[i_ind.s5], + velocity_cache[i_ind.s6], + velocity_cache[i_ind.s7]); + vp = (float8)(velocity_cache[i_ind_p.s0], + velocity_cache[i_ind_p.s1], + velocity_cache[i_ind_p.s2], + velocity_cache[i_ind_p.s3], + velocity_cache[i_ind_p.s4], + velocity_cache[i_ind_p.s5], + velocity_cache[i_ind_p.s6], + velocity_cache[i_ind_p.s7]); + p = fma(mix(v,vp,y),dt,coord + (float8)(min_position)); + vstore8(p, (i+gidY*WIDTH)/8, ppos); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..c54dd72679192fcca254c170d8669d3cbe80742f --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_builtin.cl @@ -0,0 +1,37 @@ +/** + * 3D advection kernel. + * + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + int ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float v, y, p; + for(i=gidX; i<WIDTH; i+=WGN) + { + v = gvelo[i+gidY*WIDTH+gidZ*WIDTH*WIDTH]; + p = fma((float)(0.5*dt),v,i*dx); + ind = convert_int_rtn(p * invdx); + y = (fma(- convert_float(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((ind + 1) % WIDTH); + p=fma(mix(gvelo[i_ind+gidY*WIDTH+gidZ*WIDTH*WIDTH], + gvelo[i_ind_p+gidY*WIDTH+gidZ*WIDTH*WIDTH],y), + dt,i*dx + min_position); + ppos[i+gidY*WIDTH+gidZ*WIDTH*WIDTH] = p; + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt2_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt2_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..3891bb10dcdc5442cb14e4857f385eb4f4c7f4c8 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt2_builtin.cl @@ -0,0 +1,50 @@ +/** + * 3D advection kernel. + * + * Memory reads and writes are performed using 2 element OpenCL vectors (float2 or double2). + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + int2 ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float2 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; + + for(i=gidX*2; i<WIDTH; i+=WGN*2) + { + v = vload2((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/2, gvelo); + velocity_cache[i] = v.x; + velocity_cache[i+1] = v.y; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*2; i<WIDTH; i+=WGN*2) + { + v = vload2(i/2, velocity_cache); + coord = (float2)(i*dx, (i+1)*dx); + p = fma((float2)(0.5*dt),v,coord); + ind = convert_int2_rtn(p * invdx); + y = (fma(- convert_float2(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float2)(velocity_cache[i_ind.x], velocity_cache[i_ind.y]); + vp = (float2)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y]); + p=fma(mix(v,vp,y),dt,coord + (float2)(min_position)); + vstore2(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/2, ppos); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt4_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..49ae8e1d4bb4ab796e0b659eaeadc7e8177bb0eb --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt4_builtin.cl @@ -0,0 +1,54 @@ +/** + * 3D advection kernel. + * + * Memory reads and writes are performed using 4 element OpenCL vectors (float4 or double4). + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + int4 ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float4 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=WGN*4) + { + v = vload4((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, gvelo); + velocity_cache[i] = v.x; + velocity_cache[i+1] = v.y; + velocity_cache[i+2] = v.z; + velocity_cache[i+3] = v.w; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*4; i<WIDTH; i+=WGN*4) + { + v = vload4(i/4, velocity_cache); + coord = (float4)(i*dx, (i+1)*dx, (i+2)*dx, (i+3)*dx); + p = fma((float4)(0.5*dt),v,coord); + ind = convert_int4_rtn(p * invdx); + y = (fma(- convert_float4(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float4)(velocity_cache[i_ind.x], velocity_cache[i_ind.y], + velocity_cache[i_ind.z], velocity_cache[i_ind.w]); + vp = (float4)(velocity_cache[i_ind_p.x], velocity_cache[i_ind_p.y], + velocity_cache[i_ind_p.z], velocity_cache[i_ind_p.w]); + p=fma(mix(v,vp,y),dt,coord + (float4)(min_position)); + vstore4(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/4, ppos); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt8_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt8_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..db3fb09a765970c6a15b7c3f43a99fc3e92c46bb --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/advection_3D_opt8_builtin.cl @@ -0,0 +1,77 @@ +/** + * 3D advection kernel. + * + * Memory reads and writes are performed using 8 element OpenCL vectors (float8 or double8). + * Computations are done using builtin functions on vector types. + * A local array is used un local memory as a cache for velocity. + * + * @param gvelo Velocity. + * @param ppos Particle position. + * @param dt Time step. + * @param min_position Domain lower point. + * @param dx Space step. + */ +__kernel void advection(__global const float* gvelo, + __global float* ppos, + float dt, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + int8 ind, i_ind, i_ind_p; + float invdx = 1.0/dx; + uint i; + float8 v, vp, y, p, coord; + __local float velocity_cache[WIDTH]; + + for(i=gidX*8; i<WIDTH; i+=WGN*8) + { + v = vload8((i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/8, gvelo); + velocity_cache[i] = v.s0; + velocity_cache[i+1] = v.s1; + velocity_cache[i+2] = v.s2; + velocity_cache[i+3] = v.s3; + velocity_cache[i+4] = v.s4; + velocity_cache[i+5] = v.s5; + velocity_cache[i+6] = v.s6; + velocity_cache[i+7] = v.s7; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(i=gidX*8; i<WIDTH; i+=WGN*8) + { + v = vload8(i/8, velocity_cache); + coord = (float8)(i*dx, + (i+1)*dx, + (i+2)*dx, + (i+3)*dx, + (i+4)*dx, + (i+5)*dx, + (i+6)*dx, + (i+7)*dx); + p = fma((float8)(0.5*dt),v,coord); + ind = convert_int8_rtn(p * invdx); + y = (fma(- convert_float8(ind),dx,p))* invdx ; + i_ind = ((ind + WIDTH) % WIDTH); + i_ind_p = ((i_ind + 1) % WIDTH); + v = (float8)(velocity_cache[i_ind.s0], + velocity_cache[i_ind.s1], + velocity_cache[i_ind.s2], + velocity_cache[i_ind.s3], + velocity_cache[i_ind.s4], + velocity_cache[i_ind.s5], + velocity_cache[i_ind.s6], + velocity_cache[i_ind.s7]); + vp = (float8)(velocity_cache[i_ind_p.s0], + velocity_cache[i_ind_p.s1], + velocity_cache[i_ind_p.s2], + velocity_cache[i_ind_p.s3], + velocity_cache[i_ind_p.s4], + velocity_cache[i_ind_p.s5], + velocity_cache[i_ind_p.s6], + velocity_cache[i_ind_p.s7]); + p = fma(mix(v,vp,y),dt,coord + (float8)(min_position)); + vstore8(p, (i+gidY*WIDTH+gidZ*WIDTH*WIDTH)/8, ppos); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_2D.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_2D.cl new file mode 100644 index 0000000000000000000000000000000000000000..1af25c6d58beb24ead4e25dfbd21cb78adc64613 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/copy_2D.cl @@ -0,0 +1,14 @@ + +__kernel void copy(__global const float* in, + __global float* out) +{ + uint xIndex = get_group_id(0) * TILE_DIM_COPY + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index = xIndex + yIndex * WIDTH + zIndex*WIDTH*WIDTH; + + for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY) + { + out[index + i*WIDTH] = in[index + i*WIDTH]; + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_2D_opt2.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_2D_opt2.cl new file mode 100644 index 0000000000000000000000000000000000000000..4cff9b8e204cb4f43fb3525ec19d6ac66d401fc9 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/copy_2D_opt2.cl @@ -0,0 +1,17 @@ + +__kernel void copy(__global const float* in, + __global float* out) +{ + uint xIndex = (get_group_id(0) * TILE_DIM_COPY + get_local_id(0)*2); + uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1); + uint index = xIndex + yIndex * WIDTH; + float x,y,z,w; + + for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY) + { + x = in[index + i*WIDTH]; + y = in[index + 1 + i*WIDTH]; + out[index + i*WIDTH] = x; + out[index + 1 + i*WIDTH] = y; + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_3D_locMem.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_locMem.cl new file mode 100644 index 0000000000000000000000000000000000000000..b13c4b735ca5000ba8697cdfd43aa0cf024a789c --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_locMem.cl @@ -0,0 +1,21 @@ + +__kernel void copy(__global const float* in, + __global float* out) +{ + uint xIndex = get_group_id(0) * TILE_DIM_COPY + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index = xIndex + yIndex * WIDTH + zIndex*WIDTH*WIDTH; + + __local float tile[TILE_DIM_COPY][TILE_DIM_COPY]; + + for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY) + { + tile[get_local_id(1)+i][get_local_id(0)] = in[index + i*WIDTH]; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY) + { + out[index + i*WIDTH] = tile[get_local_id(1)+i][get_local_id(0)]; + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/copy_3D_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_opt4.cl new file mode 100644 index 0000000000000000000000000000000000000000..e5bec1c04df53e5b901db18b6c959bbc4443fb55 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/copy_3D_opt4.cl @@ -0,0 +1,22 @@ + +__kernel void copy(__global const float* in, + __global float* out) +{ + uint xIndex = (get_group_id(0) * TILE_DIM_COPY + get_local_id(0)*4); + uint yIndex = get_group_id(1) * TILE_DIM_COPY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index = xIndex + yIndex * WIDTH + zIndex*WIDTH*WIDTH; + float x,y,z,w; + + for(uint i=0; i<TILE_DIM_COPY; i+=BLOCK_ROWS_COPY) + { + x = in[index + i*WIDTH]; + y = in[index + 1 + i*WIDTH]; + z = in[index + 2 + i*WIDTH]; + w = in[index + 3 + i*WIDTH]; + out[index + i*WIDTH] = x; + out[index + 1 + i*WIDTH] = y; + out[index + 2 + i*WIDTH] = z; + out[index + 3 + i*WIDTH] = w; + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..895250723bdd1eaa04357817122fd5205eb10e1c --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_noBC.cl @@ -0,0 +1,90 @@ +/** + * Local memory as a 2D array to avoid bank conflicts. + * + * @param id Index 1D + * @param nb_part Particle number per work-item + * + * @return Index in a 2D space. + */ +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} + +/** + * 2D Remeshing kernel. + * + * @param ppos Particle position + * @param pscal Particle scalar + * @param gscal Grid scalar + * @param min_position Domain lower point + * @param dx Space step + */ +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gscal_loc[noBC_id(index4.x,nb_part)] += (alpha(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (alpha(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (alpha(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (alpha(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (beta(y.x,s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (beta(y.y,s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (beta(y.z,s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (beta(y.w,s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (gamma(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (gamma(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (gamma(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (gamma(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (delta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (delta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (delta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (delta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (eta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (eta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (eta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (eta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gscal_loc[noBC_id(index4.x,nb_part)] += (zeta(y.x, s.x)); + gscal_loc[noBC_id(index4.y,nb_part)] += (zeta(y.y, s.y)); + gscal_loc[noBC_id(index4.z,nb_part)] += (zeta(y.z, s.z)); + gscal_loc[noBC_id(index4.w,nb_part)] += (zeta(y.w, s.w)); + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i,nb_part)],gscal_loc[noBC_id(i+1,nb_part)], gscal_loc[noBC_id(i+2,nb_part)], gscal_loc[noBC_id(i+3,nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..bd831d5abf3d791b984c67d3e4dee9d3b447f004 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4.cl @@ -0,0 +1,76 @@ + +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[index4.x]+= gs.x; + gscal_loc[index4.y]+= gs.y; + gscal_loc[index4.z]+= gs.z; + gscal_loc[index4.w]+= gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[i],gscal_loc[i+1], gscal_loc[i+2], gscal_loc[i+3]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4_noBC.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4_noBC.cl new file mode 100644 index 0000000000000000000000000000000000000000..014edd31671f10731a58ba826b876bb5bf612fa1 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_2D_opt4_private4_noBC.cl @@ -0,0 +1,79 @@ + +inline uint noBC_id(int id, int nb_part){ + return (id%nb_part)*WGN+(id/nb_part); +} +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[noBC_id(index4.x, nb_part)] += gs.x; + gscal_loc[noBC_id(index4.y, nb_part)] += gs.y; + gscal_loc[noBC_id(index4.z, nb_part)] += gs.z; + gscal_loc[noBC_id(index4.w, nb_part)] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[noBC_id(i, nb_part)],gscal_loc[noBC_id(i+1, nb_part)], gscal_loc[noBC_id(i+2, nb_part)], gscal_loc[noBC_id(i+3, nb_part)]), (i + gidY*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/remeshing_3D_opt4_private4.cl b/HySoP/hysop/particular_solvers/gpu_src/remeshing_3D_opt4_private4.cl new file mode 100644 index 0000000000000000000000000000000000000000..eb67791af74bdfb46e79c38c87fcc216d5fc6032 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/remeshing_3D_opt4_private4.cl @@ -0,0 +1,81 @@ + +__kernel void remeshing(__global const float* ppos, + __global const float* pscal, + __global float* gscal, float min_position, float dx) +{ + uint gidX = get_global_id(0); + uint gidY = get_global_id(1); + uint gidZ = get_global_id(2); + float invdx = 1.0/dx; + int4 ind; + uint i, nb_part = WIDTH/WGN; + float4 p, s, y, gs; + uint4 index4; + + __local float gscal_loc[WIDTH]; + + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + { + gscal_loc[i] = 0.0; + gscal_loc[i+1] = 0.0; + gscal_loc[i+2] = 0.0; + gscal_loc[i+3] = 0.0; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*nb_part; i<(gidX + 1)*nb_part; i+=4) + { + p = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, ppos) - (float4)(min_position); + s = vload4((i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, pscal); + ind = convert_int4_rtn(p * invdx); + y = (p - convert_float4(ind) * dx) * invdx; + index4 = convert_uint4((ind - 2 + WIDTH) % WIDTH); + gs = (alpha(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (beta(y, s)); + gscal_loc[index4.x]+= gs.x; + gscal_loc[index4.y]+= gs.y; + gscal_loc[index4.z]+= gs.z; + gscal_loc[index4.w]+= gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (gamma(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (delta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (eta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + index4 = (index4 + 1) % WIDTH; + gs = (zeta(y, s)); + gscal_loc[index4.x] += gs.x; + gscal_loc[index4.y] += gs.y; + gscal_loc[index4.z] += gs.z; + gscal_loc[index4.w] += gs.w; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for(i=gidX*4; i<WIDTH; i+=(WGN*4)) + vstore4((float4)(gscal_loc[i], + gscal_loc[i+1], + gscal_loc[i+2], + gscal_loc[i+3]), + (i + gidY*WIDTH + gidZ*WIDTH*WIDTH)/4, gscal); +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_2D_xy_coalesced_locPad_Diag_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_2D_xy_coalesced_locPad_Diag_opt4.cl new file mode 100644 index 0000000000000000000000000000000000000000..b6853e37ba37b97e35fbf8a7d224acfcbda7c5a2 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_2D_xy_coalesced_locPad_Diag_opt4.cl @@ -0,0 +1,36 @@ + +__kernel void transpose_xy(__global const float* in, + __global float* out) +{ + float4 temp; + uint group_id_x = get_group_id(0); + uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*4; + uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1); + uint index_in = xIndex + yIndex * WIDTH; + + xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*4; + yIndex = group_id_x * TILE_DIM_XY + get_local_id(1); + uint index_out = xIndex + yIndex * WIDTH; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+1]; + + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = vload4((index_in + i*WIDTH)/4, in); + tile[get_local_id(1) + i][get_local_id(0)*4] = temp.x; + tile[get_local_id(1) + i][get_local_id(0)*4+1] = temp.y; + tile[get_local_id(1) + i][get_local_id(0)*4+2] = temp.z; + tile[get_local_id(1) + i][get_local_id(0)*4+3] = temp.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = (float4)(tile[get_local_id(0)*4][get_local_id(1) + i], + tile[get_local_id(0)*4+1][get_local_id(1) + i], + tile[get_local_id(0)*4+2][get_local_id(1) + i], + tile[get_local_id(0)*4+3][get_local_id(1) + i]); + vstore4(temp, (index_out + i*WIDTH)/4, out); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl new file mode 100644 index 0000000000000000000000000000000000000000..f94717ae5827d2b0bdd50d9d27da32336e935aef --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt2.cl @@ -0,0 +1,33 @@ + +__kernel void transpose_xy(__global const float* in, + __global float* out) +{ + float2 temp; + uint group_id_x = get_group_id(0); + uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*2; + uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*2; + yIndex = group_id_x * TILE_DIM_XY + get_local_id(1); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+1]; + + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = vload2((index_in + i*WIDTH)/2, in); + tile[get_local_id(1) + i][get_local_id(0)*2] = temp.x; + tile[get_local_id(1) + i][get_local_id(0)*2+1] = temp.y; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = (float2)(tile[get_local_id(0)*2][get_local_id(1) + i], + tile[get_local_id(0)*2+1][get_local_id(1) + i]); + vstore2(temp, (index_out + i*WIDTH)/2, out); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl new file mode 100644 index 0000000000000000000000000000000000000000..e8afa56b9744f41d70327adde3541c83d0f17806 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_2Dslice_opt4.cl @@ -0,0 +1,37 @@ + +__kernel void transpose_xy(__global const float* in, + __global float* out) +{ + float4 temp; + uint group_id_x = get_group_id(0); + uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*4; + uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*4; + yIndex = group_id_x * TILE_DIM_XY + get_local_id(1); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+1]; + + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = vload4((index_in + i*WIDTH)/4, in); + tile[get_local_id(1) + i][get_local_id(0)*4] = temp.x; + tile[get_local_id(1) + i][get_local_id(0)*4+1] = temp.y; + tile[get_local_id(1) + i][get_local_id(0)*4+2] = temp.z; + tile[get_local_id(1) + i][get_local_id(0)*4+3] = temp.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = (float4)(tile[get_local_id(0)*4][get_local_id(1) + i], + tile[get_local_id(0)*4+1][get_local_id(1) + i], + tile[get_local_id(0)*4+2][get_local_id(1) + i], + tile[get_local_id(0)*4+3][get_local_id(1) + i]); + vstore4(temp, (index_out + i*WIDTH)/4, out); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_opt2.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_opt2.cl new file mode 100644 index 0000000000000000000000000000000000000000..f94717ae5827d2b0bdd50d9d27da32336e935aef --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xy_coalesced_locPad_Diag_opt2.cl @@ -0,0 +1,33 @@ + +__kernel void transpose_xy(__global const float* in, + __global float* out) +{ + float2 temp; + uint group_id_x = get_group_id(0); + uint group_id_y = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_XY + get_local_id(0)*2; + uint yIndex = group_id_y * TILE_DIM_XY + get_local_id(1); + uint zIndex = get_global_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + xIndex = group_id_y * TILE_DIM_XY + get_local_id(0)*2; + yIndex = group_id_x * TILE_DIM_XY + get_local_id(1); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + __local float tile[TILE_DIM_XY][TILE_DIM_XY+1]; + + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = vload2((index_in + i*WIDTH)/2, in); + tile[get_local_id(1) + i][get_local_id(0)*2] = temp.x; + tile[get_local_id(1) + i][get_local_id(0)*2+1] = temp.y; + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint i=0; i<TILE_DIM_XY; i+=BLOCK_ROWS_XY) + { + temp = (float2)(tile[get_local_id(0)*2][get_local_id(1) + i], + tile[get_local_id(0)*2+1][get_local_id(1) + i]); + vstore2(temp, (index_out + i*WIDTH)/2, out); + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl new file mode 100644 index 0000000000000000000000000000000000000000..67e8891bd6910bb0d2463ec367f10a11d8aa5ba8 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_Diag_bis_3DBlock.cl @@ -0,0 +1,34 @@ + +__kernel void transpose_xz(__global const float* in, + __global float* out) +{ + uint group_id_x = get_group_id(0); + uint group_id_z = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_XZ + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_XZ + get_local_id(1); + uint zIndex = group_id_z * TILE_DIM_XZ + get_local_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + xIndex = group_id_z * TILE_DIM_XZ + get_local_id(0); + zIndex = group_id_x * TILE_DIM_XZ + get_local_id(2); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ]; + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ) + { + for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) + { + tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH + j*WIDTH*WIDTH]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ) + { + for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) + { + out[index_out + i*WIDTH + j*WIDTH*WIDTH] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j]; + } + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl new file mode 100644 index 0000000000000000000000000000000000000000..a112ea1993e8885aa57c845d09120b466a57d21f --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/transpose_3D_xz_coalesced_locPad_Diag_bis_3DBlock.cl @@ -0,0 +1,34 @@ + +__kernel void transpose_xz(__global const float* in, + __global float* out) +{ + uint group_id_x = get_group_id(0); + uint group_id_z = (get_group_id(0) + get_group_id(2)) % get_num_groups(0); + + uint xIndex = group_id_x * TILE_DIM_XZ + get_local_id(0); + uint yIndex = get_group_id(1) * TILE_DIM_XZ + get_local_id(1); + uint zIndex = group_id_z * TILE_DIM_XZ + get_local_id(2); + uint index_in = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + xIndex = group_id_z * TILE_DIM_XZ + get_local_id(0); + zIndex = group_id_x * TILE_DIM_XZ + get_local_id(2); + uint index_out = xIndex + yIndex * WIDTH + zIndex * WIDTH * WIDTH; + + __local float tile[TILE_DIM_XZ][TILE_DIM_XZ][TILE_DIM_XZ+1]; + + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ) + { + for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) + { + tile[get_local_id(2) + j][get_local_id(1) + i][get_local_id(0)] = in[index_in + i*WIDTH + j*WIDTH*WIDTH]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + for(uint j=0; j<TILE_DIM_XZ; j+=BLOCK_ROWS_XZ) + { + for(uint i=0; i<TILE_DIM_XZ; i+=BLOCK_ROWS_XZ) + { + out[index_out + i*WIDTH + j*WIDTH*WIDTH] = tile[get_local_id(0)][get_local_id(1)+i][get_local_id(2) + j]; + } + } +} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime.cl new file mode 100644 index 0000000000000000000000000000000000000000..cd21a32e74bff85d463b68da55d44bd8f83749ee --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime.cl @@ -0,0 +1,20 @@ +/** + * M6prime weights. + * + * @param y Distance between the particle and left-hand grid point + * @param s Scalar of the particle + * + * @return weights + */ +inline float alpha(float y, float s){ + return (y * (y * (y * (y * (13.0 - 5.0 * y) - 9.0) - 1.0) + 2.0) / 24.0) * s;} +inline float beta(float y, float s){ + return (y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0) / 24.0)*s;} +inline float gamma(float y, float s){ + return (y * y * (y * (y * (126.0 - 50.0 * y) - 70.0) - 30.0) / 24.0 + 1.0)*s;} +inline float delta(float y, float s){ + return (y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0) / 24.0)*s;} +inline float eta(float y, float s){ + return (y * (y * (y * (y * (61.0 - 25.0 * y) - 33.0) - 1.0) - 2.0) / 24.0)*s;} +inline float zeta(float y, float s){ + return (y * y * y * (y * (5.0 * y - 12.0) + 7.0) / 24.0)*s;} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..f63a419523ab10048ed59ca85cbb25e3e1326da1 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_builtin.cl @@ -0,0 +1,13 @@ + +inline float alpha(float y, float s){ + return (y * fma(y , fma(y , fma(y , fma(-5.0, y, 13.0),- 9.0),- 1.0), 2.0) / 24.0) * s;} +inline float beta(float y, float s){ + return (y * fma(y , fma(y , fma(y , fma(25.0 , y ,- 64.0), 39.0) , 16.0), - 16.0) / 24.0)*s;} +inline float gamma(float y, float s){ + return (y * y * fma(y , fma(y , fma( - 50.0 , y, 126.0) ,- 70.0) ,- 30.0) / 24.0 + 1.0)*s;} +inline float delta(float y, float s){ + return (y * fma(y , fma(y, fma(y, fma(50.0, y, - 124.0), 66.0) , 16.0) , 16.0) / 24.0)*s;} +inline float eta(float y, float s){ + return (y * fma(y , fma(y , fma(y , fma(- 25.0 , y, 61.0), - 33.0), - 1.0), - 2.0) / 24.0)*s;} +inline float zeta(float y, float s){ + return (y * y * y * fma(y , fma(5.0 , y ,- 12.0) , 7.0) / 24.0)*s;} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4.cl new file mode 100644 index 0000000000000000000000000000000000000000..865327f319211b18015e1ed556b7456c54b6cb7c --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4.cl @@ -0,0 +1,13 @@ + +inline float4 alpha(float4 y, float4 s){ + return (y * (y * (y * (y * (13.0 - 5.0 * y) - 9.0) - 1.0) + 2.0) / 24.0) * s;} +inline float4 beta(float4 y, float4 s){ + return (y * (y * (y * (y * (25.0 * y - 64.0) + 39.0) + 16.0) - 16.0) / 24.0)*s;} +inline float4 gamma(float4 y, float4 s){ + return (y * y * (y * (y * (126.0 - 50.0 * y) - 70.0) - 30.0) / 24.0 + 1.0)*s;} +inline float4 delta(float4 y, float4 s){ + return (y * (y * (y * (y * (50.0 * y - 124.0) + 66.0) + 16.0) + 16.0) / 24.0)*s;} +inline float4 eta(float4 y, float4 s){ + return (y * (y * (y * (y * (61.0 - 25.0 * y) - 33.0) - 1.0) - 2.0) / 24.0)*s;} +inline float4 zeta(float4 y, float4 s){ + return (y * y * y * (y * (5.0 * y - 12.0) + 7.0) / 24.0)*s;} diff --git a/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4_builtin.cl b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4_builtin.cl new file mode 100644 index 0000000000000000000000000000000000000000..3a4c2eaeb63ee4317914062ddfb07a0fd38bb6a3 --- /dev/null +++ b/HySoP/hysop/particular_solvers/gpu_src/weights_m6prime_opt4_builtin.cl @@ -0,0 +1,13 @@ + +inline float4 alpha(float4 y, float4 s){ + return (y * fma(y , fma(y , fma(y , fma(-5.0, y, 13.0),- 9.0),- 1.0), 2.0) / 24.0) * s;} +inline float4 beta(float4 y, float4 s){ + return (y * fma(y , fma(y , fma(y , fma(25.0 , y ,- 64.0), 39.0) , 16.0), - 16.0) / 24.0)*s;} +inline float4 gamma(float4 y, float4 s){ + return (y * y * fma(y , fma(y , fma( - 50.0 , y, 126.0) ,- 70.0) ,- 30.0) / 24.0 + 1.0)*s;} +inline float4 delta(float4 y, float4 s){ + return (y * fma(y , fma(y, fma(y, fma(50.0, y, - 124.0), 66.0) , 16.0) , 16.0) / 24.0)*s;} +inline float4 eta(float4 y, float4 s){ + return (y * fma(y , fma(y , fma(y , fma(- 25.0 , y, 61.0), - 33.0), - 1.0), - 2.0) / 24.0)*s;} +inline float4 zeta(float4 y, float4 s){ + return (y * y * y * fma(y , fma(5.0 , y ,- 12.0) , 7.0) / 24.0)*s;} diff --git a/HySoP/hysop/tools/printer.py b/HySoP/hysop/tools/printer.py index 1d973844fb752c4d12ae09dbea26b7453f6d6429..d29e85438207437fd5b678d5b4ff4847d29d9059 100644 --- a/HySoP/hysop/tools/printer.py +++ b/HySoP/hysop/tools/printer.py @@ -93,13 +93,13 @@ class Printer(): evtk.imageToVTK(filename, pointData=self._build_vtk_dict()) print "not yet implemented" except IOError: - ## Standard output + ## Standard output f = open(filename, 'w') shape = self.fields[0].domain.discreteDomain.resolution step = self.fields[0].domain.discreteDomain.step if len(shape) == 2: - for i in xrange(shape[0]): - for j in xrange(shape[1]): + for i in xrange(shape[0]-1): + for j in xrange(shape[1]-1): f.write("{0:8.12} {1:8.12} ".format(i * step[0], j * step[1])) for field in self.fields: if field.vector: @@ -109,9 +109,9 @@ class Printer(): f.write("{0:8.12} ".format(field.discreteField[0][i, j])) f.write("\n") else: - for i in xrange(shape[0]): - for j in xrange(shape[1]): - for k in xrange(shape[2]): + for i in xrange(shape[0]-1): + for j in xrange(shape[1]-1): + for k in xrange(shape[2]-1): f.write("{0:8.12} {1:8.12} {2:8.12} ".format(i * step[0], j * step[1], k * step[2])) for field in self.fields: if field.vector: