Skip to content
Snippets Groups Projects
Commit 0a7f9769 authored by Jean-Matthieu Etancelin's avatar Jean-Matthieu Etancelin Committed by Franck Pérignon
Browse files

add GPU diffusion operator. Fix gpu diffusion kernel

parent 6b820a30
No related branches found
No related tags found
No related merge requests found
......@@ -56,14 +56,14 @@ __kernel void diffusion(__global const float* scal_in,
/* // fill tile edges */
tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
}
#if CUT_DIR_Y == 1
tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + NB_X + gidZ*NB_X*2];
tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2];
tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2];
#else
tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
#endif
/* Synchronize work-group */
......@@ -107,14 +107,14 @@ __kernel void diffusion(__global const float* scal_in,
/* // fill tile edges */
tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
}
#if CUT_DIR_Y == 1
tile_XY[lidX][0] = (t_gidY*TILE_SIZE>=1)? scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + NB_X + gidZ*NB_X*2];
tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2];
tile_XY[lidX][TILE_SIZE+1] = ((t_gidY+1)*TILE_SIZE+1<NB_Y) ? scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y] : ghosts[gidX + gidZ*NB_X*2];
#else
tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
#endif
/* Synchronize work-group */
......@@ -147,4 +147,3 @@ __kernel void diffusion(__global const float* scal_in,
scal_out[gidX + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y] = s;
}
}
......@@ -44,10 +44,10 @@ __kernel void diffusion(__global const float* scal_in,
/* // fill tile edges */
tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
}
tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -90,10 +90,10 @@ __kernel void diffusion(__global const float* scal_in,
/* // fill tile edges */
tile_XY[0][lidY+i*L_WIDTH] = scal_in[((t_gidX*TILE_SIZE-1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+1+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[TILE_SIZE+1][lidY+i*L_WIDTH] = scal_in[(((t_gidX+1)*TILE_SIZE+NB_X)%NB_X) + (gidY+i*L_WIDTH)*NB_X + gidZ*NB_X*NB_Y];
}
tile_XY[lidX][0] = scal_in[gidX + ((t_gidY*TILE_SIZE-1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+1+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
tile_XY[lidX][TILE_SIZE+1] = scal_in[gidX + (((t_gidY+1)*TILE_SIZE+NB_Y)%NB_Y)*NB_X + gidZ*NB_X*NB_Y];
/* Synchronize work-group */
barrier(CLK_LOCAL_MEM_FENCE);
......
"""
@file gpu_diffusion.py
Diffusion on GPU
"""
from parmepy.constants import debug, np, S_DIR, PARMES_MPI_REAL, ORDERMPI
import parmepy.tools.numpywrappers as npw
from parmepy.operator.discrete.discrete import DiscreteOperator
from parmepy.gpu import cl
from parmepy.gpu.gpu_operator import GPUOperator
from parmepy.gpu.gpu_kernel import KernelLauncher
from parmepy.gpu.gpu_discrete import GPUDiscreteField
from parmepy.tools.profiler import FProfiler
from parmepy.mpi.main_var import MPI
class GPUDiffusion(DiscreteOperator, GPUOperator):
@debug
def __init__(self, vorticity, viscosity, vorticity_tmp=None,
platform_id=None, device_id=None, device_type=None, **kwds):
## Discretisation of the solution field
self.vorticity = vorticity
## Viscosity.
self.viscosity = viscosity
assert 'variables' not in kwds, 'variables parameter is useless.'
super(GPUDiffusion, self).__init__(variables=[vorticity],
**kwds)
self.input = [self.vorticity]
self.output = [self.vorticity]
self.direction = 0
self._cl_work_size = 0
GPUOperator.__init__(self, platform_id=platform_id,
device_id=device_id,
device_type=device_type,
**kwds)
## GPU allocation.
alloc = not isinstance(self.vorticity, GPUDiscreteField)
GPUDiscreteField.fromField(self.cl_env, self.vorticity,
self.gpu_precision, simple_layout=False)
if not self.vorticity.gpu_allocated:
self.vorticity.allocate()
if alloc:
self.size_global_alloc += self.vorticity.mem_size
if vorticity_tmp is None:
self.vorticity_tmp = self.cl_env.global_allocation(
self.vorticity.data[0])
else:
self.vorticity_tmp = vorticity_tmp
topo = self.vorticity.topology
self._cutdir_list = np.where(topo.cutdir)[0].tolist()
self._comm = topo.comm
self._comm_size = self._comm.Get_size()
self._comm_rank = self._comm.Get_rank()
if self._comm_size > 1:
self._to_send = [None] * self.dim
self._to_recv_buf = [None] * self.dim
self._to_recv = [None] * self.dim
self.mpi_type_diff_l = {}
self.mpi_type_diff_r = {}
self.profiler += FProfiler('comm_diffusion')
for d in self._cutdir_list:
shape = list(self.vorticity.data[0].shape)
shape_b = list(self.vorticity.data[0].shape)
start_l = [0, ] * 3
start_r = [0, ] * 3
start_r[d] = 1
shape[d] = 2
shape_b[d] = 1
# _to_send[..., 0] contains [..., 0] data
# _to_send[..., 1] contains [..., Nz-1] data
# _to_recv[..., 0] contains [..., Nz] data (right ghosts)
# _to_recv[..., 1] contains [..., -1] data (left ghosts)
self._to_send[d] = npw.zeros(tuple(shape))
self._to_recv[d] = npw.zeros(tuple(shape))
self.mpi_type_diff_l[d] = PARMES_MPI_REAL.Create_subarray(
shape, shape_b, start_l, order=ORDERMPI)
self.mpi_type_diff_l[d].Commit()
self.mpi_type_diff_r[d] = PARMES_MPI_REAL.Create_subarray(
shape, shape_b, start_r, order=ORDERMPI)
self.mpi_type_diff_r[d].Commit()
self._to_recv_buf[d] = cl.Buffer(
self.cl_env.ctx, cl.mem_flags.READ_WRITE,
size=self._to_recv[d].nbytes)
cl.enqueue_copy(
self.cl_env.queue,
self._to_recv_buf[d],
self._to_recv[d],
buffer_origin=(0, 0, 0),
host_origin=(0, 0, 0),
region=(self._to_recv[d][0, 0, 0].nbytes, )).wait()
self._cl_work_size += self._to_recv[d].nbytes
self._compute = self._compute_diffusion_comm
else:
self._compute = self._compute_diffusion
self._mesh_size = npw.ones(4, dtype=self.gpu_precision)
self._mesh_size[:self.dim] = self._reorderVect(topo.mesh.space_step)
shape = topo.mesh.resolution
resol = shape.copy()
self.resol_dir = npw.dim_ones((self.dim,))
self.resol_dir[:self.dim] = self._reorderVect(shape)
self._append_size_constants(resol)
src, tile_size, nb_part_per_wi, vec, f_space = \
self._kernel_cfg['diffusion']
build_options = self._size_constants
build_options += " -D TILE_SIZE=" + str(tile_size)
build_options += " -D NB_PART=" + str(nb_part_per_wi)
build_options += " -D L_WIDTH=" + str(tile_size / nb_part_per_wi)
for d in xrange(self.dim):
build_options += " -D CUT_DIR" + S_DIR[d] + "="
build_options += str(1 if topo.shape[d] > 1 else 0)
if self._comm_size > 1:
src = [s.replace('diffusion', 'comm_diffusion')
for s in src]
prg = self.cl_env.build_src(src, build_options, vec)
gwi, lwi = f_space(self.vorticity.data[0].shape,
nb_part_per_wi, tile_size)
self.num_diffusion = KernelLauncher(
prg.diffusion, self.cl_env.queue, gwi, lwi)
def _compute_diffusion(self, simulation):
wait_evt = self.vorticity.events
d_evt = self.num_diffusion(
self.vorticity.gpu_data[0],
self.vorticity_tmp,
self.gpu_precision(self.viscosity * simulation.timeStep),
self._mesh_size,
wait_for=wait_evt)
c_evt = cl.enqueue_copy(self.cl_env.queue, self.vorticity.gpu_data[0],
self.vorticity_tmp, wait_for=[d_evt])
self.vorticity.events.append(c_evt)
def _compute_diffusion_comm(self, simulation):
# Compute OpenCL transfer parameters
tc = MPI.Wtime()
topo = self.vorticity.topology
first_cut_dir = topo.cutdir.tolist().index(True)
wait_evt = []
for d in self._cutdir_list:
r_orig = [0, ] * self.dim
br_orig = [0, ] * self.dim
r_orig[d] = self.vorticity.data[0].shape[d] - 1
br_orig[d] = 1
r_orig = tuple(r_orig)
br_orig = tuple(br_orig)
l_sl = [slice(None), ] * 3
r_sl = [slice(None), ] * 3
l_sl[d] = slice(0, 1)
r_sl[d] = slice(1, 2)
l_sl = tuple(l_sl)
r_sl = tuple(r_sl)
region_size = list(self.vorticity.data[0].shape)
region_size[0] = self._to_send[d][:, 0, 0].nbytes
region_size[d] = 1
wait_events = self.vorticity.events
e_l = cl.enqueue_copy(self.cl_env.queue, self._to_send[d],
self.vorticity.gpu_data[0],
host_origin=(0, 0, 0),
buffer_origin=(0, 0, 0),
region=tuple(region_size),
wait_for=wait_events)
e_r = cl.enqueue_copy(self.cl_env.queue, self._to_send[d],
self.vorticity.gpu_data[0],
host_origin=br_orig, buffer_origin=r_orig,
region=tuple(region_size),
wait_for=wait_events)
# MPI send
r_send = []
r_recv = []
R_rk = topo.neighbours[1, d - first_cut_dir]
L_rk = topo.neighbours[0, d - first_cut_dir]
r_recv.append(self._comm.Irecv(
[self._to_recv[d], 1, self.mpi_type_diff_l[d]],
source=R_rk, tag=123 + R_rk))
r_recv.append(self._comm.Irecv(
[self._to_recv[d], 1, self.mpi_type_diff_r[d]],
source=L_rk, tag=456 + L_rk))
e_l.wait()
e_r.wait()
r_send.append(self._comm.Issend(
[self._to_send[d], 1, self.mpi_type_diff_l[d]],
dest=L_rk, tag=123 + self._comm_rank))
r_send.append(self._comm.Issend(
[self._to_send[d], 1, self.mpi_type_diff_r[d]],
dest=R_rk, tag=456 + self._comm_rank))
for r in r_send + r_recv:
r.Wait()
# _to_recv[..., 0] contains [..., Nz] data (right ghosts)
# _to_recv[..., 1] contains [..., -1] data (left ghosts)
wait_evt.append(cl.enqueue_copy(self.cl_env.queue,
self._to_recv_buf[d],
self._to_recv[d]))
self.profiler['comm_diffusion'] += MPI.Wtime() - tc
if len(self._cutdir_list) == 1:
d_evt = self.num_diffusion(
self.vorticity.gpu_data[0],
self._to_recv_buf[self._cutdir_list[0]],
self.vorticity_tmp,
self.gpu_precision(self.viscosity * simulation.timeStep),
self._mesh_size,
wait_for=wait_evt)
if len(self._cutdir_list) == 2:
d_evt = self.num_diffusion(
self.vorticity.gpu_data[0],
self._to_recv_buf[self._cutdir_list[0]],
self._to_recv_buf[self._cutdir_list[1]],
self.vorticity_tmp,
self.gpu_precision(self.viscosity * simulation.timeStep),
self._mesh_size,
wait_for=wait_evt)
c_evt = cl.enqueue_copy(self.cl_env.queue, self.vorticity.gpu_data[0],
self.vorticity_tmp, wait_for=[d_evt])
self.vorticity.events.append(c_evt)
def apply(self, simulation):
self._compute(simulation)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment