Commit c36236c0 authored by Jean-Matthieu Etancelin, committed by Franck Pérignon

Cleanup

parent 034ea5b3
@@ -167,6 +167,26 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
int(self._v_buff_width * PARMES_REAL(0.).nbytes),
int(self.v_resol_dir[1]),
int(self.v_resol_dir[2]))
self._v_block_size = 1024 * 1024 # 1MByte
while self._v_l_buff.nbytes % self._v_block_size != 0:
self._v_block_size /= 2
w = "WARNING: block size for pipelined GPU-to-GPU transfer is small, "
if self._v_block_size < 256 * 1024:
self._v_block_size = self._v_l_buff.nbytes / 4
print w + "use blocks of {0} MB (4 blocks velocity)".format(
self._v_block_size / (1024. * 1024.))
self._v_n_blocks = self._v_l_buff.nbytes / self._v_block_size
self._v_elem_block = np.prod(self._v_l_buff.shape) / self._v_n_blocks
self._l_recv_v = [None, ] * self._v_n_blocks
self._r_recv_v = [None, ] * self._v_n_blocks
self._send_to_l_v = [None, ] * self._v_n_blocks
self._send_to_r_v = [None, ] * self._v_n_blocks
self._evt_l_v = [None, ] * self._v_n_blocks
self._evt_r_v = [None, ] * self._v_n_blocks
self._v_block_slice = [None, ] * self._v_n_blocks
for b in xrange(self._v_n_blocks):
self._v_block_slice[b] = slice(
b * self._v_elem_block, (b + 1) * self._v_elem_block)
## Python remeshing formula for the multiscale interpolation
self._py_ms_formula = self.method[MultiScale]
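The lines added above prepare the pipelined GPU-to-GPU transfer of the velocity ghost layer: a block size is chosen (starting at 1 MB and halved until it divides the buffer evenly, with a fallback to a quarter of the buffer below 256 KB), and the flattened buffer is split into equal contiguous slices, one per block. A minimal numpy sketch of that partitioning, with illustrative sizes and names that are not from the source:

import numpy as np

# Illustrative sizes: a flattened ghost-layer buffer of 8 blocks of 512 elements.
n_blocks, elem_block = 8, 512
buff = np.arange(n_blocks * elem_block, dtype=np.float64)
block_slice = [slice(b * elem_block, (b + 1) * elem_block)
               for b in range(n_blocks)]

# The slices tile the flattened buffer exactly, so sending them one at a time
# transfers the whole layer without gaps or overlap.
assert all(buff[s].size == elem_block for s in block_slice)
assert np.array_equal(np.concatenate([buff[s] for s in block_slice]), buff)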
@@ -194,14 +214,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
cl.enqueue_copy(self.cl_env.queue,
self._cl_s_l_buff, _s_l_buff).wait()
self._cl_work_size += 2 * self._s_froml_buff_max.nbytes
self._s_buff_size = self._s_buff_width * \
self.resol_dir[1] * self.resol_dir[2]
self._s_pitches_host = (int(self._s_froml_buff_max[:, 0, 0].nbytes),
int(self._s_froml_buff_max[:, :, 0].nbytes))
self._s_buffer_region = (
int(self._s_buff_width * PARMES_REAL(0.).nbytes),
int(self.resol_dir[1]),
int(self.resol_dir[2]))
self._s_l_buff, evt = cl.enqueue_map_buffer(
self.cl_env.queue,
self._cl_s_l_buff,
@@ -224,51 +236,24 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
evt.wait()
self._s_froml_buff_flat = self._s_froml_buff_max.ravel(order='F')
self._s_fromr_buff_flat = self._s_fromr_buff_max.ravel(order='F')
#self._s_block_size = 1024 * 1024 # 1MByte
self._v_block_size = 1024 * 1024 # 1MByte
#while self._s_l_buff.nbytes % self._s_block_size != 0:
# self._s_block_size /= 2
while self._v_l_buff.nbytes % self._v_block_size != 0:
self._v_block_size /= 2
w = "WARNING: block size for pipelined GPU-to-GPU transfer is small, "
#if self._s_block_size < 256 * 1024:
# self._s_block_size = self._s_l_buff.nbytes / 4
# print w + "use blocks of {0} MB (4 blocks scalar)".format(
# self._s_block_size / (1024. * 1024.))
if self._v_block_size < 256 * 1024:
self._v_block_size = self._v_l_buff.nbytes / 4
print w + "use blocks of {0} MB (4 blocks velocity)".format(
self._v_block_size / (1024. * 1024.))
#self._s_n_blocks = self._s_l_buff.nbytes / self._s_block_size
self._v_n_blocks = self._v_l_buff.nbytes / self._v_block_size
#self._s_elem_block = np.prod(self._s_l_buff.shape) / self._s_n_blocks
self._v_elem_block = np.prod(self._v_l_buff.shape) / self._v_n_blocks
#print "MULTI-GPU Communications of size {0}MB, by {1} blocs of {2}MB ({3} width)".format(
# self._s_l_buff.nbytes / (1024. * 1024.),
# self._s_n_blocks,
# self._s_block_size / (1024. * 1024.),
# str((self._s_buff_width, self.resol_dir[1], self.resol_dir[2])))
#self._evt_get_l = [None, ] * self._s_n_blocks
#self._evt_get_r = [None, ] * self._s_n_blocks
#self._l_send = [None, ] * self._s_n_blocks
#self._r_send = [None, ] * self._s_n_blocks
#self._l_recv = [None, ] * self._s_n_blocks
#self._r_recv = [None, ] * self._s_n_blocks
self._l_recv_v = [None, ] * self._v_n_blocks
self._r_recv_v = [None, ] * self._v_n_blocks
self._send_to_l_v = [None, ] * self._v_n_blocks
self._send_to_r_v = [None, ] * self._v_n_blocks
self._evt_l_v = [None, ] * self._v_n_blocks
self._evt_r_v = [None, ] * self._v_n_blocks
#self._s_buff_block_slice = [None, ] * self._s_n_blocks
self._v_buff_block_slice = [None, ] * self._v_n_blocks
#for b in xrange(self._s_n_blocks):
# self._s_buff_block_slice[b] = slice(
# b * self._s_elem_block, (b + 1) * self._s_elem_block)
for b in xrange(self._v_n_blocks):
self._v_buff_block_slice[b] = slice(
b * self._v_elem_block, (b + 1) * self._v_elem_block)
# attributes declarations, values are recomputed at each time
self._s_buff_width_loc_p, self._s_buff_width_loc_m = 0, 0
self._s_buff_width_from_l, self._s_buff_width_from_r = 0, 0
self._s_froml_buff, self._s_locl_buff = None, None
self._s_fromr_buff, self._s_locr_buff = None, None
self._s_buffer_region_on_l, self._s_buffer_region_on_r = None, None
self._origin_locl, self._origin_locr = None, None
self._s_block_size_to_r, self._s_block_size_to_l = None, None
self._s_block_size_from_r, self._s_block_size_from_l = None, None
self._s_n_blocks_to_r, self._s_n_blocks_to_l = None, None
self._s_n_blocks_from_r, self._s_n_blocks_from_l = None, None
self._s_elem_block_to_r, self._s_elem_block_to_l = None, None
self._s_elem_block_from_r, self._s_elem_block_from_l = None, None
self._s_block_slice_to_r, self._s_block_slice_to_l = None, None
self._s_block_slice_from_r, self._s_block_slice_from_l = None, None
self._r_recv, self._l_recv = None, None
self._evt_get_l, self._evt_get_r = None, None
self._l_send, self._r_send = None, None
self._queue_comm_m = self.cl_env.create_other_queue()
self._queue_comm_p = self.cl_env.create_other_queue()
@@ -321,15 +306,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
self._build_exec_list()
def setup_gpu(self):
pass
# self._s_locl_buff = \
# self.fields_on_grid[0].host_data_pinned[0].reshape(
# self.resol_dir, order=ORDER)[:self._s_buff_width, :, :]
# self._s_locr_buff = \
# self.fields_on_grid[0].host_data_pinned[0].reshape(
# self.resol_dir, order=ORDER)[-self._s_buff_width:, :, :]
def _collect_kernels_cl_src_2k(self):
pass
@@ -481,21 +457,40 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
# Recompute blocks number and block size
self._s_block_size_to_r, self._s_n_blocks_to_r, \
self._s_elem_block_to_r, self._s_buff_block_slice_to_r = \
self._s_elem_block_to_r, self._s_block_slice_to_r = \
self._compute_block_number_and_size(
SIZEOF_PARMES_REAL * self._s_buff_width_loc_p *
self.resol_dir[1] * self.resol_dir[2])
self._s_block_size_to_l, self._s_n_blocks_to_l, \
self._s_elem_block_to_l, self._s_buff_block_slice_to_l = \
self._s_elem_block_to_l, self._s_block_slice_to_l = \
self._compute_block_number_and_size(
SIZEOF_PARMES_REAL * self._s_buff_width_loc_m *
self.resol_dir[1] * self.resol_dir[2])
self._s_block_size_from_r, self._s_n_blocks_from_r, \
self._s_elem_block_from_r, self._s_buff_block_slice_from_r = \
self._s_elem_block_from_r, self._s_block_slice_from_r = \
self._compute_block_number_and_size(self._s_fromr_buff.nbytes)
self._s_block_size_from_l, self._s_n_blocks_from_l, \
self._s_elem_block_from_l, self._s_buff_block_slice_from_l = \
self._s_elem_block_from_l, self._s_block_slice_from_l = \
self._compute_block_number_and_size(self._s_froml_buff.nbytes)
print "[" + str(self._comm_rank) + \
"] Multi-GPU comm: send to L=({0} MB, {1} bloc),".format(
self._s_block_size_to_l * self._s_n_blocks_to_l /
(1024. * 1024),
self._s_n_blocks_to_l) + \
" R=({0} MB, {1} bloc)".format(
self._s_block_size_to_r * self._s_n_blocks_to_r /
(1024. * 1024),
self._s_n_blocks_to_r,) + \
"; recv from L=({0} MB, {1} bloc),".format(
self._s_block_size_from_l * self._s_n_blocks_from_l /
(1024. * 1024),
self._s_n_blocks_from_l) + \
" R=({0} MB, {1} bloc)".format(
self._s_block_size_from_r * self._s_n_blocks_from_r /
(1024. * 1024),
self._s_n_blocks_from_r,)
# Events lists
self._r_recv = [None, ] * self._s_n_blocks_from_r
self._l_recv = [None, ] * self._s_n_blocks_from_l
self._evt_get_l = [None, ] * self._s_n_blocks_to_l
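self._compute_block_number_and_size is called above for each transfer direction, but its body is not part of this diff. Judging from the call sites (which unpack a block size, a block count, the number of elements per block and the matching flat-array slices) and from the sizing rule added in the first hunk, a hypothetical sketch could look as follows; the function name, the REAL_NBYTES stand-in and the thresholds are assumptions, not the project's actual implementation:

import numpy as np

REAL_NBYTES = np.float64(0.).nbytes  # stand-in for PARMES_REAL(0.).nbytes

def compute_block_number_and_size(nbytes, start=1024 * 1024, minimum=256 * 1024):
    # Halve a 1 MB block until it divides the buffer evenly; fall back to a
    # quarter of the buffer if the block would drop below the minimum.
    block_size = start
    while nbytes % block_size != 0:
        block_size //= 2
    if block_size < minimum:
        block_size = nbytes // 4
    n_blocks = max(nbytes // block_size, 1)
    elem_block = (nbytes // REAL_NBYTES) // n_blocks
    block_slices = [slice(b * elem_block, (b + 1) * elem_block)
                    for b in range(n_blocks)]
    return block_size, n_blocks, elem_block, block_slices

# Example: a 6 MB boundary buffer is split into six 1 MB blocks of
# 131072 double-precision elements each.
print(compute_block_number_and_size(6 * 1024 * 1024)[:3])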
@@ -564,20 +559,20 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
ctime = MPI.Wtime()
for b in xrange(self._v_n_blocks):
self._l_recv_v[b] = self._comm.Irecv(
[self._v_l_buff_flat[self._v_buff_block_slice[b]],
[self._v_l_buff_flat[self._v_block_slice[b]],
self._v_elem_block, PARMES_MPI_REAL],
source=self._L_rk, tag=17 + 19 * self._L_rk + 59 * b)
self._r_recv_v[b] = self._comm.Irecv(
[self._v_r_buff_flat[self._v_buff_block_slice[b]],
[self._v_r_buff_flat[self._v_block_slice[b]],
self._v_elem_block, PARMES_MPI_REAL],
source=self._R_rk, tag=29 + 23 * self._R_rk + 57 * b)
for b in xrange(self._v_n_blocks):
self._send_to_r_v[b] = self._comm.Issend(
[self._v_r_buff_loc_flat[self._v_buff_block_slice[b]],
[self._v_r_buff_loc_flat[self._v_block_slice[b]],
self._v_elem_block, PARMES_MPI_REAL],
dest=self._R_rk, tag=17 + 19 * self._comm_rank + 59 * b)
self._send_to_l_v[b] = self._comm.Issend(
[self._v_l_buff_loc_flat[self._v_buff_block_slice[b]],
[self._v_l_buff_loc_flat[self._v_block_slice[b]],
self._v_elem_block, PARMES_MPI_REAL],
dest=self._L_rk, tag=29 + 23 * self._comm_rank + 57 * b)
if CL_PROFILE:
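The hunk above only renames the slice attribute; the communication pattern is unchanged: every per-block receive is posted first, then the matching Issend, with tags that encode the sending rank and the block index so the left-going and right-going streams of blocks cannot be mixed up. A minimal mpi4py sketch of that pattern on a periodic 1-D ring of ranks (illustrative block count, sizes and float64 payload; e.g. run with mpirun -n 2):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
left, right = (rank - 1) % size, (rank + 1) % size

# Flattened send/receive buffers split into equal contiguous blocks.
n_blocks, elem_block = 4, 1024
send_l = np.full(n_blocks * elem_block, rank, dtype=np.float64)
send_r = np.full(n_blocks * elem_block, rank, dtype=np.float64)
recv_l = np.empty_like(send_l)
recv_r = np.empty_like(send_r)
block = [slice(b * elem_block, (b + 1) * elem_block) for b in range(n_blocks)]

reqs = []
for b in range(n_blocks):
    # Post all receives first; tags mirror the scheme used above
    # (sender rank and block index baked into the tag).
    reqs.append(comm.Irecv([recv_l[block[b]], MPI.DOUBLE],
                           source=left, tag=17 + 19 * left + 59 * b))
    reqs.append(comm.Irecv([recv_r[block[b]], MPI.DOUBLE],
                           source=right, tag=29 + 23 * right + 57 * b))
for b in range(n_blocks):
    # Then the matching synchronous-mode nonblocking sends, block by block.
    reqs.append(comm.Issend([send_r[block[b]], MPI.DOUBLE],
                            dest=right, tag=17 + 19 * rank + 59 * b))
    reqs.append(comm.Issend([send_l[block[b]], MPI.DOUBLE],
                            dest=left, tag=29 + 23 * rank + 57 * b))
MPI.Request.Waitall(reqs)
assert np.all(recv_l == left) and np.all(recv_r == right)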
@@ -779,12 +774,12 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
# Prepare the MPI receptions
for b in xrange(self._s_n_blocks_from_l):
self._l_recv[b] = self._comm.Irecv(
[self._s_froml_buff_flat[self._s_buff_block_slice_from_l[b]],
[self._s_froml_buff_flat[self._s_block_slice_from_l[b]],
self._s_elem_block_from_l, PARMES_MPI_REAL],
source=self._L_rk, tag=888 + self._L_rk + 19 * b)
for b in xrange(self._s_n_blocks_from_r):
self._r_recv[b] = self._comm.Irecv(
[self._s_fromr_buff_flat[self._s_buff_block_slice_from_r[b]],
[self._s_fromr_buff_flat[self._s_block_slice_from_r[b]],
self._s_elem_block_from_r, PARMES_MPI_REAL],
source=self._R_rk, tag=333 + self._R_rk + 17 * b)
@@ -809,7 +804,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
for b in xrange(self._s_n_blocks_to_l):
self._evt_get_l[b].wait()
self._l_send[b] = self._comm.Issend(
[self._s_l_buff[self._s_buff_block_slice_to_l[b]],
[self._s_l_buff[self._s_block_slice_to_l[b]],
self._s_elem_block_to_l, PARMES_MPI_REAL],
dest=self._L_rk, tag=333 + self._comm_rank + 17 * b)
ctime_send_l = MPI.Wtime() - ctime
@@ -834,14 +829,13 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
for b in xrange(self._s_n_blocks_to_r):
self._evt_get_r[b].wait()
self._r_send[b] = self._comm.Issend(
[self._s_r_buff[self._s_buff_block_slice_to_r[b]],
[self._s_r_buff[self._s_block_slice_to_r[b]],
self._s_elem_block_to_r, PARMES_MPI_REAL],
dest=self._R_rk, tag=888 + self._comm_rank + 19 * b)
ctime_send_r = MPI.Wtime() - ctime
# remesh in-domain particles and get left-right layer
evt = self._num_comm(wait_evts, dt)
self.evt_num_remesh = [evt]
evt_get_locl = cl.enqueue_copy(
self.cl_env.queue,
self.fields_on_grid[0].host_data_pinned[0],
@@ -852,7 +846,7 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
host_pitches=self._pitches_dev,
region=self._s_buffer_region_on_l,
is_blocking=False,
wait_for=self.evt_num_remesh)
wait_for=[evt])
evt_get_locr = cl.enqueue_copy(
self.cl_env.queue,
self.fields_on_grid[0].host_data_pinned[0],
@@ -863,10 +857,11 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
host_pitches=self._pitches_dev,
region=self._s_buffer_region_on_r,
is_blocking=False,
wait_for=self.evt_num_remesh)
wait_for=[evt])
ctime = MPI.Wtime()
# Wait MPI transfer of data from left, add them to local data and send back to device
# Wait MPI transfer of data from left, add them to local
# data and send back to device
for b in xrange(self._s_n_blocks_to_r):
self._r_send[b].Wait()
for b in xrange(self._s_n_blocks_from_l):
@@ -876,7 +871,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
calctime = MPI.Wtime()
self._s_locl_buff += self._s_froml_buff
print self._comm_rank, self._s_locl_buff.shape, self._s_froml_buff.shape
self.profiler['comm_calc_remesh'] += MPI.Wtime() - calctime
evt_set_locl = cl.enqueue_copy(
self.cl_env.queue,
@@ -889,7 +883,8 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
region=self._s_buffer_region_on_l,
is_blocking=False)
# Wait MPI transfer of data from right, add them to local data and send back to device
# Wait MPI transfer of data from right, add them to local
# data and send back to device
ctime = MPI.Wtime()
for b in xrange(self._s_n_blocks_to_l):
self._l_send[b].Wait()
@@ -899,7 +894,6 @@ class MultiGPUParticleAdvection(GPUParticleAdvection):
ctime_wait_r = MPI.Wtime() - ctime
calctime = MPI.Wtime()
self._s_locr_buff += self._s_fromr_buff
print self._comm_rank, self._s_locr_buff.shape, self._s_fromr_buff.shape
self.profiler['comm_calc_remesh'] += MPI.Wtime() - calctime
evt_set_locr = cl.enqueue_copy(
self.cl_env.queue,
......