diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py index 2bf3ca3775d74335c2ce375d105f5f6000317243..12c2e4b05074365f14dd0ea7eb026b03238fa70b 100644 --- a/hysop/codegen/kernels/directional_stretching.py +++ b/hysop/codegen/kernels/directional_stretching.py @@ -377,12 +377,14 @@ class DirectionalStretchingKernel(KernelCodeGenerator): @contextmanager def _work_iterate_(i): try: - fval = local_id[0] if i==0 else global_id.fval(i) - gsize = local_work() if i==0 else global_size[i] if i==0: - N = '(({}+2*{}+{lwork}-1)/{lwork})*{Lx}'.format(compute_grid_size[i],cache_ghosts(),lwork=local_work(), Lx=local_size[0]) + fval = '0' + gsize = local_work() + N = '(({}+2*{}+{lwork}-1)/{lwork})*{lwork}'.format(compute_grid_size[i],cache_ghosts(),lwork=local_work()) ghosts = '({}-{})'.format(compute_grid_ghosts[i],cache_ghosts()) else: + fval = global_id.fval(i) + gsize = global_size[i] N = '{Sx}'.format(Sx=compute_grid_size[i]) ghosts = compute_grid_ghosts[i] @@ -390,15 +392,22 @@ class DirectionalStretchingKernel(KernelCodeGenerator): with s._for_('int {i}={fval}; {i}<{N}; {i}+={gsize}'.format( i='kji'[i], fval=fval, gsize=gsize,N=N)) as ctx: - s.append('{} = {}+{};'.format(global_id[i], 'kji'[i], ghosts)) + if i==0: + s.append('{} = {}+{}+{};'.format(global_id[i], 'kji'[i], local_id[0], ghosts)) + else: + s.append('{} = {}+{};'.format(global_id[i], 'kji'[i], ghosts)) + if i==0: active.declare(s, init='(k < {}+2*{})'.format( compute_grid_size[0],cache_ghosts())) elif i==1: first.declare(s) + yield ctx - if i>0: - s.barrier(_local=True,_global=True) + + #if i==1: + #s.append('printf("OUT %i\\n", get_local_id(0));') + #s.barrier(_local=True) except: raise nested_loops = [_work_iterate_(i) for i in xrange(dim-1,-1,-1)] @@ -463,6 +472,16 @@ class DirectionalStretchingKernel(KernelCodeGenerator): winit += self.args[Wi][global_index()] + ',' uinit='({}{})({})'.format(ftype, work_dim, uinit[:-1]) winit='({}{})({})'.format(ftype, work_dim, winit[:-1]) + + + #s.append('''printf("IN lid.x=%zu/%zu; gid.x=%zu/%zu, (i,j,k)=(%i,%i,%i), wi=(%zu,%zu,%zu)/(%zu,%zu,%zu), active=%d\\n", + #get_local_id(0), get_local_size(0), + #get_global_id(0), get_global_size(0), + #i,j,k, + #get_group_id(0), get_group_id(1), get_group_id(2), + #get_num_groups(0), get_num_groups(1), get_num_groups(2), + #(int)(active));''') + #s.barrier(_local=True) s.jumpline() s.append('{} {},{};'.format(U.ctype,U(),W())) @@ -470,11 +489,11 @@ class DirectionalStretchingKernel(KernelCodeGenerator): with s._if_('{}'.format(first())): s.append('{} = {};'.format(U(), uinit)) s.append('{} = {};'.format(W(), winit)) + s.append('{} = false;'.format(first())) if is_periodic: with s._if_('{lid} < 2*{ghosts}'.format(lid=local_id[0],ghosts=cache_ghosts())): s.append('{} = {};'.format(Ul[local_id[0]], U())) s.append('{} = {};'.format(Wl[local_id[0]], W())) - s.append('{} = false;'.format(first())) with s._else_(): if is_periodic: with s._if_('{} >= {}-{}'.format(global_id[0],compute_grid_size[0],cache_ghosts())): @@ -498,7 +517,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator): s.append(code) - s.barrier(_local=True,_global=True) + s.barrier(_local=True) s.jumpline() @@ -523,7 +542,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator): s.append('{} = {};'.format(Wr[_id], W())) - s.barrier(_local=True,_global=True) + s.barrier(_local=True) s.jumpline() rk_args={'dt': dt, @@ -587,6 +606,15 @@ class DirectionalStretchingKernel(KernelCodeGenerator): #code = 'printf(\"SKIP: gid=(%i,%i,%i), lid.x=%i, GID=%i, U.x=%f, W.x=%f, newWx=%f \\n\", gid.x,gid.y,gid.z,lid.x,GID,U.x,Wx[GID],W.x);' #s.append(code) + #s.append('''printf("BLK lid.x=%zu/%zu; gid.x=%zu/%zu, (i,j,k)=(%i,%i,%i), wi=(%zu,%zu,%zu)/(%zu,%zu,%zu), active=%d\\n", + #get_local_id(0), get_local_size(0), + #get_global_id(0), get_global_size(0), + #i,j,k, + #get_group_id(0), get_group_id(1), get_group_id(2), + #get_num_groups(0), get_num_groups(1), get_num_groups(2), + #(int)(active));''') + #s.barrier(_local=True) + if __name__ == '__main__': from hysop.gpu import cl diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py index c4cefb7d12c1ea58406525dda69efed20508f8a2..5a0f79a9fa8a49615c27f288d86ef85c0050b685 100644 --- a/hysop/codegen/kernels/tests/test_directional_stretching.py +++ b/hysop/codegen/kernels/tests/test_directional_stretching.py @@ -17,7 +17,7 @@ class TestDirectionalStretching(object): queue = cl.CommandQueue(typegen.context) ctx = typegen.context - grid_size = np.asarray([24,2,2]) + grid_size = np.asarray([16,2,2]) compute_grid_ghosts = np.asarray([3*4,0,0]) compute_grid_size = grid_size + 2*compute_grid_ghosts @@ -123,8 +123,8 @@ class TestDirectionalStretching(object): Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size) Lx = min(Lx, grid_size[0]) - cls.local_work_size = np.asarray([12,1,1]) - cls.work_load = np.asarray([1,2,1]) + cls.local_work_size = np.asarray([8,1,1]) + cls.work_load = np.asarray([1,2,2]) cls.inv_dx = inv_dx cls.dt = dtype(0.5) @@ -385,11 +385,6 @@ class TestDirectionalStretching(object): kernel = prg.all_kernels()[0] kernel.set_args(*kernel_args) - kwgi = cl.kernel_work_group_info - print kernel.get_work_group_info(kwgi.COMPILE_WORK_GROUP_SIZE, self.typegen.device) - print kernel.get_work_group_info(kwgi.WORK_GROUP_SIZE, self.typegen.device) - print kernel.get_work_group_info(kwgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, self.typegen.device) - print '\tCPU => GPU' for buf in velocity+vorticity+debug: src = host_init_buffers[buf] @@ -411,11 +406,11 @@ class TestDirectionalStretching(object): queue.flush() queue.finish() - print 'READS:\n' - print host_buffers_gpu['dbg0'][view] - print '\nWRITES:\n' - print host_buffers_gpu['dbg1'][view] - print + #print 'DBG0:\n' + #print host_buffers_gpu['dbg0'][view] + #print '\nDBG1:\n' + #print host_buffers_gpu['dbg1'][view] + #print buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname]) for varname in vorticity]