diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py
index 2bf3ca3775d74335c2ce375d105f5f6000317243..12c2e4b05074365f14dd0ea7eb026b03238fa70b 100644
--- a/hysop/codegen/kernels/directional_stretching.py
+++ b/hysop/codegen/kernels/directional_stretching.py
@@ -377,12 +377,14 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
         @contextmanager
         def _work_iterate_(i):
             try:
-                fval  = local_id[0] if i==0 else global_id.fval(i)
-                gsize = local_work() if i==0 else global_size[i]
                 if i==0:
-                    N      = '(({}+2*{}+{lwork}-1)/{lwork})*{Lx}'.format(compute_grid_size[i],cache_ghosts(),lwork=local_work(), Lx=local_size[0])
+                    fval  = '0'
+                    gsize = local_work()
+                    N      = '(({}+2*{}+{lwork}-1)/{lwork})*{lwork}'.format(compute_grid_size[i],cache_ghosts(),lwork=local_work())
                     ghosts = '({}-{})'.format(compute_grid_ghosts[i],cache_ghosts())
                 else:
+                    fval = global_id.fval(i)
+                    gsize = global_size[i]
                     N      = '{Sx}'.format(Sx=compute_grid_size[i])
                     ghosts = compute_grid_ghosts[i]
 
@@ -390,15 +392,22 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                 with s._for_('int {i}={fval}; {i}<{N}; {i}+={gsize}'.format(
                     i='kji'[i], fval=fval, gsize=gsize,N=N)) as ctx:
                     
-                    s.append('{} = {}+{};'.format(global_id[i], 'kji'[i], ghosts))
+                    if i==0:
+                        s.append('{} = {}+{}+{};'.format(global_id[i], 'kji'[i], local_id[0], ghosts))
+                    else:
+                        s.append('{} = {}+{};'.format(global_id[i], 'kji'[i], ghosts))
+                    
                     if i==0:
                         active.declare(s, init='(k < {}+2*{})'.format(
                             compute_grid_size[0],cache_ghosts()))
                     elif i==1:
                         first.declare(s)
+
                     yield ctx
-                    if i>0:
-                        s.barrier(_local=True,_global=True)
+
+                    #if i==1:
+                        #s.append('printf("OUT %i\\n", get_local_id(0));')
+                        #s.barrier(_local=True)
             except:
                 raise
         nested_loops = [_work_iterate_(i) for i in xrange(dim-1,-1,-1)]
@@ -463,6 +472,16 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                     winit += self.args[Wi][global_index()] + ','
                 uinit='({}{})({})'.format(ftype, work_dim, uinit[:-1])
                 winit='({}{})({})'.format(ftype, work_dim, winit[:-1])
+                       
+
+                #s.append('''printf("IN lid.x=%zu/%zu; gid.x=%zu/%zu, (i,j,k)=(%i,%i,%i), wi=(%zu,%zu,%zu)/(%zu,%zu,%zu), active=%d\\n", 
+                        #get_local_id(0),  get_local_size(0),
+                        #get_global_id(0), get_global_size(0),
+                        #i,j,k,
+                        #get_group_id(0), get_group_id(1), get_group_id(2), 
+                        #get_num_groups(0), get_num_groups(1), get_num_groups(2), 
+                        #(int)(active));''')
+                #s.barrier(_local=True)
 
                 s.jumpline()
                 s.append('{} {},{};'.format(U.ctype,U(),W()))
@@ -470,11 +489,11 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                     with s._if_('{}'.format(first())):
                         s.append('{} = {};'.format(U(), uinit))
                         s.append('{} = {};'.format(W(), winit))
+                        s.append('{} = false;'.format(first()))
                         if is_periodic:
                             with s._if_('{lid} < 2*{ghosts}'.format(lid=local_id[0],ghosts=cache_ghosts())):
                                 s.append('{} = {};'.format(Ul[local_id[0]], U()))
                                 s.append('{} = {};'.format(Wl[local_id[0]], W()))
-                        s.append('{} = false;'.format(first()))
                     with s._else_():
                         if is_periodic:
                             with s._if_('{} >= {}-{}'.format(global_id[0],compute_grid_size[0],cache_ghosts())):
@@ -498,7 +517,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                         s.append(code)
                 
 
-                s.barrier(_local=True,_global=True)
+                s.barrier(_local=True)
                 s.jumpline()
 
 
@@ -523,7 +542,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                         s.append('{} = {};'.format(Wr[_id], W()))
                 
                 
-                s.barrier(_local=True,_global=True)
+                s.barrier(_local=True)
                 s.jumpline()
             
                 rk_args={'dt': dt,
@@ -587,6 +606,15 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                         #code = 'printf(\"SKIP: gid=(%i,%i,%i), lid.x=%i, GID=%i, U.x=%f, W.x=%f, newWx=%f \\n\", gid.x,gid.y,gid.z,lid.x,GID,U.x,Wx[GID],W.x);'
                         #s.append(code)
                 
+                #s.append('''printf("BLK lid.x=%zu/%zu; gid.x=%zu/%zu, (i,j,k)=(%i,%i,%i), wi=(%zu,%zu,%zu)/(%zu,%zu,%zu), active=%d\\n", 
+                        #get_local_id(0),  get_local_size(0),
+                        #get_global_id(0), get_global_size(0),
+                        #i,j,k,
+                        #get_group_id(0), get_group_id(1), get_group_id(2), 
+                        #get_num_groups(0), get_num_groups(1), get_num_groups(2), 
+                        #(int)(active));''')
+                #s.barrier(_local=True)
+                
 
 if __name__ == '__main__':
     from hysop.gpu import cl
diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py
index c4cefb7d12c1ea58406525dda69efed20508f8a2..5a0f79a9fa8a49615c27f288d86ef85c0050b685 100644
--- a/hysop/codegen/kernels/tests/test_directional_stretching.py
+++ b/hysop/codegen/kernels/tests/test_directional_stretching.py
@@ -17,7 +17,7 @@ class TestDirectionalStretching(object):
         queue = cl.CommandQueue(typegen.context)
         ctx = typegen.context
 
-        grid_size = np.asarray([24,2,2])
+        grid_size = np.asarray([16,2,2])
         compute_grid_ghosts = np.asarray([3*4,0,0])
         compute_grid_size   = grid_size + 2*compute_grid_ghosts
 
@@ -123,8 +123,8 @@ class TestDirectionalStretching(object):
         Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size)
         Lx = min(Lx, grid_size[0])
 
-        cls.local_work_size = np.asarray([12,1,1])
-        cls.work_load       = np.asarray([1,2,1])
+        cls.local_work_size = np.asarray([8,1,1])
+        cls.work_load       = np.asarray([1,2,2])
         cls.inv_dx = inv_dx
         cls.dt = dtype(0.5)
 
@@ -385,11 +385,6 @@ class TestDirectionalStretching(object):
         kernel = prg.all_kernels()[0]
         kernel.set_args(*kernel_args)
         
-        kwgi = cl.kernel_work_group_info
-        print kernel.get_work_group_info(kwgi.COMPILE_WORK_GROUP_SIZE, self.typegen.device)
-        print kernel.get_work_group_info(kwgi.WORK_GROUP_SIZE, self.typegen.device)
-        print kernel.get_work_group_info(kwgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, self.typegen.device)
-
         print '\tCPU => GPU'
         for buf in velocity+vorticity+debug:
             src = host_init_buffers[buf]
@@ -411,11 +406,11 @@ class TestDirectionalStretching(object):
         queue.flush()
         queue.finish()
 
-        print 'READS:\n'
-        print host_buffers_gpu['dbg0'][view]
-        print '\nWRITES:\n'
-        print host_buffers_gpu['dbg1'][view]
-        print
+        #print 'DBG0:\n'
+        #print host_buffers_gpu['dbg0'][view]
+        #print '\nDBG1:\n'
+        #print host_buffers_gpu['dbg1'][view]
+        #print
         
         buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname]) 
                         for varname in vorticity]