diff --git a/hysop/__init__.py b/hysop/__init__.py
index 70947c5c950830d633be359024d4b34eb3e29ab1..1382493cb73a43792564a9e8e5cb607d52a42240 100644
--- a/hysop/__init__.py
+++ b/hysop/__init__.py
@@ -25,7 +25,7 @@ __PROFILE__        = "OFF" in ["0", "1"]
 __ENABLE_LONG_TESTS__ = "OFF" is "ON"
 
 # OpenCL
-__DEFAULT_PLATFORM_ID__ = 0
+__DEFAULT_PLATFORM_ID__ = 1
 __DEFAULT_DEVICE_ID__   = 0
 
 if __MPI_ENABLED__:
diff --git a/hysop/backend/device/codegen/functions/directional_remesh.py b/hysop/backend/device/codegen/functions/directional_remesh.py
index bf0614441a0db5de609e518f1ccb0505490879b1..051fdd6207050abd4720128dcbdb0ebbd41b291d 100644
--- a/hysop/backend/device/codegen/functions/directional_remesh.py
+++ b/hysop/backend/device/codegen/functions/directional_remesh.py
@@ -324,7 +324,9 @@ class DirectionalRemeshFunction(OpenClFunctionCodeGenerator):
                             printf = 'printf("BATCH {}: %lu wrote {vnf} at idx {vni} with W={vnf}.\\n",{},{},{},{});'.format(
                                     i, 'get_local_id(0)', val, ind,  w, vni=vni, vnf=vnf)
                         s.append(printf)
-                
+                if not use_atomics: 
+                    s.barrier(_local=True)
+            if use_atomics:
                 s.barrier(_local=True)
     
     # def per_work_statistics(self):
diff --git a/hysop/backend/device/codegen/kernels/directional_remesh.py b/hysop/backend/device/codegen/kernels/directional_remesh.py
index eb3a81d2f2e737992b0d92d09e93cf21c02423aa..def55d1f765a53fa3777dff64a5b66e24566fb87 100644
--- a/hysop/backend/device/codegen/kernels/directional_remesh.py
+++ b/hysop/backend/device/codegen/kernels/directional_remesh.py
@@ -539,26 +539,29 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
         # when compute_grid_size is not a multiple of nparticles, 
         # we need to go back to scalars load/store for the last particles.
         def if_last_thread_active(cond=None):
-            cond = active if (cond is None) else cond
+            cond = last_active if (cond is None) else cond
             with s._if_(cond):
                 with s._align_() as al:
                     for j in xrange(nparticles): 
                         cond = '({}+{}$ < {})'.format(particle_offset, j, compute_grid_size[0])
                         yield al, cond, j
 
-        if is_periodic:
-            def if_first_or_last_thread_active():
-                #cond = '{} || ({} && {} && ({}<{}))'.format(last_active, first, active, local_id[0], nparticles)
-                cond = None
-                return if_last_thread_active(cond)
-        else:
-            if_first_or_last_thread_active = if_last_thread_active
-       
         @contextlib.contextmanager
-        def if_thread_active():
-            with s._elif_('{}'.format(active)):
+        def if_thread_active(not_first=False):
+            if not_first:
+                cond = '(!{}) && {}'.format(first, active)
+            else:
+                cond = active
+            with s._elif_(cond):
                 yield
        
+        def if_first_or_last_thread_active():
+            if is_periodic and is_inplace and nparticles==1:
+                cond= '{} || ({} && ({}>=2*{}))'.format(last_active, first, local_id[0], cache_ghosts)
+            else:
+                cond= '{} || {}'.format(last_active, first)
+            return if_last_thread_active(cond)
+       
         with s._kernel_():
             s.jumpline()
             
@@ -603,24 +606,19 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                 kmax.declare(s)
                 last_particle.declare(s)
             with contextlib.nested(*nested_loops):
-                s.barrier(_local=True, _global=True)
                 
                 s.comment('Compute global offsets and line pointers')
                 with s._align_() as al:
                     position_global_id.affect(al, i=0, align=True, 
                             init='{} + {}'.format(particle_offset, position_grid_ghosts[0]))
-                    if is_inplace:
-                        for _gid, _ghosts in zip(scalars_in_global_id, scalars_in_grid_ghosts):
-                            _gid.affect(al, i=0, align=True, 
-                                init='{} + {} - {}'.format(particle_offset, _ghosts[0], cache_ghosts))
-                    else:
+                    if not is_inplace:
                         for _gid, _ghosts in zip(scalars_in_global_id, scalars_in_grid_ghosts):
                             _gid.affect(al, i=0, align=True, 
                                 init='{} + {}'.format(particle_offset, _ghosts[0]))
-                        for _gid, _ghosts in zip(scalars_out_global_id, scalars_out_grid_ghosts):
-                            _gid.affect(al, i=0, align=True, 
-                                init='{} + {} - {}'.format(particle_offset, _ghosts[0], 
-                                    cache_ghosts if not is_periodic else 0))
+                    for _gid, _ghosts in zip(scalars_out_global_id, scalars_out_grid_ghosts):
+                        _gid.affect(al, i=0, align=True, 
+                            init='{} + {} - {}'.format(particle_offset, _ghosts[0], 
+                                cache_ghosts if not is_periodic else 0))
                 s.jumpline()
                 s.decl_aligned_vars(*line_vars)
            
@@ -643,7 +641,6 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                                 cs.affect(al, align=True, 
                                     i='{}'.format(local_id[0]),
                                     init=cs[end_id])
-                s.barrier(_local=True, _global=True)
                 s.jumpline()
            
 
@@ -659,7 +656,6 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                     for cs in cached_scalars:
                         offset = '{}+2*{}'.format(local_offset, cache_ghosts)
                         s.append(s.vstore(nparticles, cs, offset, vzero))
-                s.barrier(_local=True, _global=True)
                 s.jumpline()
            
                 s.comment('Load position and scalars at current index, {} particles at a time.'.format(nparticles))
@@ -667,7 +663,7 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                     load = self.vload(1, line_position, '{}+{}'.format(particle_offset,j), align=True)
                     code = '{} $= ({} $? {} $: {});'.format(pos[j], active_cond, load, tg.dump(0.0))
                     al.append(code)
-                    if is_inplace:
+                    if is_inplace and not is_periodic:
                         _id = '{}+{}+{}'.format(particle_offset, cache_ghosts, j)
                     else:
                         _id = '{}+{}'.format(particle_offset, j)
@@ -680,14 +676,14 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                         pos.affect(al, align=True, 
                                 init=self.vload(nparticles, line_position, particle_offset, align=True))
 
-                        if is_inplace:
+                        if is_inplace and not is_periodic:
                             _id = '{}+{}'.format(particle_offset, cache_ghosts)
                         else:
                             _id = particle_offset
                         for si, line_sin in zip(scalars, line_scalars_in):
                             si.affect(al, align=True, 
                                     init=self.vload(nparticles, line_sin, _id,align=True))
-                s.barrier(_local=True, _global=True)
+                s.barrier(_local=True)
                 s.jumpline()
 
 
@@ -706,7 +702,6 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                     remesh_kargs['grid_size'] = compute_grid_size[0]
                 call = remesh(**remesh_kargs)
                 s.append('{};'.format(call))
-                s.barrier(_local=True, _global=True)
                 s.jumpline()
 
 
@@ -719,15 +714,20 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                         else:
                             _id = '{}+{}'.format(particle_offset, j)
                         store = self.vstore(1, line_sout, _id, load, align=True, jmp=True, suppress_semicolon=True)
-                        code = '{} $&& ({});'.format(active_cond, store)
+                        code = '{} $&& ({})'.format(active_cond, store)
+                        if debug_mode:
+                            code += ' && (printf("last_active %i wrote %2.2f from cache position %i to global id %i.\\n", {},{},{},{}));'.format(
+                                    local_id[0], load, '{}+{}'.format(local_offset,j), _id)
+                        else:
+                            code += ';'
                         al.append(code)
-                with if_thread_active():
+                with if_thread_active(not_first=True):
                     with s._align_() as al:
                         if debug_mode:
                             s.append('printf("%i wrote {vnf} from cache position %i to global id %i.\\n", {},{},{},{});'.format(
                                 local_id[0], 
                                 self.vload(nparticles, cached_scalars[0], local_offset),
-                                local_offset, particle_offset, 
+                                local_offset, particle_offset if not is_periodic else '({}+{}-{})%{}'.format(particle_offset, compute_grid_size[0], cache_ghosts, compute_grid_size[0]),
                                 vnf='[%2.2v{}f]'.format(nparticles) if nparticles>1 else '%2.2f'))
                         for sc, line_sout in zip(cached_scalars, line_scalars_out):
                             load  = self.vload(nparticles, sc, '{}'.format(local_offset)) 
@@ -737,10 +737,6 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                                 _id = particle_offset
                             store = self.vstore(nparticles, line_sout, _id, load, align=True, jmp=True)
                             al.append(store)
-                s.barrier(_local=True, _global=True)
-                    #if debug_mode:
-                        #dbg0.affect(i='position_GID', codegen=s, init=1)
-                        #dbg1.affect(i='position_GID', codegen=s, init=2)
                 
                 if is_periodic:
                     s.jumpline()
@@ -750,7 +746,6 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                             for sc, bsc in zip(cached_scalars, boundary_scalars):
                                 load  = sc[local_id[0]]
                                 store = bsc.affect(al, align=True, i=local_id[0], init=load)
-                s.barrier(_local=True, _global=True)
                 s.jumpline()
 
                 with s._if_('{} && ({} < 2*{})'.format(last, local_id[0], cache_ghosts)):
@@ -764,8 +759,9 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                                 code = '{} $= {}+{};'.format(line_sout[_gid], val0, val1)
                                 al.append(code)
                         if debug_mode:
-                            s.append('printf("%i initiated last write from cache position %i+%i=%i to global id %i+%i-%i+%i=%i.\\n", {}, {},{},{}, {},{},{},{},{});'.format(
+                            s.append('printf("%i initiated last write of value %2.2f + %2.2f = %2.2f from cache position %i+%i=%i to global id %i+%i-%i+%i=%i.\\n", {}, {},{},{}, {},{},{}, {},{},{},{},{});'.format(
                                 local_id[0], 
+                                val0, val1, '{}+{}'.format(val0,val1),
                                 last_particle, local_id[0], '{}+{}'.format(last_particle, local_id[0]),
                                 line_offset, last_particle, cache_ghosts, local_id[0],
                                 _gid))
@@ -782,7 +778,7 @@ class DirectionalRemeshKernel(KernelCodeGenerator):
                                     '{}+{}'.format(last_particle, local_id[0]),
                                     line_offset, last_particle, local_id[0],
                                     '{}+{}+{}'.format(line_offset, last_particle, local_id[0])))
-                s.barrier(_local=True, _global=True)
+                s.barrier(_local=True)
                         
 
 if __name__ == '__main__':
diff --git a/hysop/backend/device/codegen/kernels/tests/test_directional_remesh.py b/hysop/backend/device/codegen/kernels/tests/test_directional_remesh.py
index b40622a016041dec568093b2cabb30052edc7274..f151f06d4e9e54b601e8a2de1878f09d62e3e608 100644
--- a/hysop/backend/device/codegen/kernels/tests/test_directional_remesh.py
+++ b/hysop/backend/device/codegen/kernels/tests/test_directional_remesh.py
@@ -13,7 +13,7 @@ from hysop.tools.contexts import printoptions
 
 class TestDirectionalRemesh(object):
         
-    DEBUG = True
+    DEBUG = False
         
     @classmethod
     def setup_class(cls,
@@ -34,11 +34,11 @@ class TestDirectionalRemesh(object):
     def _initialize(self, cl_env, cfl):
         
         typegen = cl_env.typegen
-        dtype = cl_env.precision
-        eps = np.finfo(dtype).eps
+        dtype   = cl_env.precision
+        eps     = np.finfo(dtype).eps
         
         # compute grid
-        compute_grid_size = np.asarray([10,2,1])
+        compute_grid_size = np.asarray([97,7,11])
         compute_grid_shape = compute_grid_size[::-1]
         (A,compute_grid_mesh_info) = _test_mesh_info('base_mesh_info',typegen,3,0,compute_grid_size)
         
@@ -124,7 +124,7 @@ class TestDirectionalRemesh(object):
             
         ## allocate and initialize buffers
         random = self.random
-        random.seed(43)
+        random.seed(42)
 
         uinit   = (umax-umin) * random.rand(*compute_grid_shape).astype(dtype) + umin
         # force worst case on boundaries
@@ -213,7 +213,7 @@ class TestDirectionalRemesh(object):
         Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size)
         Lx = min(Lx, 1 << int(np.floor(np.log2(compute_grid_size[0]))))
         local_work_size = np.asarray([Lx,1,1])
-        work_load       = np.asarray([1,1,1])
+        work_load       = np.asarray([1,3,2])
 
         self.dx     = dx
         self.inv_dx = inv_dx
@@ -748,12 +748,12 @@ class TestDirectionalRemesh(object):
         local_work_size = local_work_size[:work_dim]
 
         views = {
-                'u':      view,
-                'pos':    view,
-                'S0_in':  S0_in_view,
-                'S1_in':  S1_in_view,
-                'S0_out': S0_out_view,
-                'S1_out': S1_out_view,
+            'u':      view,
+            'pos':    view,
+            'S0_in':  S0_in_view,
+            'S1_in':  S1_in_view,
+            'S0_out': S0_out_view,
+            'S1_out': S1_out_view,
         }
 
         known_vars = {
@@ -782,7 +782,7 @@ class TestDirectionalRemesh(object):
             debug_mode=self.DEBUG,
             known_vars=known_vars)
 
-        #drk.edit()
+        # drk.edit()
         
         global_work_size = drk.get_global_size(work_size, local_work_size, work_load)
         (static_shared_bytes, dynamic_shared_bytes, total_sharedbytes) = \
@@ -1038,8 +1038,8 @@ class TestDirectionalRemesh(object):
         assert cfl>0
         check_instance(rk_scheme,ExplicitRungeKutta)
         
-        boundaries=[BoundaryCondition.PERIODIC]
-        nscalars=[1]
+        nscalars=[1,2]
+        boundaries=[BoundaryCondition.NONE, BoundaryCondition.PERIODIC]
 
         if self.enable_extra_tests:
             kernels=[(2,1), (2,2), (4,2), (4,4), (6,4), (6,6), (8,4)]
@@ -1050,13 +1050,13 @@ class TestDirectionalRemesh(object):
             is_inplaces=[False, True]
             remesh_criterias=[None, 1000000]
         else:
-            kernels=[(6,4)]
-            nparticles=[2]
-            work_dims=[2]
-            split_polys=[False]
-            use_atomics=[True]
-            is_inplaces=[False]
-            remesh_criterias=[None]
+            kernels=[(2,1), (2,2), (4,2), (4,4), (6,4), (6,6), (8,4)]
+            nparticles=[1,2,4,8,16]
+            work_dims=[1,2,3]
+            split_polys=[False, True]
+            use_atomics=[False, True]
+            is_inplaces=[False, True]
+            remesh_criterias=[None, 1000000]
         
         ntests = 0
         for cl_env in iter_clenv(precision=Precision.FLOAT):
@@ -1088,14 +1088,14 @@ class TestDirectionalRemesh(object):
 
     
     @opencl_failed
-    def test_remesh_from_Euler_advection(self, cfl=1.0):
+    def test_remesh_from_Euler_advection_low_cfl(self, cfl=0.5788):
         rk_scheme=ExplicitRungeKutta('Euler')
         self._check_kernels(rk_scheme=rk_scheme, cfl=cfl)
     
-    #@opencl_failed
-    #def test_remesh_from_RK4_advection(self, cfl=0.5):
-        #rk_scheme=ExplicitRungeKutta('RK4')
-        #self._check_kernels(rk_scheme=rk_scheme, cfl=cfl)
+    @opencl_failed
+    def test_remesh_from_Euler_advection_high_cfl(self, cfl=3.78):
+        rk_scheme=ExplicitRungeKutta('Euler')
+        self._check_kernels(rk_scheme=rk_scheme, cfl=cfl)
 
 if __name__ == '__main__':
     if not __HAS_OPENCL_BACKEND__:
@@ -1108,7 +1108,8 @@ if __name__ == '__main__':
     
     with printoptions(linewidth=200, 
             formatter={'float':lambda x: '{:0.2f}'.format(x)}):
-        test.test_remesh_from_Euler_advection()
+        test.test_remesh_from_Euler_advection_low_cfl()
+        test.test_remesh_from_Euler_advection_high_cfl()
 
     TestDirectionalRemesh.teardown_class()