diff --git a/CMakeLists.txt b/CMakeLists.txt
index 273806eb0678b2c88d2268d321068e29433ae0e3..095c983cc1b98f676a263d91d501464cefdafaff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -142,7 +142,7 @@ find_python_module(gmpy2        REQUIRED)
 find_python_module(subprocess32 REQUIRED)
 find_python_module(editdistance REQUIRED)
 find_python_module(portalocker  REQUIRED)
-find_python_module(colors.py    REQUIRED)
+find_python_module(colors       REQUIRED) # color.py package
 find_python_module(tee          REQUIRED)
 find_python_module(primefac     REQUIRED)
 find_python_module(graph_tool   REQUIRED)
diff --git a/ci/docker_images/ubuntu/bionic/Dockerfile b/ci/docker_images/ubuntu/bionic/Dockerfile
index 6284b2a57bf41209663b8496e350e02c203d4f16..575d705383d3c09d5fa9cc9a74f9b9e07b1aa6b6 100644
--- a/ci/docker_images/ubuntu/bionic/Dockerfile
+++ b/ci/docker_images/ubuntu/bionic/Dockerfile
@@ -146,8 +146,21 @@ RUN cd /tmp                                                           \
  && cd -                                                              \
  && rm -Rf /tmp/Oclgrind
 
+# clpeak
+RUN cd /tmp                                               \
+    && git clone https://github.com/krrishnarraj/clpeak   \
+    && cd clpeak/                                         \
+    && mkdir build                                        \
+    && cd build/                                          \
+    && cmake ..                                           \
+    && make                                               \
+    && mv clpeak /usr/local/bin/                          \
+    && cd -                                               \
+    && rm -Rf /tmp/clpeak
+
 # clFFT
 RUN cd /tmp                                                           \
+ && ln -s /usr/local/lib /usr/local/lib64                             \
  && git clone https://github.com/clMathLibraries/clFFT                \
  && cd clFFT                                                          \
  && cd src                                                            \
@@ -180,7 +193,9 @@ RUN cd /tmp                                       \
  && make install                                  \
  && cd -                                          \
  && rm -Rf /tmp/graph-tool-2.26
- 
+
+# ensure all libraries are known by the runtime linker
+RUN ldconfig
 
 # clean cached packages
 RUN rm -rf /var/lib/apt/lists/*
diff --git a/examples/example_utils.py b/examples/example_utils.py
index b1b143088d62b3d2bf5b2815141b8bb834bd7f75..230667fff4deb041aabdb720ffe684a7acaf5d44 100644
--- a/examples/example_utils.py
+++ b/examples/example_utils.py
@@ -174,9 +174,9 @@ class HysopArgParser(argparse.ArgumentParser):
 
         with self.redirect_stdout(rank, args):#, self.redirect_stderr(rank, args):
             from hysop.tools.contexts import printoptions
-            with printoptions(threshold=10000, linewidth=240, 
+            with printoptions(threshold=10000, linewidth=2000, 
                               nanstr='nan', infstr='inf', 
-                              formatter={'float': lambda x: '{:>6.2f}'.format(x)}):
+                              formatter={'float': lambda x: '{:>8.4f}'.format(x)}):
                 program(args, **kwds)
 
     @staticmethod
diff --git a/examples/shear_layer/shear_layer.py b/examples/shear_layer/shear_layer.py
index 8693accf62e1b8ebc4f22d08469ede4d22069367..b463e5e8ab82885989b2a599419f1f1105df399c 100644
--- a/examples/shear_layer/shear_layer.py
+++ b/examples/shear_layer/shear_layer.py
@@ -161,15 +161,15 @@ def compute(args):
     dfields = problem.input_discrete_fields
     dfields[vorti].initialize(formula=init_vorticity)
 
-    from hysop.tools.debug_utils import ImshowDebugger
-    dbg = ImshowDebugger(data={'Wz':(dfields[vorti],0)}, 
-            ntimes=1,
-            enable_on_op_apply=False)
-    dbg.synchronize_queue(cl_env.default_queue)
-    dbg('initialization')
+    #from hysop.tools.debug_utils import ImshowDebugger
+    #dbg = ImshowDebugger(data={'Wz':(dfields[vorti],0)}, 
+            #ntimes=1,
+            #enable_on_op_apply=False)
+    #dbg.synchronize_queue(cl_env.default_queue)
+    #dbg('initialization')
 
     # Finally solve the problem 
-    problem.solve(simu, dry_run=args.dry_run, dbg=dbg)
+    problem.solve(simu, dry_run=args.dry_run)
     
     # Finalize
     problem.finalize()
diff --git a/hysop/backend/device/codegen/functions/directional_remesh.py b/hysop/backend/device/codegen/functions/directional_remesh.py
index 5c4d042a485d70b2ad149065f8951f2307d1a1aa..87f00dbe22f09bbc9ce00dc05b2b4b080113f8cf 100644
--- a/hysop/backend/device/codegen/functions/directional_remesh.py
+++ b/hysop/backend/device/codegen/functions/directional_remesh.py
@@ -26,6 +26,9 @@ class DirectionalRemeshFunction(OpenClFunctionCodeGenerator):
             use_short_circuit=None,
             known_args=None,
             debug_mode=False):
+
+            
+        debug_mode = True
         
         check_instance(sboundary,tuple,values=BoundaryCondition)
         check_instance(remesh_kernel, RemeshKernel)
@@ -216,11 +219,11 @@ class DirectionalRemeshFunction(OpenClFunctionCodeGenerator):
         
         printf_no_vector=True
         if printf_no_vector:
-            vnf = '[{}]'.format(', '.join('%2.2f' for _ in xrange(nparticles)))
+            vnf = '[{}]'.format(', '.join('%2.4f' for _ in xrange(nparticles)))
             vni = '[{}]'.format(', '.join('%i' for _ in xrange(nparticles)))
             expand_printf_vector = lambda x: ','.join(x[i] for i in xrange(nparticles))
         else:
-            vnf = '[%2.2{}f]'.format(self._printv(nparticles))
+            vnf = '[%2.4{}f]'.format(self._printv(nparticles))
             vni = '[%{}i]'.format(self._printv(nparticles))
             expand_printf_vector = lambda x: str(x)
         epv = expand_printf_vector
@@ -264,7 +267,8 @@ class DirectionalRemeshFunction(OpenClFunctionCodeGenerator):
                 eps.declare(s)
             s.jumpline()
             
-            #s.decl_aligned_vars(tst0, tst1)
+            if debug_mode:
+                s.decl_aligned_vars(tst0, tst1)
             s.decl_aligned_vars(rp, find, ind, y)
             vone.declare(s, const=True)
             s.decl_vars(*weights)
@@ -274,13 +278,14 @@ class DirectionalRemeshFunction(OpenClFunctionCodeGenerator):
                 s.append('{} = {}*{};'.format(rp, pos, inv_dx))
                 s.append('{} = fract({}, &{});'.format(y, rp, find))
                 s.append('{} = convert_{}_rtn({});'.format(ind, ivtype, find))
-                #s.append('{} = floor({rp});'.format(tst0, rp=rp))
-                #s.append('{} = {rp} - floor({rp});'.format(tst1, rp=rp))
                 if debug_mode:
-                    with self._ordered_wi_execution_(barrier=False):
-                        code = 'printf("%lu p={vnf}, rp={vnf}, floor(rp)={vnf}, rp-floor(rp)={vnf}, ind={vni}, y={vnf}, s0={vnf}, a=%f, b=%f, c=%f.\\n", {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});'.format(
+                    s.append('{} = floor({rp});'.format(tst0, rp=rp))
+                    s.append('{} = {rp} - floor({rp});'.format(tst1, rp=rp))
+                if debug_mode:
+                    #with self._ordered_wi_execution_(barrier=False):
+                    with self._if_('(get_group_id(1)==0) && (get_group_id(1)==0) && get_local_id(0)==35'):
+                        code = 'printf("%lu p={vnf}, p.invdx={vnf}, floor(p/dx)={vnf}, p/dx-floor(p/dx)={vnf}, ind={vni}, y={vnf}, s0={vnf}.\\n", {}, {}, {}, {}, {}, {}, {}, {});'.format(
                                 'get_local_id(0)', epv(pos), epv(rp), epv(tst0), epv(tst1), epv(ind), epv(y), epv(scalars[0]), 
-                                'floor(4.5f)', 'floor((float2)(1.5f,2.54f)).x', 'floor((float2)(1.5f,2.54f)).y',
                                 vnf=vnf, vni=vni)
                         s.append(code)
                 y.affect(s, init='{}-{}'.format(vone,y))
@@ -332,13 +337,14 @@ class DirectionalRemeshFunction(OpenClFunctionCodeGenerator):
                         val='{}*{}'.format(w,scalars[0])
                         cache_idx='{}+{}'.format(cache_ghosts, ind[0])
                         
-                        if poly_splitted: 
-                            printf = 'printf("BATCH {}: %lu wrote {vnf} at idx={vni} with Wl={vnf}, Wr={vnf}, W={vnf}, cond={vni}.\\n",{},{},{},{},{},{},{});'.format(
-                                    i, 'get_local_id(0)', val, ind, wl, wr, w, 'y<0.5f', vnf=vnf, vni=vni)
-                        else:
-                            printf = 'printf("BATCH {}: %lu wrote {vnf} at idx {vni} with W={vnf}.\\n",{},{},{},{});'.format(
-                                    i, 'get_local_id(0)', val, ind,  w, vni=vni, vnf=vnf)
-                        s.append(printf)
+                        with self._if_('(get_group_id(1)==0) && (get_group_id(1)==0) && get_local_id(0)==35'):
+                            if poly_splitted: 
+                                printf = 'printf("BATCH {}: %lu wrote {vnf} at idx={vni} with Wl={vnf}, Wr={vnf}, W={vnf}, cond={vni}.\\n",{},{},{},{},{},{},{});'.format(
+                                        i, 'get_local_id(0)', val, ind, wl, wr, w, 'y<0.5f', vnf=vnf, vni=vni)
+                            else:
+                                printf = 'printf("BATCH {}: %lu wrote {vnf} at idx {vni} with W={vnf}.\\n",{},{},{},{});'.format(
+                                        i, 'get_local_id(0)', val, ind,  w, vni=vni, vnf=vnf)
+                            s.append(printf)
                 if not use_atomics: 
                     s.barrier(_local=True)
             if use_atomics:
diff --git a/hysop/backend/device/codegen/kernels/directional_remesh.py b/hysop/backend/device/codegen/kernels/directional_remesh.py
index 1aa60d7498c0410ba3c59f76e63c8db34843e7a4..63a4d1f6bd565b6098168131e21a149b5f4672aa 100644
--- a/hysop/backend/device/codegen/kernels/directional_remesh.py
+++ b/hysop/backend/device/codegen/kernels/directional_remesh.py
@@ -717,7 +717,7 @@ class DirectionalRemeshKernelGenerator(KernelCodeGenerator):
                                        init=tg.dump(+0.0))
                     with s._else_():
                         end_id='{}+{}'.format(local_work, 'l')
-                        if debug_mode:
+                        if debug_mode or True:
                             s.append('printf("%i loaded back %2.2f from position %i.\\n", {}, {}, {});'.format(local_id[0], cached_scalars[0][0][end_id], end_id))
                         with s._align_() as al:
                             for csi in cached_scalars:
@@ -725,6 +725,7 @@ class DirectionalRemeshKernelGenerator(KernelCodeGenerator):
                                     csij.affect(al, align=True, 
                                         i='l',
                                         init=csij[end_id])
+                s.barrier(_local=True)
                 s.jumpline()
            
 
diff --git a/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py b/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py
index 52bc6db03550148556395f674b216dbbdb9d4675..8c5da4faf68293ebb7f0524862b39166de715784 100644
--- a/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py
+++ b/hysop/backend/device/opencl/autotunable_kernels/remesh_dir.py
@@ -269,8 +269,8 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel):
         precision     = extra_kwds['precision']
         force_atomics = extra_kwds['force_atomics']
 
-        nparticles_options  = [1,2,4,8,16]
-        use_atomics_options = [True] if force_atomics else [True, False]
+        nparticles_options  = [1,]#2,4,8,16]
+        use_atomics_options = [True,]# if force_atomics else [True, False]
         if True in use_atomics_options:
             cl_env = self.cl_env
             msg=None
@@ -326,9 +326,9 @@ class OpenClAutotunableDirectionalRemeshKernel(OpenClAutotunableKernel):
         cache_ghosts = extra_kwds['min_s_ghosts']
 
         min_wg_size = npw.ones(shape=work_bounds.work_dim, dtype=npw.int32) 
-        min_wg_size[0] = max(min_wg_size[0], 2*cache_ghosts)
+        min_wg_size[0] = 36
         max_wg_size = npw.ones(shape=work_bounds.work_dim, dtype=npw.int32)
-        max_wg_size[0] = max(global_work_size[0], min_wg_size[0])
+        max_wg_size[0] = 36
         return (min_wg_size, max_wg_size)
     
     def compute_global_work_size(self, local_work_size, work, extra_parameters, extra_kwds):
diff --git a/hysop/backend/device/opencl/operator/directional/advection_dir.py b/hysop/backend/device/opencl/operator/directional/advection_dir.py
index f93666b08a51588279cb5af887aacae111a92a2c..c9352d1afc697fc24b6674dff838adde7c6650a5 100644
--- a/hysop/backend/device/opencl/operator/directional/advection_dir.py
+++ b/hysop/backend/device/opencl/operator/directional/advection_dir.py
@@ -13,7 +13,7 @@ from hysop.backend.device.opencl.opencl_kernel_launcher import OpenClKernelListL
 class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOperator):
     
     DEBUG=False
-    GRAPHICAL_DEBUG=False
+    GRAPHICAL_DEBUG=True
     
     @debug
     def __init__(self, force_atomics=False, relax_min_particles=False, remesh_criteria_eps=None,
@@ -190,8 +190,11 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper
         elif self.GRAPHICAL_DEBUG:
             import numpy as np
             p    = self.dposition
+            v    = self.dvelocity
             sin  = self.dadvected_fields_in.values()[0]
             sout = self.dadvected_fields_out.values()[0]
+            
+            V = v[0].get()[v.compute_slices]
 
             self.advection_kernel_launcher(queue=queue, dt=dt).wait()
             
@@ -206,21 +209,27 @@ class OpenClDirectionalAdvection(DirectionalAdvectionBase, OpenClDirectionalOper
             S0 = np.sum(Sin, axis=1)
             S1 = np.sum(Sout, axis=1)
 
-            failed = np.where((S1-S0)>1e-1)[0]
+            failed = np.where((S1-S0)>1)[0]
             if failed.size:
-                for i in failed:
+                for i in failed[:1]:
                     print 'AT I={}'.format(i)
-                    slc = (i,slice(250,260))
+                    slc = (i,slice(None))
+                    print 'Vx'
+                    print V[slc]
                     print 'P'
-                    print P[slc]*512
+                    print P[slc]*P.shape[1]
                     print 'SIN'
                     print Sin[slc]
                     print 'SOUT'
                     print Sout[slc]
+                    print 'INDEXES'
+                    print np.arange(Sout[slc].size).astype(np.float32)
                     print 'SCALAR WAS NOT CONSERVED'
                     print '  Sin:  {}'.format(S0[i]/S0.size)
                     print '  Sout: {}'.format(S1[i]/S1.size)
                     print
+                print 'FAILED INDICES'
+                print failed
                 import sys
                 sys.exit(1)
         else: