diff --git a/hysop/codegen/base/function_codegen.py b/hysop/codegen/base/function_codegen.py
index 86bd915e068434de2f09c2b338df64d32e2587b4..8d5fb484694bb299eb2bfac0fdfcc98b21e57be5 100644
--- a/hysop/codegen/base/function_codegen.py
+++ b/hysop/codegen/base/function_codegen.py
@@ -74,7 +74,9 @@ class FunctionBase(object):
 
                 if isinstance(var, str):
                     sargs.append(var)
-                elif isinstance(var,arg.__class__):
+                elif isinstance(var,arg.__class__)                           \
+                    or (arg.__class__==CodegenVectorClBuiltin and arg.dim==1 \
+                            and isinstance(var, CodegenVariable)):
                     sargs.append(var.name)
                 else:
                     msg = 'Matched variable \'{}\' with argument \'{}\' but there was a type mismatch(\'{}\' is not an instance of \'{}\')!'.format(var.name, arg.name, var.__class__, arg.__class__)
diff --git a/hysop/codegen/functions/advection_rhs.py b/hysop/codegen/functions/advection_rhs.py
index d238bd9732b8546e0aa30101d42a459b73339988..d2b29d4747db3966a354d8c92936d07bbae743df 100644
--- a/hysop/codegen/functions/advection_rhs.py
+++ b/hysop/codegen/functions/advection_rhs.py
@@ -69,16 +69,15 @@ class DirectionalAdvectionRhsFunction(OpenClFunctionCodeGenerator):
         
         args['velocity'] = CodegenVariable('velocity', ftype, typegen, const=True, add_impl_const=True, 
                 storage=storage, ptr=True, restrict=restrict, nl=True)
-        args['X'] = CodegenVectorClBuiltin('X', ftype, nparticles, typegen, const=True, nl=True)
+        args['X'] = CodegenVectorClBuiltin('X', ftype, nparticles, typegen, add_impl_const=True, nl=True)
         
-        if not is_cached:
-            args['line_index']   = CodegenVariable('line_index',  itype, typegen, add_impl_const=True)
         if is_periodic and (not is_cached):
-            args['line_width']   = CodegenVariable('line_width',   itype, typegen, add_impl_const=True,nl=True)
+            args['line_width']   = CodegenVariable('line_width',   itype, typegen, add_impl_const=True)
+        args['line_offset']   = CodegenVariable('line_offset',  itype, typegen, add_impl_const=True,nl=True)
 
         args['rk_step'] = CodegenVariable('rk_step', itype, typegen, add_impl_const=True)
         args['inv_dx'] = CodegenVariable('inv_dx', ftype, typegen, add_impl_const=True, nl=True)
-        args['active'] = CodegenVariable('active','bool',typegen)
+        args['active'] = CodegenVariable('active','bool',typegen, add_impl_const=True)
         
         cached   = 'cached_' if is_cached else ''
         basename = 'advection_rhs_{}{}{}p'.format(cached, ftype[0], nparticles)
@@ -107,9 +106,9 @@ class DirectionalAdvectionRhsFunction(OpenClFunctionCodeGenerator):
         velocity = s.args['velocity']
         
         if not is_cached:
-            line_index = s.args['line_index']
-        if is_periodic:
-            line_width = s.args['line_width']
+            line_offset = s.args['line_offset']
+            if is_periodic:
+                line_width = s.args['line_width']
 
         inv_dx       = s.args['inv_dx']
         rk_step      = s.args['rk_step']
@@ -141,28 +140,23 @@ class DirectionalAdvectionRhsFunction(OpenClFunctionCodeGenerator):
                 s.jumpline()
 
                 with s._if_('{} == 0'.format(rk_step())):
-                    # if is_periodic:
-                        # s.append('{idx} = ({idx}+{size})%{size};'.format(idx=lidx(),size=line_width()))
                     if is_cached:
                         reads = [velocity[lidx[i] ] for i in xrange(nparticles)]
                     else:
-                        s.append('{} = {}+{};'.format(idx(), line_index(),lidx()))
+                        s.append('{} = {}+{};'.format(idx(), line_offset(),lidx()))
                         reads = [velocity[idx[i] ] for i in xrange(nparticles)]
                     init = '({})({})'.format(part_ftype,', '.join(reads))
                     s.append('{} = {};'.format(dX_dt(), init))
                 with s._else_():
-                    if is_periodic:
+                    if is_periodic and not is_cached:
                         s.append('{idx} = ({idx}+{size})%{size};'.format(idx=lidx(),size=line_width()))
-                    if is_cached:
-                        reads = [velocity[lidx[i] ] for i in xrange(nparticles)]
-                    else:
-                        s.append('{} = {}+{};'.format(idx(), line_index(),lidx()))
-                        reads = [velocity[idx[i] ] for i in xrange(nparticles)]
+                        s.append('{} = {}+{};'.format(idx(), line_offset(),lidx()))
+                    reads = [velocity[idx[i] ] for i in xrange(nparticles)]
                     init = '({})({})'.format(part_ftype,', '.join(reads))
                     Vl.declare(s,init=init)
-                    if is_periodic:
+                    if is_periodic and not is_cached:
                         s.append('{idx} = ({idx}+1)%{size};'.format(idx=lidx(),size=line_width()))
-                        s.append('{} = {}+{};'.format(idx(), line_index(),lidx()))
+                        s.append('{} = {}+{};'.format(idx(), line_offset(),lidx()))
                     else:
                         s.append('{} += 1;'.format(idx()))
                     Vr.declare(s,init=init)
diff --git a/hysop/codegen/functions/cache_load.py b/hysop/codegen/functions/cache_load.py
index d40d71ff159753f9982d2ffcbe9913948df6d1ab..c655967ecc7bab98aa07ac4836400497576fd91e 100644
--- a/hysop/codegen/functions/cache_load.py
+++ b/hysop/codegen/functions/cache_load.py
@@ -10,6 +10,7 @@ from hysop.codegen.base.utils      import ArgDict
 from hysop.codegen.base.statistics import WorkStatistics
 
 from hysop.codegen.functions.compute_index import ComputeIndexFunction
+from hysop.constants import BoundaryCondition
 
 class CacheLoadFunction(OpenClFunctionCodeGenerator):
     
@@ -22,22 +23,25 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
             'data_size':'size of source data'
             }
 
-    def __init__(self, typegen, ftype, dim,
+    def __init__(self, typegen, ftype, work_dim, boundary, 
             components=1, src_vectorize=True, dst_vectorize=True,
-            boundary=None, itype='int',
+            itype='int', with_gid_ghost_offset=True,
             known_args=None,force_symbolic=False):
 
-        assert dim>0
+        assert work_dim>0
         
-        if boundary not in [None, 'periodic']:
-            raise NotImplemented('Boundary \'{}\' not implmented yet!'.format(boundary))
+        assert boundary in BoundaryCondition.entries()
+        boundary  = BoundaryCondition.value(boundary)
+        sboundary = BoundaryCondition.svalue(boundary)
+        if boundary not in [BoundaryCondition.NONE, BoundaryCondition.PERIODIC]:
+            raise NotImplementedError('Boundary \'{}\' not implemented yet!'.format(sboundary.lower()))
         
         tg = typegen
         fs = force_symbolic
         vtype = tg.vtype(ftype,components)
-        name = 'cache_load_{}d'.format(dim)
-        if boundary is not None:
-            name+='_{}'.format(boundary)
+        name = 'cache_load_{}d'.format(work_dim)
+        if boundary != BoundaryCondition.NONE:
+            name+='_{}'.format(sboundary.lower())
         output = 'void'
         
         args = ArgDict()
@@ -58,19 +62,21 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
                 args[dst] = CodegenVariable(dst, ftype, typegen=tg, ptr=True, const=False, 
                         storage='__local' , nl=True, restrict=True)
 
-        args['global_id']  = CodegenVectorClBuiltin('gid',itype,dim,typegen,
+        args['global_id']  = CodegenVectorClBuiltin('gid',itype,work_dim,typegen,
                 add_impl_const=False)
-        args['local_id']   = CodegenVectorClBuiltin('lid',itype,dim,typegen,
+        args['local_id']   = CodegenVectorClBuiltin('lid',itype,work_dim,typegen,
                 add_impl_const=False, nl=True)
 
-        args['local_size'] = CodegenVectorClBuiltin('L',itype,dim,tg,
+        args['local_size'] = CodegenVectorClBuiltin('L',itype,work_dim,tg,
                 add_impl_const=True, symbolic_mode=fs)
-        args['src_size']   = CodegenVectorClBuiltin('N',itype,dim,tg,
+        args['src_size']   = CodegenVectorClBuiltin('N',itype,work_dim,tg,
                 add_impl_const=True, symbolic_mode=fs, nl=True)
 
-        args['lghosts'] = CodegenVectorClBuiltin('lghosts',itype,dim,tg,
+        args['multiplier'] = CodegenVectorClBuiltin('multiplier',itype,work_dim,tg,
                 add_impl_const=True, symbolic_mode=fs)
-        args['rghosts'] = CodegenVectorClBuiltin('rghosts',itype,dim,tg,
+        args['lghosts'] = CodegenVectorClBuiltin('lghosts',itype,work_dim,tg,
+                add_impl_const=True, symbolic_mode=fs)
+        args['rghosts'] = CodegenVectorClBuiltin('rghosts',itype,work_dim,tg,
                 add_impl_const=True, symbolic_mode=fs)
 
         super(CacheLoadFunction,self).__init__(basename=name, output=output,
@@ -80,10 +86,11 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
         self.itype   = itype
         self.ftype   = ftype
         self.vtype   = vtype
-        self.dim     = dim
+        self.work_dim     = work_dim
         self.components = components
         self.src_vectorize = src_vectorize
         self.dst_vectorize = dst_vectorize
+        self.with_gid_ghost_offset = with_gid_ghost_offset
 
         self.boundary = boundary
         self.declare_code()
@@ -91,7 +98,7 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
     def declare_code(self):
         s = self
         tg  = s.typegen
-        dim = s.dim
+        work_dim = s.work_dim
         itype = s.itype
         ftype = s.ftype
         vtype = s.vtype
@@ -99,6 +106,7 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
         src_vectorize = s.src_vectorize
         dst_vectorize = s.dst_vectorize
         boundary=s.boundary
+        with_gid_ghost_offset = self.with_gid_ghost_offset
         
         global_id  = s.args['global_id'];
         local_id   = s.args['local_id'];
@@ -108,29 +116,34 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
 
         lghosts    = s.args['lghosts'];
         rghosts    = s.args['rghosts'];
+        multiplier = s.args['multiplier'];
         
-        cache_size = CodegenVectorClBuiltin('S'  ,itype,dim,tg,const=True)
-        local_pos  = CodegenVectorClBuiltin('lidx',itype,dim,tg) 
-        global_pos = CodegenVectorClBuiltin('gidx',itype,dim,tg) 
+        cache_size = CodegenVectorClBuiltin('S'  ,itype,work_dim,tg,const=True)
+        local_pos  = CodegenVectorClBuiltin('lidx',itype,work_dim,tg) 
+        global_pos = CodegenVectorClBuiltin('gidx',itype,work_dim,tg) 
 
         LID = CodegenVariable('LID',itype, tg)
         GID = CodegenVariable('GID',itype, tg)
 
         tmp = CodegenVectorClBuiltin('tmp', ftype, components, tg)
                 
-        compute_index = ComputeIndexFunction(tg,dim,wrap=False)
+        compute_index = ComputeIndexFunction(tg,work_dim,wrap=False)
         s.require('compute_index',compute_index)
                 
         with s._function_():
             s.jumpline()
-            s.append(cache_size.declare(init='{}+{}+{}'.format(lghosts(),local_size(),rghosts())))
+            s.append(cache_size.declare(init='{}+{}*{}+{}'.format(lghosts(),multiplier(),
+                local_size(),rghosts())))
             s.jumpline()
             s.append(global_pos.declare())
             s.append(local_pos.declare())
             s.jumpline()
             
-            if boundary == 'periodic':
-                s.append('{} += {}-{};'.format(global_id(),src_size(),lghosts()))
+            if boundary == BoundaryCondition.PERIODIC:
+                if with_gid_ghost_offset:
+                    s.append('{} += {}-{};'.format(global_id(),src_size(),lghosts()))
+                else:
+                    s.append('{} += {};'.format(global_id(),src_size()))
             
             @contextmanager
             def _cache_iterate_(i):
@@ -142,7 +155,7 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
                         s.append('{} = ({}+k{i}*{});'.format(local_pos[i],local_id[i],
                             local_size[i],i=i));
                         s.append('if ({}>={}) continue;'.format(local_pos[i],cache_size[i]))
-                        if boundary=='periodic':
+                        if boundary==BoundaryCondition.PERIODIC:
                             s.append('{} = ({}+k{i}*{})%{};'.format(global_pos[i],global_id[i],
                                 local_size[i],src_size[i],i=i));
                         else:
@@ -152,15 +165,15 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
                 except:
                     pass
             
-            s.mem_fence(read=True,_local=True)
+            s.barrier(_local=True)
             with s._block_():
-                nested_loops = [_cache_iterate_(i) for i in xrange(dim-1,-1,-1)]
+                nested_loops = [_cache_iterate_(i) for i in xrange(work_dim-1,-1,-1)]
                 with contextlib.nested(*nested_loops):
                     with s._block_():
                         s.append(LID.declare(const=True,init=compute_index(idx=local_pos,
-                            size=cache_size[:dim])))
+                            size=cache_size[:work_dim])))
                         s.append(GID.declare(const=True,init=compute_index(idx=global_pos,
-                            size=src_size[:dim])))
+                            size=src_size[:work_dim])))
                         
                         if src_vectorize:
                             load=s.vars['src'][GID()]
@@ -178,14 +191,14 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
                                 s.append(store)
 
                         
-            s.mem_fence(write=True,_local=True)
+            s.barrier(_local=True)
         
         
     def per_work_statistics(self): 
         typegen = self.typegen
         itype = self.itype
         ftype = self.ftype
-        dim   = self.dim
+        work_dim   = self.work_dim
         components = self.components
     
         size  = typegen.FLT_BYTES[ftype]
@@ -193,8 +206,8 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
         writes = reads
 
         ops = {}
-        ops[itype]  = 2*dim
-        ops[itype] += 10*dim
+        ops[itype]  = 2*work_dim
+        ops[itype] += 10*work_dim
 
         stats = WorkStatistics()
         stats.ops_per_type = ops
@@ -209,12 +222,12 @@ class CacheLoadFunction(OpenClFunctionCodeGenerator):
 
            
 if __name__ == '__main__':
-    from hysop.codegen.base.test import test_typegen
+    from hysop.codegen.base.test import _test_typegen
 
-    tg = test_typegen('float')
-    cg = OpenClCodeGenerator(name='base',typegen=tg)
+    tg = _test_typegen('float')
+    cg = OpenClCodeGenerator(name='base', typegen=tg)
 
-    cache_load0 = CacheLoadFunction(typegen=tg,ftype=tg.fbtype,dim=3,
+    cache_load0 = CacheLoadFunction(typegen=tg,ftype=tg.fbtype,work_dim=3,boundary=BoundaryCondition.NONE,
             components=4, src_vectorize=False, dst_vectorize=True)
     print cache_load0.per_work_statistics()
 
diff --git a/hysop/codegen/kernels/directional_advection.py b/hysop/codegen/kernels/directional_advection.py
index c2f36e6fdfcd0c3b1e2ea7b8eb0ac768bd25bb73..f5a9fae29398722c39c9618998a48e2efe831a3d 100644
--- a/hysop/codegen/kernels/directional_advection.py
+++ b/hysop/codegen/kernels/directional_advection.py
@@ -201,6 +201,11 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
 
         mesh_info_struct = MeshInfoStruct(typegen=typegen, typedef='MeshInfo_s')
         reqs['MeshInfoStruct'] = mesh_info_struct
+    
+        cache_load = CacheLoadFunction(typegen=tg,ftype=ftype,work_dim=1,
+            components=1, src_vectorize=False, dst_vectorize=False,
+            boundary=boundary, with_gid_ghost_offset=False)
+        reqs['cache_load'] = cache_load
          
         advection_rhs = DirectionalAdvectionRhsFunction(typegen=typegen, work_dim=work_dim, 
                 ftype=ftype, is_cached=is_cached,
@@ -259,6 +264,7 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
         boundary   = s.boundary
         storage    = s.storage
         nparticles = s.nparticles
+        min_ghosts = s.min_ghosts
 
         symbolic_mode = s.symbolic_mode
 
@@ -296,21 +302,28 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
         s.update_vars(grid_size=grid_size, inv_dx=inv_dx, 
                 compute_grid_ghosts=compute_grid_ghosts, compute_grid_size=compute_grid_size)
 
-        # compute_index = self.reqs['compute_id']
-        runge_kutta   = self.reqs['runge_kutta']
+        cache_load   = self.reqs['cache_load']
+        runge_kutta  = self.reqs['runge_kutta']
+
+        line_index     = CodegenVariable(name='line_index', ctype=itype, typegen=tg)
+        line_offset    = CodegenVariable(name='line_offset', ctype=itype, typegen=tg,const=True)
+        line_velocity  = CodegenVariable(name='line_velocity', ctype=ftype, ptr=True, 
+                storage='__global', restrict=True, const=True, add_impl_const=True, typegen=tg)
 
-        line_index = CodegenVariable(name='line_index', ctype=itype, typegen=tg)
-        X          = CodegenVectorClBuiltin('X',       ftype, nparticles, typegen=tg)
-        pid        = CodegenVectorClBuiltin('pid',     itype, nparticles, typegen=tg)
-        poffset    = CodegenVectorClBuiltin('poffset', itype, nparticles, typegen=tg)
+        X           = CodegenVectorClBuiltin('X',       ftype, nparticles, typegen=tg)
+        pid         = CodegenVectorClBuiltin('pid',     itype, nparticles, typegen=tg, const=True)
+        poffset     = CodegenVectorClBuiltin('poffset', itype, nparticles, typegen=tg)
         
+        npart = CodegenVariable(name='npart', ctype=itype, typegen=tg, init=nparticles)
         if is_cached:
-            cache_ghosts = CodegenVariable('cache_ghosts', itype, typegen=tg, value=min_ghosts)
-            cache_width  = CodegenVariable('cache_width', itype, typegen=tg)
+            cache_ghosts = CodegenVariable('cache_ghosts', itype, typegen=tg, value=min_ghosts,
+                    symbolic_mode=symbolic_mode)
+            cache_width  = CodegenVariable('cache_width', itype, typegen=tg,
+                    init='{}*{} + 2*{}'.format(npart(),local_size[0],cache_ghosts()))
             if local_size_known:
-                local_size = s.known_vars['local_size']
+                L = s.known_vars['local_size']
                 Vc = CodegenArray(name='Vc',dim=1,ctype=ftype,typegen=tg,
-                            shape=(local_size.value[0]+2*min_ghosts,), storage='__local')
+                            shape=(nparticles*L[0]+2*min_ghosts,), storage='__local')
             else:
                 Vc = s.vars['Vc']
         
@@ -326,7 +339,7 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
                     fval   = 0
                     gsize  = 1
                     N      = '({Sx}+{npart}*{Lx}-1)/({npart}*{Lx})'.format(Sx=compute_grid_size[i],
-                            npart=nparticles,Lx=local_size[0])
+                            npart=npart(),Lx=local_size[0])
                 else:
                     fval = global_id.fval(i)
                     gsize = global_size[i]
@@ -348,6 +361,8 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
                         if work_dim>1:
                             init = '({}+{})*{}'.format(init, global_id[1],grid_size[0])
                         line_index.declare(s, const=True, init=init)
+                        line_velocity.declare(s, const=True, 
+                                init='{}+{}'.format(velocity(),line_index()))
 
                     yield ctx
             except:
@@ -372,16 +387,54 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
                 dx.declare(al,align=True)
             s.jumpline()
             
+            with s._align_() as al:
+                npart.declare(al, const=True, align=True)
+                if is_cached:
+                    cache_ghosts.declare(al,const=True, align=True)
+                    cache_width.declare(al,const=True, align=True)
+            s.jumpline()
+
+            if is_cached and local_size_known:
+                Vc.declare(s);
+                s.jumpline()
+            
             poffset.declare(s,init=range(nparticles),const=True)
             global_id.declare(s,init=False)
             s.jumpline()
                 
             with contextlib.nested(*nested_loops):
-                active.declare(s, init='({}*{}+{} < {})'.format(nparticles,global_id[0], nparticles-1, 
+                active.declare(s, init='({}*{}+{}-1 < {})'.format(npart(),global_id[0], npart(), 
                     compute_grid_size[0]))
                 s.jumpline()
                 
-                pid.declare(s, init='{}*{}+{}+{}'.format(nparticles,global_id[0],poffset(),compute_grid_ghosts[0]))
+                if is_cached:
+                    line_offset.declare(s,init='{}*{}*{}-{}'.format('k',local_size[0],npart(),
+                        cache_ghosts()))
+                else:
+                    line_offset.declare(s,init=0)
+
+                pid.declare(s, init='{}*{}+{}+{}'.format(npart(),global_id[0],poffset(),
+                    compute_grid_ghosts[0]))
+                s.jumpline()
+                
+                if is_cached:
+                    cid = CodegenVariable(name='cid',ctype=itype,typegen=tg,const=True)
+                    cid.declare(s,init='{} + {};'.format(line_offset(), local_id[0]))
+                    kargs = {
+                            'src0': line_velocity,
+                            'dst0': Vc,
+                            'src_size': global_size[0],
+                            'local_size': grid_size[0],
+                            'local_id': local_id[0],
+                            'global_id': cid,
+                            'lghosts': cache_ghosts,
+                            'rghosts': cache_ghosts,
+                            'multiplier': npart,
+                            }
+                    call = cache_load(**kargs)
+                    s.append(call+';')
+                    s.jumpline()
+                
                 X.declare(s, init='convert_{}({})*{}'.format(part_ftype,pid(),dx()));
 
                 rk_args = {
@@ -389,10 +442,12 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
                         'dt': dt,
                         'active': active,
                         'inv_dx': inv_dx,
-                        'velocity': velocity
+                        'line_offset': line_offset
                 }
-                if not is_cached:
-                    rk_args['line_index'] = line_index
+                if is_cached:
+                    rk_args['velocity']   = Vc
+                else:
+                    rk_args['velocity']   = line_velocity
                 if is_periodic:
                     rk_args['line_width'] = grid_size[0]
                 call = runge_kutta(**rk_args)
@@ -400,6 +455,7 @@ class DirectionalAdvectionKernel(KernelCodeGenerator):
                 s.append(code)
                 
                 s.append('{} += {};'.format(X(), xmin()))
+                s.jumpline()
                 
                 with s._if_(active()):
                     if nparticles>1:
@@ -432,10 +488,10 @@ if __name__ == '__main__':
         work_dim=work_dim,
         rk_scheme=ExplicitRungeKutta('RK2'),
         boundary=BoundaryCondition.PERIODIC,
-        is_cached=False,
-        min_ghosts=0,
+        is_cached=True,
+        min_ghosts=10,
         nparticles=4,
-        symbolic_mode=False,
+        symbolic_mode=True,
         known_vars=dict(
             mesh_info=mesh_info,
             local_size=local_size[:work_dim]
diff --git a/hysop/codegen/kernels/tests/test_directional_advection.py b/hysop/codegen/kernels/tests/test_directional_advection.py
new file mode 100644
index 0000000000000000000000000000000000000000..36daaf04eccb393b95db3773b0aa42397ff8cd5b
--- /dev/null
+++ b/hysop/codegen/kernels/tests/test_directional_advection.py
@@ -0,0 +1,419 @@
+
+import copy, math
+
+from hysop.gpu import cl
+from hysop.constants import np, BoundaryCondition
+from hysop.codegen.base.test import _test_mesh_info , _test_typegen
+from hysop.methods_keys import ExplicitRungeKutta
+from hysop.codegen.kernels.directional_advection import DirectionalAdvectionKernel
+
+class TestDirectionalAdvection(object):
+
+    @classmethod
+    def setup_class(cls,enable_error_plots=False):
+        typegen = _test_typegen('double','dec')
+        dtype = np.float64
+
+        queue = cl.CommandQueue(typegen.context)
+        ctx = typegen.context
+
+        grid_size = np.asarray([64,64,64])
+        (A,grid_mesh_info) = _test_mesh_info(typegen,3,0,grid_size)
+        
+        dx     = A['dx'][0][0]
+        inv_dx = A['inv_dx'][0][0]
+        dt     = 0.1
+        
+        umax = 10.0
+        min_ghosts = int(math.ceil(dt*umax/float(dx)))
+        ghosts = min_ghosts + 0
+
+        print 'min ghosts are: {}'.format(min_ghosts)
+        print 'actual ghosts are: {}'.format(ghosts)
+        
+        compute_grid_ghosts = np.asarray([ghosts,0,0])
+        compute_grid_size   = grid_size + 2*compute_grid_ghosts
+        (B,compute_grid_mesh_info) = _test_mesh_info(typegen,3,compute_grid_ghosts,compute_grid_size)
+        
+        grid_shape         = grid_size[::-1]
+        compute_grid_shape = compute_grid_size[::-1]
+
+        assert A['dx'][0] == B['dx'][0]
+
+        grid_bytes         = grid_size.size         * typegen.FLT_BYTES[typegen.fbtype]
+        compute_grid_bytes = compute_grid_size.size * typegen.FLT_BYTES[typegen.fbtype]
+
+        mf = cl.mem_flags
+
+        host_buffers_init = {
+                'no_ghosts': {
+                    'u':    umax * np.random.rand(*grid_shape).astype(dtype),
+                    'pos':  -1   * np.ones(shape=grid_shape, dtype=dtype),
+                    'dbg0': -1   * np.ones(shape=grid_shape, dtype=np.int32),
+                    'dbg1': -1   * np.ones(shape=grid_shape, dtype=np.int32),
+                },
+                'with_ghosts': {
+                    'u':    umax * np.random.rand(*compute_grid_shape).astype(dtype),
+                    'pos':  -1   * np.ones(shape=grid_shape, dtype=dtype),
+                    'dbg0': -1   * np.ones(shape=compute_grid_shape, dtype=np.int32),
+                    'dbg1': -1   * np.ones(shape=compute_grid_shape, dtype=np.int32),
+                }
+        }
+
+        device_buffers = {
+                'no_ghosts': {
+                    'u':   cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['u']),
+                    'pos':   cl.Buffer(ctx, flags=mf.WRITE_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['pos']),
+                    'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['dbg0']),
+                    'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['no_ghosts']['dbg1']),
+                },
+                'with_ghosts': {
+                    'u': cl.Buffer(ctx, flags=mf.READ_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['u']),
+                    'pos':   cl.Buffer(ctx, flags=mf.WRITE_ONLY  | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['pos']),
+                    'dbg0': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['dbg0']),
+                    'dbg1': cl.Buffer(ctx, flags=mf.READ_WRITE | mf.COPY_HOST_PTR, 
+								hostbuf=host_buffers_init['with_ghosts']['dbg1']),
+                }
+        }
+
+        host_buffers_reference = copy.deepcopy(host_buffers_init)
+        host_buffers_gpu       = copy.deepcopy(host_buffers_init)
+
+        cls.typegen = typegen
+        cls.queue   = queue
+
+        cls.grid_size  = grid_size
+        cls.grid_shape = grid_shape
+        cls.grid_bytes = grid_bytes
+
+        cls.grid_mesh_info         = grid_mesh_info
+        cls.compute_grid_mesh_info = compute_grid_mesh_info
+
+        cls.compute_grid_ghosts = compute_grid_ghosts
+        cls.compute_grid_size   = compute_grid_size
+        cls.compute_grid_shape  = compute_grid_shape
+        cls.compute_grid_bytes  = compute_grid_bytes
+
+        cls.host_buffers_init      = host_buffers_init
+        cls.host_buffers_reference = host_buffers_reference
+        cls.host_buffers_gpu       = host_buffers_gpu
+        cls.device_buffers         = device_buffers
+        
+        Lx = min(typegen.device.max_work_item_sizes[0], typegen.device.max_work_group_size)
+        Lx = min(Lx, grid_size[0])
+
+        cls.local_work_size = np.asarray([Lx,1,1])
+        cls.work_load       = np.asarray([1,1,1])
+
+        cls.dx     = dx
+        cls.inv_dx = inv_dx
+        cls.dt     = dtype(dt)
+        
+        cls.enable_error_plots = enable_error_plots
+
+
+    @classmethod
+    def teardown_class(cls):
+        pass
+    
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        pass
+
+    def _do_compute_cpu(self, rk_scheme, boundary):
+
+        dt = self.dt
+        dx = self.dx
+        ghosts = self.compute_grid_ghosts
+        grid_size   = self.grid_size
+        grid_ghosts = self.compute_grid_ghosts
+        compute_grid_size = self.compute_grid_size
+
+        is_periodic = False
+        if boundary   == BoundaryCondition.PERIODIC:
+            is_periodic = True
+            target = 'no_ghosts'
+            grid_ghosts = grid_ghosts*0
+            view = [slice(0,grid_size[2]),
+                    slice(0,grid_size[1]),
+                    slice(0,grid_size[0])]
+        elif boundary == BoundaryCondition.NONE:
+            target = 'with_ghosts'
+            view = [slice(ghosts[2],grid_size[2]+ghosts[2]),
+                    slice(ghosts[1],grid_size[1]+ghosts[1]),
+                    slice(ghosts[0],grid_size[0]+ghosts[0])]
+        else:
+            raise ValueError()
+        
+        host_init_buffers      = self.host_buffers_init[target] 
+        host_buffers_reference = self.host_buffers_reference[target]
+
+        velocity = host_init_buffers['u']
+        position = host_init_buffers['pos']
+        pos = np.zeros_like(position)
+        pos[:,:,:] = np.arange(grid_size[0]+grid_ghosts[0])[None,None,:]
+        pos*=dx
+
+        def interp_velocity(X):
+            Gx = grid_size[0]
+            X = X.copy() / dx
+            lidx  = np.floor(X).astype(np.int32)
+            alpha = X-lidx
+            if is_periodic:
+                lidx  =  (lidx+Gx)%Gx
+                ridx  =  (lidx+1)%Gx
+            else:
+                ridx = lidx+1
+            Vl = np.empty_like(X)
+            Vr = np.empty_like(X)
+            for i in xrange(*view[0].indices(grid_size[2])):
+                for j in xrange(*view[1].indices(grid_size[1])):
+                    Vl[i,j,:] = velocity[i,j,lidx[i,j,:]]
+                    Vr[i,j,:] = velocity[i,j,ridx[i,j,:]]
+            return Vl + alpha*(Vr-Vl)
+
+        if rk_scheme.name() == 'Euler':
+            pos += velocity[view]*dt
+        elif rk_scheme.name() == 'RK2':
+            X0 = pos.copy()
+            K0 = velocity[view]
+            X1 = X0 + 0.5*K0*dt;
+            K1 = interp_velocity(X1)
+            K  = K1;
+            pos = X0+K*dt
+        elif rk_scheme.name() == 'RK4':
+            X0 = pos.copy()
+            K0 = velocity[view]
+            X1 = X0 + 0.5*K0*dt;
+            K1 = interp_velocity(X1)
+            X2 = X0 + 0.5*K1*dt;
+            K2 = interp_velocity(X2)
+            X3 = X0 + 1.0*K2*dt;
+            K3 = interp_velocity(X3)
+            K  = 1./6*K0 + 2./6*K1 + 2./6*K2 + 1./6*K3
+            pos = X0+K*dt
+        else:
+            msg = 'Unknown Runge-Kutta scheme {}.'.format(rk_scheme)
+            raise ValueError(msg)
+        
+        host_buffers_reference['pos'] = pos
+    
+    def _do_compute_gpu_and_check(self, rk_scheme, boundary, cached, nparticles, work_dim):
+
+        msg = '\nTesting {}directional {}d advection with {} scheme and {} boundaries, {} particles at a time.'\
+            .format('cached ' if cached else '',
+                    work_dim,
+                    rk_scheme.name(),
+                    BoundaryCondition.svalue(boundary).lower(),
+                    nparticles)
+        print msg
+
+        dt = self.dt
+        work_size       = self.grid_size
+        work_load       = self.work_load
+        local_work_size = self.local_work_size
+        queue = self.queue
+
+        grid_size  = self.grid_size
+        grid_shape = self.grid_shape
+        ghosts     = self.compute_grid_ghosts
+        
+        assert grid_size[0] % nparticles == 0
+
+        kernel_args = [dt]
+        if boundary   == BoundaryCondition.PERIODIC:
+            target = 'no_ghosts'
+            mesh_info = self.grid_mesh_info
+            view = [slice(0,grid_size[2]),
+                    slice(0,grid_size[1]),
+                    slice(0,grid_size[0])]
+        elif boundary == BoundaryCondition.NONE:
+            target = 'with_ghosts'
+            mesh_info = self.compute_grid_mesh_info
+            view = [slice(ghosts[2],grid_size[2]+ghosts[2]),
+                    slice(ghosts[1],grid_size[1]+ghosts[1]),
+                    slice(ghosts[0],grid_size[0]+ghosts[0])]
+        else:
+            raise ValueError()
+
+        for i in xrange(work_dim,3):
+            view[2-i] = 0
+        
+        grid_size       = grid_size[:work_dim]
+        grid_shape      = grid_shape[:work_dim]
+        ghosts          = ghosts[:work_dim]
+
+        work_size       = work_size[:work_dim]
+        work_load       = work_load[:work_dim]
+        local_work_size = local_work_size[:work_dim]
+
+        known_vars = {
+            'local_size': local_work_size,
+            'mesh_info': mesh_info
+        }
+        
+        host_init_buffers      = self.host_buffers_init[target] 
+        host_buffers_reference = self.host_buffers_reference[target]
+        host_buffers_gpu       = self.host_buffers_gpu[target]
+        device_buffers         = self.device_buffers[target]
+
+        dak = DirectionalAdvectionKernel(
+            typegen=self.typegen, 
+            ftype=self.typegen.fbtype,
+            work_dim=work_dim, 
+            rk_scheme=rk_scheme,
+            is_cached=cached,
+            boundary=boundary,
+            nparticles = nparticles,
+            symbolic_mode=False,
+            debug_mode=True,
+            known_vars=known_vars)
+        
+        global_work_size = dak.get_global_size(work_size,local_work_size,work_load)
+        (static_shared_bytes, dynamic_shared_bytes, total_bytes) = \
+                dak.required_workgroup_cache_size(local_work_size)
+        
+        variables = ['u','pos']
+        debug     = ['dbg0', 'dbg1']
+        for varname in variables+debug:
+            kernel_args.append(device_buffers[varname])
+        if (dynamic_shared_bytes != 0):
+            shared_buffer = cl.LocalMemory(dynamic_shared_bytes)
+            kernel_args.append(shared_buffer)
+    
+        print '\tGenerating and compiling Kernel...'
+        source = dak.__str__()
+        prg = cl.Program(self.typegen.context, source)
+        prg.build(devices=[self.typegen.device])
+        kernel = prg.all_kernels()[0]
+        kernel.set_args(*kernel_args)
+        
+        print '\tCPU => GPU'
+        for buf in variables+debug:
+            src = host_init_buffers[buf]
+            dst = device_buffers[buf]
+            cl.enqueue_copy(queue,dst,src)
+        
+        print '\tKernel execution <<<{},{}>>>'.format(global_work_size,local_work_size)
+        evt = cl.enqueue_nd_range_kernel(queue, kernel, 
+                global_work_size.tolist(), local_work_size.tolist())
+        evt.wait()
+        
+        print '\tGPU => CPU'
+        for buf in variables+debug:
+            src = device_buffers[buf]
+            dst = host_buffers_gpu[buf]
+            cl.enqueue_copy(queue,dst,src)
+
+        print '\tSynchronize queue'
+        queue.flush()
+        queue.finish()
+
+        buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname]) 
+                        for varname in variables]
+        self._cmp_buffers(buffers,view,dak,work_dim)
+    
+    def _cmp_buffers(self,buffers,view,dak,work_dim):
+        good = True
+        err_buffers = []
+
+        for (name,host,dev) in buffers:
+            (l1,l2,linf) = self._distances(host,dev,view)
+            print '\t{} -> l1={}  l2={}  linf={}'.format(name,l1,l2,linf)
+            if l2>1e-8:
+                err_buffers.append(name)
+                good = False
+        if not good:
+            msg = '\n[FAIL] Buffer comparison failed for buffers {}.\n'.format(err_buffers)
+            print msg
+            dak.edit()
+            if self.enable_error_plots:
+                from matplotlib import pyplot as plt
+                for (name,host,dev) in buffers:
+                    if name in err_buffers:
+
+                        host = host[view]
+                        dev  = dev[view]
+
+                        d = (dev-host)*(dev-host)
+                        d -= np.mean(d)
+                    
+                        if work_dim==3:
+                            fig,axes = plt.subplots(2,2)
+                            axes[0][0].imshow(np.sum(d,axis=0),interpolation='nearest')
+                            axes[0][1].imshow(np.sum(d,axis=1),interpolation='nearest')
+                            axes[1][0].imshow(np.sum(d,axis=2),interpolation='nearest')
+                            axes[1][1].imshow(np.sum(d,axis=(0,1))[np.newaxis,:],interpolation='nearest')
+                            plt.title(name)
+                            fig.show()
+                            raw_input('Press enter to close all windows.')
+                            break
+                        elif work_dim==2:
+                            fig,axe = plt.subplots()
+                            axe.imshow(d,interpolation='nearest')
+                            plt.title(name)
+                            fig.show()
+                            raw_input('Press enter to close all windows.')
+                            break
+                        else:
+                            pass
+            raise RuntimeError(msg)
+
+    def _distances(self,lhs,rhs,view):
+        d    = rhs[view]-lhs[view]
+        da   = np.abs(d)
+
+        l1   = np.sum(da)/d.size
+        l2   = np.sqrt(np.sum(d*d))/d.size
+        linf = np.max(da)
+        return (l1,l2,linf)
+    
+    
+    def _check_kernels(self, rk_scheme):
+        assert isinstance(rk_scheme, ExplicitRungeKutta)
+
+        cached=[False]
+        boundaries=[BoundaryCondition.PERIODIC]
+        nparticles=[1,2,4,8,16]
+        work_dims=[1,2,3]
+        
+        for boundary in boundaries:
+            self._do_compute_cpu(boundary=boundary, rk_scheme=rk_scheme)
+            for work_dim in work_dims:
+                for cache in cached:
+                    for nparticle in nparticles:
+                        self._do_compute_gpu_and_check(boundary=boundary,rk_scheme=rk_scheme, 
+                                work_dim=work_dim, nparticles=nparticle, cached=cache)
+   
+
+    def test_advection_Euler(self):
+        rk_scheme=ExplicitRungeKutta('Euler')
+        self._check_kernels(rk_scheme=rk_scheme)
+    
+    def test_advection_RK2(self):
+        rk_scheme=ExplicitRungeKutta('RK2')
+        self._check_kernels(rk_scheme=rk_scheme)
+    
+    def test_advection_RK4(self):
+        rk_scheme=ExplicitRungeKutta('RK4')
+        self._check_kernels(rk_scheme=rk_scheme)
+
+if __name__ == '__main__':
+    TestDirectionalAdvection.setup_class(enable_error_plots=True)
+    test = TestDirectionalAdvection()
+    
+    test.test_advection_Euler()
+    test.test_advection_RK2()
+    test.test_advection_RK4()
+
+    TestDirectionalAdvection.teardown_class()
+
diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py
index 3b043e0a15348d41a39115faeb3efc123c7f45fa..85f3c89059af9eecd73bf8d37c0ec22a084e7b5b 100644
--- a/hysop/codegen/kernels/tests/test_directional_stretching.py
+++ b/hysop/codegen/kernels/tests/test_directional_stretching.py
@@ -487,66 +487,66 @@ class TestDirectionalStretching(object):
 
 
 
-    def test_gradUW_Euler(self):
+    def test_stretching_gradUW_Euler(self):
         formulation=StretchingFormulation.GRAD_UW
         rk_scheme=ExplicitRungeKutta('Euler')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_gradUW_T_Euler(self):
+    def test_stretching_gradUW_T_Euler(self):
         formulation=StretchingFormulation.GRAD_UW_T
         rk_scheme=ExplicitRungeKutta('Euler')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_mixed_gradUW_Euler(self):
+    def test_stretching_mixed_gradUW_Euler(self):
         formulation=StretchingFormulation.MIXED_GRAD_UW
         rk_scheme=ExplicitRungeKutta('Euler')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_conservative_Euler(self):
+    def test_stretching_conservative_Euler(self):
         formulation=StretchingFormulation.CONSERVATIVE
         rk_scheme=ExplicitRungeKutta('Euler')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
    
 
 
-    def test_gradUW_RK2(self):
+    def test_stretching_gradUW_RK2(self):
         formulation=StretchingFormulation.GRAD_UW
         rk_scheme=ExplicitRungeKutta('RK2')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_gradUW_T_RK2(self):
+    def test_stretching_gradUW_T_RK2(self):
         formulation=StretchingFormulation.GRAD_UW_T
         rk_scheme=ExplicitRungeKutta('RK2')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_mixed_gradUW_RK2(self):
+    def test_stretching_mixed_gradUW_RK2(self):
         formulation=StretchingFormulation.MIXED_GRAD_UW
         rk_scheme=ExplicitRungeKutta('RK2')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_conservative_RK2(self):
+    def test_stretching_conservative_RK2(self):
         formulation=StretchingFormulation.CONSERVATIVE
         rk_scheme=ExplicitRungeKutta('RK2')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
    
 
-    def test_gradUW_RK4(self):
+    def test_stretching_gradUW_RK4(self):
         formulation=StretchingFormulation.GRAD_UW
         rk_scheme=ExplicitRungeKutta('RK4')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_gradUW_T_RK4(self):
+    def test_stretching_gradUW_T_RK4(self):
         formulation=StretchingFormulation.GRAD_UW_T
         rk_scheme=ExplicitRungeKutta('RK4')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_mixed_gradUW_RK4(self):
+    def test_stretching_mixed_gradUW_RK4(self):
         formulation=StretchingFormulation.MIXED_GRAD_UW
         rk_scheme=ExplicitRungeKutta('RK4')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
     
-    def test_conservative_RK4(self):
+    def test_stretching_conservative_RK4(self):
         formulation=StretchingFormulation.CONSERVATIVE
         rk_scheme=ExplicitRungeKutta('RK4')
         self._check_kernels(formulation=formulation, rk_scheme=rk_scheme)
@@ -556,20 +556,20 @@ if __name__ == '__main__':
     TestDirectionalStretching.setup_class(do_extra_tests=False, enable_error_plots=True)
     test = TestDirectionalStretching()
     
-    test.test_gradUW_Euler()
-    test.test_gradUW_T_Euler()
-    test.test_mixed_gradUW_Euler()
-    test.test_conservative_Euler()
+    test.test_stretching_gradUW_Euler()
+    test.test_stretching_gradUW_T_Euler()
+    test.test_stretching_mixed_gradUW_Euler()
+    test.test_stretching_conservative_Euler()
     
-    test.test_gradUW_RK2()
-    test.test_gradUW_T_RK2()
-    test.test_mixed_gradUW_RK2()
-    test.test_conservative_RK2()
+    test.test_stretching_gradUW_RK2()
+    test.test_stretching_gradUW_T_RK2()
+    test.test_stretching_mixed_gradUW_RK2()
+    test.test_stretching_conservative_RK2()
     
-    test.test_gradUW_RK4()
-    test.test_gradUW_T_RK4()
-    test.test_mixed_gradUW_RK4()
-    test.test_conservative_RK4()
+    test.test_stretching_gradUW_RK4()
+    test.test_stretching_gradUW_T_RK4()
+    test.test_stretching_mixed_gradUW_RK4()
+    test.test_stretching_conservative_RK4()
 
     TestDirectionalStretching.teardown_class()