diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04b08f506b621747f83bd4f18387f5f516dd1a1f..7203581db508b915940a0b2f4c467785ba75d215 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,6 +197,10 @@ if(WITH_FFTW)
     add_definitions(${FFTW_DEFINES})
 endif()
 
+if(WITH_LIB_CXX)
+    compile_with(Boost REQUIRED)  # Boost_* results feed the CXX_EXT_* variables set in the WITH_LIB_CXX block further down
+endif()
+
 if(WITH_EXTRAS)
   # Arnoldi solver needs zgeev, which means lapack
   compile_with(LAPACK)
@@ -317,7 +321,7 @@ endif()
 
 if(WITH_LIB_CXX)
   #C++ variables used by setup.py.in for swig
-  set(CMAKE_CXX_FLAGS                "${CMAKE_CXX_FLAGS} -W -Wall -Wextra -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter ${FFTW_COMPILE_FLAGS} -fPIC -std=c++11")
+  set(CMAKE_CXX_FLAGS                "${CMAKE_CXX_FLAGS} -W -Wall -Wextra -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter -Wno-unused-local-typedefs ${FFTW_COMPILE_FLAGS} -fPIC -std=c++11")  # NOTE(review): -Wno-unused-local-typedefs presumably silences warnings coming from Boost headers - confirm
   set(CMAKE_CXX_FLAGS_DEBUG          "${CMAKE_CXX_FLAGS_DEBUG}")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
   set(CMAKE_CXX_FLAGS_RELEASE        "${CMAKE_CXX_FLAGS_RELEASE}")
@@ -325,11 +329,14 @@ if(WITH_LIB_CXX)
   
   set(CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   set(CXX_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
-  set(CXX_EXT_INCLUDES ${PYTHON_INCLUDE_DIR} ${FFTW_INCLUDE_DIRS})
-  set(CXX_EXT_LIBS ${PYTHON_LIBRARIES} ${FFTW_LIBRARIES})
-  set(CXX_EXT_LIB_DIRS ${FFTW_LIBRARY_DIRS})
+  set(CXX_EXT_INCLUDES ${Boost_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIR} ${FFTW_INCLUDE_DIRS})  # Boost_INCLUDE_DIRS is FindBoost's result variable (Boost_INCLUDE_DIR is only its cache entry)
+  set(CXX_EXT_LIBS ${PYTHON_LIBRARIES} ${FFTW_LIBRARIES} ${Boost_LIBRARIES})
+  set(CXX_EXT_LIB_DIRS ${FFTW_LIBRARY_DIRS} ${Boost_LIBRARY_DIRS})
   set(CXX_EXTRA_DEFINES ${FFTW_DEFINES} -DHAS_EXTERN_TEMPLATES)
 
+  set(CMAKE_INCLUDE_SYSTEM_FLAG_C "-isystem ")  # flag CMake emits in front of SYSTEM include directories for C
+  set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem ")  # same for C++; NOTE(review): presumably meant to hide warnings from third-party headers - confirm
+
   #swig package name (lib name generated by swig)
   set(CPP_2_HYSOP "cpp2hysop")
 endif()
diff --git a/hysop/codegen/kernels/stretching.py b/hysop/codegen/kernels/stretching.py
index 128777239d66329f3b3815310452d6785a08872b..72ed5136420629c4b121bd27dff8f98df0c8d0c2 100644
--- a/hysop/codegen/kernels/stretching.py
+++ b/hysop/codegen/kernels/stretching.py
@@ -19,9 +19,10 @@ class CachedStretchingKernel(KernelCodeGenerator):
     def codegen_name(ftype,work_dim):
         return 'cached_stretching_{}_{}d'.format(ftype,work_dim)
     
-    def __init__(self, typegen, dim, order=2,
+    def __init__(self, typegen, dim, 
+                       device,context, 
+                       order=2,
                        ftype=None,
-                       device=None,context=None, 
                        known_vars = None,
                        symbolic_mode=True):
         
@@ -29,7 +30,8 @@ class CachedStretchingKernel(KernelCodeGenerator):
         ftype = ftype if ftype is not None else typegen.fbtype
         
         work_dim=3
-        kernel_reqs = self.build_requirements(typegen,work_dim, order, cached)
+        kernel_reqs = self.build_requirements(typegen=typegen,device=device,context=context,
+                work_dim=work_dim, order=order, cached=cached)
         kernel_args = self.gen_kernel_arguments(typegen, work_dim, ftype, kernel_reqs)
         
         name = CachedStretchingKernel.codegen_name(ftype, dim)
@@ -55,16 +57,16 @@ class CachedStretchingKernel(KernelCodeGenerator):
         return reduce(operator.mul, local_size+order, 1)*self.typegen.FLT_BYTES[self.ftype]
 
     
-    def build_requirements(self,typegen,work_dim,order,cached):
+    def build_requirements(self,typegen,device,context,work_dim,order,cached):
         reqs = WriteOnceDict()
         
         compute_id = ComputeIndexFunction(typegen=typegen, dim=work_dim, itype='int', wrap=False)
         reqs['compute_id'] = compute_id
 
-        mesh_base_struct = MeshBaseStruct(typegen=typegen, typedef='MeshBaseStruct_s')
+        mesh_base_struct = MeshBaseStruct(device=device,context=context,typegen=typegen, typedef='MeshBaseStruct_s')
         reqs['MeshBaseStruct'] = mesh_base_struct
 
-        mesh_info_struct = MeshInfoStruct(typegen=typegen, typedef='MeshInfoStruct_s')
+        mesh_info_struct = MeshInfoStruct(device=device, context=context, typegen=typegen, typedef='MeshInfoStruct_s')
         reqs['MeshInfoStruct'] = mesh_info_struct
 
         gradient = GradientFunction(typegen=typegen, dim=work_dim, order=order,
@@ -163,11 +165,24 @@ class CachedStretchingKernel(KernelCodeGenerator):
 
 
 if __name__ == '__main__':
-    
-    tg = OpenClTypeGen('float', 'dec')
-    ek = CachedStretchingKernel(typegen=tg, order=16, dim=1 ,ftype=tg.fbtype, 
-            known_vars=dict(local_size=(1024,1,1)))
-    ek.edit()
-    ek.test_compile()
-    #print ek
-    print
+    # Demo: build the generator on one OpenCL device, then call edit() and test_compile().
+    import pyopencl as cl
+
+    # Gather the devices of every installed OpenCL platform.
+    devices = []
+    for plat in cl.get_platforms():
+        devices += plat.get_devices()
+    if not devices:
+        raise SystemExit('No OpenCL device found.')
+
+    # Only one device is exercised, so create a single context instead of one
+    # per device, and pick it deterministically (first enumerated device).
+    dev = devices[0]
+    ctx = cl.Context([dev])
+
+    tg = OpenClTypeGen('float', 'dec')
+    ek = CachedStretchingKernel(typegen=tg, context=ctx, device=dev,
+            order=16, dim=1, ftype=tg.fbtype,
+            known_vars=dict(local_size=(1024,1,1)))
+    ek.edit()
+    ek.test_compile()
diff --git a/hysop/constants.py b/hysop/constants.py
index 5894a663591087c1a71e00828d9ccc7103acc216..3f9323cdccf16d799ecf9efb70974b5df9d9fddd 100755
--- a/hysop/constants.py
+++ b/hysop/constants.py
@@ -111,10 +111,10 @@ def debugdecorator(f):
 debug = debugdecorator
 
 # redefine profile decorator
-if __PROFILE__:
-    from memory_profiler import profile
-    prof = profile
-else:
-    def prof(f):
-        # Nothing ...
-        return f
+# NOTE(review): the memory_profiler-backed variant, formerly selected with
+# `if __PROFILE__:` (from memory_profiler import profile; prof = profile),
+# is disabled here, so `prof` is always the pass-through decorator below.
+# TODO confirm whether this is meant to be permanent.
+def prof(f):
+    # Pass-through: return the decorated function unchanged.
+    return f
diff --git a/hysop/gpu/static_gpu_particle_advection_dir.py b/hysop/gpu/static_gpu_particle_advection_dir.py
index d14e76332eb0064d65da3ecbce0abdbecb7dc665..33e3508f88cfd9d1daae19883a3cf83fac46ceec 100644
--- a/hysop/gpu/static_gpu_particle_advection_dir.py
+++ b/hysop/gpu/static_gpu_particle_advection_dir.py
@@ -44,6 +44,9 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
             'transpose_xz':[], 
             'transpose_zx':[], 
             'stretching':[], 
+            'advec':[],
+            'remesh':[],
+            'advec_remesh':[]
             }
 
         # Additional method and configuration checks
@@ -418,7 +421,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
         mesh_info = self._fields_mesh_info_var
         dt=0.1
 
-        nruns=16
+        nruns=4
         force_renew_cache=True
 
         (kernel_launcher, kernel_args, kernel_args_mapping, cached_bytes) = \
@@ -490,6 +493,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
             )
 
             evt = self._advec_and_remesh[nbc](*args, wait_for=wait_evts)
+            self.bench['advec_remesh'].append(evt)
             fg.events.append(evt)
             velocity.events.append(evt)
     
@@ -506,6 +510,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
             self._cl_mesh_info
         ])
         advec_evt = self._advec(*args,wait_for=velocity.events)
+        self.bench['advec'].append(advec_evt)
         velocity.events.append(advec_evt)
 
         for (fg,fp) in self.fields_on_part.iteritems():
@@ -518,6 +523,7 @@ class StaticGPUParticleAdvectionDir(GPUParticleAdvectionDir):
                 + [self._cl_mesh_info]
             )
             remesh_evt = self._remesh[nbc](*args, wait_for=[advec_evt])
+            self.bench['remesh'].append(remesh_evt)
             fg.events.append(remesh_evt)
 
         if self._has_stretching:
@@ -608,7 +614,7 @@ if __name__=='__main__':
     dim = 3
     GHOSTS    = 0 
     NSCALARS  = 0
-    f_resolution = (65,65,65)[:dim]
+    f_resolution = (513,513,129)[:dim]
     v_resolution = f_resolution
     #v_resolution = (33,33,33)[:dim]
     ghosts       = (GHOSTS,)*dim
@@ -750,4 +756,15 @@ if __name__=='__main__':
         
         simu.advance()
         i+=1
+    
+    from hysop.gpu.kernel_autotuner import OpenClKernelStatistics
+    # Merge per-direction events into fresh lists; loop on 'd' so the step counter 'i' survives for nruns.
+    bench = dict((k,list(v)) for k,v in A._advec_dir[0].bench.iteritems())
+    for d in xrange(1,dim):
+        for k,v in A._advec_dir[d].bench.iteritems():
+            bench[k] += v
+    for name,evts in bench.iteritems():
+        if len(evts)>=1:
+            print name, OpenClKernelStatistics(events=evts,nruns=i)
+