From fbc55d8cf0f768857acbd4297ef58aa2be67ec44 Mon Sep 17 00:00:00 2001
From: Keck Jean-Baptiste <jean-baptiste.keck@imag.fr>
Date: Wed, 14 Dec 2016 01:06:04 +0100
Subject: [PATCH] debug stretching

---
 hysop/codegen/base/cl_extensions.py           |  2 +-
 hysop/codegen/functions/apply_stencil.py      | 45 ++++++----
 hysop/codegen/functions/stretching_rhs.py     | 67 +++++++++++----
 .../codegen/kernels/directional_stretching.py | 64 +++++++++++---
 .../tests/test_directional_stretching.py      | 83 ++++++++++---------
 5 files changed, 177 insertions(+), 84 deletions(-)

diff --git a/hysop/codegen/base/cl_extensions.py b/hysop/codegen/base/cl_extensions.py
index d4be3e809..bab1df11d 100644
--- a/hysop/codegen/base/cl_extensions.py
+++ b/hysop/codegen/base/cl_extensions.py
@@ -18,7 +18,7 @@ _cl_extension_custom_declarations = {
 
 from hysop.codegen.base.opencl_codegen import OpenClCodeGenerator
 from hysop.codegen.base.types          import OpenClTypeGen
-from hysop.codegen.base.test           import test_typegen
+from hysop.codegen.base.test           import _test_typegen
 
 class ClExtCodeGen(OpenClCodeGenerator):
     def __init__(self,ext_name):
diff --git a/hysop/codegen/functions/apply_stencil.py b/hysop/codegen/functions/apply_stencil.py
index 1252d7b9a..3074c1064 100644
--- a/hysop/codegen/functions/apply_stencil.py
+++ b/hysop/codegen/functions/apply_stencil.py
@@ -12,9 +12,11 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
     
     def __init__(self,typegen,stencil,ftype,
             components=1, vectorize=True,
+            extra_inputs=[],
             scalar_inputs=[],
             vector_inputs=['data'],
             op='{vinput0}[id]',
+            custom_id=None,
             vector_suffixes=None,
             data_storage='__local', restrict=True,
             multipliers={},
@@ -29,6 +31,8 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
         if (vector_suffixes is None):
             vector_suffixes = range(components)
 
+        has_custom_id = (custom_id is not None)
+
         args = ArgDict()
         for iname in vector_inputs:
             if vectorize:
@@ -41,8 +45,12 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
         for iname in scalar_inputs:
             name = iname
             args[name] = CodegenVariable(name, ftype, typegen, const=True, add_impl_const=True, storage=data_storage, ptr=True, restrict=restrict)
-        args['offset']  = CodegenVariable('offset', itype, typegen, add_impl_const=True)
-        args['stride']  = CodegenVectorClBuiltin('stride', itype, dim, typegen, add_impl_const=True,nl=True)
+        for arg in extra_inputs:
+            args[arg.name] = arg
+        
+        if not has_custom_id:
+            args['offset']  = CodegenVariable('offset', itype, typegen, add_impl_const=True)
+            args['stride']  = CodegenVectorClBuiltin('stride', itype, dim, typegen, add_impl_const=True,nl=True)
         for varname,vartype in multipliers.iteritems():
             if vartype=='ftype':
                 vartype=ftype
@@ -57,6 +65,7 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
         super(ApplyStencilFunction,self).__init__(basename=basename,
                 output=vtype,typegen=typegen,inline=True,
                 args=args, known_args=known_args)
+
         self.dim = dim
         self.itype = itype
         self.ftype = ftype
@@ -69,6 +78,8 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
         self.vector_inputs = vector_inputs
         self.vector_suffixes = vector_suffixes
         self.scalar_inputs = scalar_inputs
+        self.has_custom_id = has_custom_id
+        self.custom_id = custom_id
         self.op = op
 
         self.gencode()
@@ -88,7 +99,7 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
         components = self.components
         known_args = self.known_args
         cached     = self.is_cached()
-        
+
         ncoeffs    = self.stencil.non_zero_coefficients()
         nmul       = len(self.multipliers)
 
@@ -118,14 +129,13 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
         components = s.components
         vectorized = s.vectorized
         
+        has_custom_id = s.has_custom_id
+        
         vector_suffixes = s.vector_suffixes
 
         ftype = s.ftype
         vtype = s.vtype
         
-        offset = s.vars['offset']
-        stride = s.vars['stride']
-
         has_multiplier = ('multiplier' in s.vars.keys())
         if has_multiplier:
             multiplier = s.vars['multiplier']
@@ -151,15 +161,20 @@ class ApplyStencilFunction(OpenClFunctionCodeGenerator):
                         for j,vn in enumerate(self.scalar_inputs):
                             operands['sinput{}'.format(j)] = s.vars[vn]()
                         for (off,coeff) in self.stencil.iteritems():
-                            strided=''
-                            for i in xrange(dim): 
-                                if off[i]==0:
-                                    continue
-                                if stride.known() and stride.value[0] == 1:
-                                    strided+='{}'.format(tg.dump(off[i]))
-                                else:
-                                    strided+='{}*{}'.format(tg.dump(off[i]),stride[i])
-                            operands['id'] = '{}+{}'.format(offset(),strided)
+                            if not has_custom_id:
+                                offset = s.vars['offset']
+                                stride = s.vars['stride']
+                                strided=''
+                                for i in xrange(dim): 
+                                    if off[i]==0:
+                                        continue
+                                    if stride.known() and stride.value[0] == 1:
+                                        strided+='{}'.format(tg.dump(off[i]))
+                                    else:
+                                        strided+='{}*{}'.format(tg.dump(off[i]),stride[i])
+                                operands['id'] = '{}+{}'.format(offset(),strided)
+                            else:
+                                operands['id'] = self.custom_id.format(offset=tg.dump(off[0]))
                             code = '{} += {} $* {};'.format(_res,tg.dump(coeff),self.op.format(**operands))
                             al.append(code)
                         if vectorized:
diff --git a/hysop/codegen/functions/stretching_rhs.py b/hysop/codegen/functions/stretching_rhs.py
index 9593bab43..35812bcdc 100644
--- a/hysop/codegen/functions/stretching_rhs.py
+++ b/hysop/codegen/functions/stretching_rhs.py
@@ -6,6 +6,7 @@ from hysop.codegen.base.types     import OpenClTypeGen
 from hysop.codegen.base.utils     import WriteOnceDict, ArgDict
 from hysop.codegen.base.statistics import WorkStatistics
 from hysop.methods_keys import StretchingFormulation
+from hysop.constants    import BoundaryCondition
 
 from hysop.codegen.functions.apply_stencil import ApplyStencilFunction
 
@@ -23,20 +24,25 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
         'directions': 'XYZ'
     }
     
-    def __init__(self, typegen, dim, ftype, order, direction, formulation, cached,
+    def __init__(self, typegen, dim, ftype, order, direction, formulation, cached, boundary,
             restrict=True, vectorize_u=False,
             itype='int', 
             used_variables = _default_used_variables,
             known_args=None):
         
-        assert order>1 and order%2==0
         assert dim==3
         assert direction<dim
+        assert order>1 and order%2==0
         assert formulation in StretchingFormulation.entries()
+        assert boundary    in BoundaryCondition.entries()
         
         formulation  = StretchingFormulation.value(formulation)
         sformulation = StretchingFormulation.svalue(formulation)
-        is_conservative  = (formulation==StretchingFormulation.CONSERVATIVE)
+        boundary     = BoundaryCondition.value(boundary)
+        sboundary    = BoundaryCondition.svalue(boundary)
+
+        is_conservative = (formulation==StretchingFormulation.CONSERVATIVE)
+        is_periodic     = (boundary==BoundaryCondition.PERIODIC)
 
         if cached:
             storage='__local'
@@ -46,10 +52,10 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
         vtype = typegen.vtype(ftype,dim)
 
         (args,basename) = self.build_prototype(typegen,dim,itype,ftype,vtype,order,direction,cached,
-                restrict,storage,vectorize_u,used_variables,sformulation,is_conservative)
+                restrict,storage,vectorize_u,used_variables,sformulation,is_conservative,is_periodic)
 
-        reqs = self.build_requirements(typegen,dim,itype,ftype,vtype,order,direction,
-                restrict,storage,vectorize_u,used_variables,is_conservative)
+        reqs = self.build_requirements(typegen,dim,itype,ftype,vtype,order,direction,boundary,cached,
+                restrict,storage,vectorize_u,used_variables,is_conservative,is_periodic,args)
         
         super(DirectionalStretchingRhsFunction,self).__init__(basename=basename,
                 output=vtype,typegen=typegen,inline=True,
@@ -68,13 +74,15 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
         self.vectorize_u     = vectorize_u
         self.storage = storage
         self.is_conservative = is_conservative
+        self.is_periodic = is_periodic
         self.formulation=formulation
         self.cached=cached
+        self.boundary=boundary
         
         self.gencode()
 
     def build_prototype(self,typegen,dim,itype,ftype,vtype,order,direction,cached,
-            restrict,storage, vectorize_u,used_variables,sformulation,is_conservative):
+            restrict,storage, vectorize_u,used_variables,sformulation,is_conservative,is_periodic):
 
         U = used_variables['U']
         W = used_variables['W']
@@ -92,13 +100,17 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
                 Wd= '{}{}'.format(W,xyz[direction])
                 args[Wd] = CodegenVariable(Wd, ftype, typegen, const=False, add_impl_const=True, storage=storage, ptr=True, restrict=restrict, nl=True)
                 
-
         args[W]  = CodegenVectorClBuiltin(W, ftype, dim, typegen, add_impl_const=True)
         args['inv_dx'] = CodegenVariable('inv_dx', ftype, typegen, add_impl_const=True, nl=True)
 
         if is_conservative:
             args['rk_step'] = CodegenVariable('rk_step', itype, typegen, add_impl_const=True)
-        args['offset'] = CodegenVariable('offset', itype, typegen, add_impl_const=True,nl=True)
+        if is_periodic and (not cached):
+            args['base']   = CodegenVariable('base',  itype, typegen, add_impl_const=True,nl=True)
+            args['offset'] = CodegenVariable('offset', itype, typegen, add_impl_const=True,nl=True)
+            args['width']   = CodegenVariable('width',   itype, typegen, add_impl_const=True,nl=True)
+        else:
+            args['offset'] = CodegenVariable('offset', itype, typegen, add_impl_const=True,nl=True)
         if is_conservative:
             args['lidx']  = CodegenVariable('lidx', itype, typegen, add_impl_const=True)
             args['Lx']   = CodegenVariable('Lx',   itype, typegen, add_impl_const=True, nl=True)
@@ -115,8 +127,8 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
         stencil = Stencil(S.stencil, origin=S.Lx, order=S.order)
         return stencil
         
-    def build_requirements(self, typegen,dim,itype,ftype,vtype,order,direction,
-            restrict,storage,vectorize_u,used_variables,is_conservative):
+    def build_requirements(self, typegen,dim,itype,ftype,vtype,order,direction,boundary,cached,
+            restrict,storage,vectorize_u,used_variables,is_conservative,is_periodic,args):
 
         reqs = WriteOnceDict()
 
@@ -132,15 +144,32 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
             scalar_inputs = []
             op = '{vinput0}[{id}]'
 
-        apply_stencil = ApplyStencilFunction(typegen=typegen,stencil=self.build_stencil(order),
-                ftype=ftype, itype=itype, data_storage=storage, vectorize=vectorize_u,
+        if is_periodic and not cached:
+            base   = args['base']
+            offset = args['offset']
+            width  = args['width']
+            extra_inputs = [base,offset,width]
+            custom_id = '{}+({}+{}+{})%{}'.format(base(),width(),offset(),'{offset}',width())
+            known_args={}
+        else:
+            extra_inputs=[]
+            custom_id=None
+            known_args={'stride':1}
+
+        apply_stencil = ApplyStencilFunction(typegen=typegen,
+                stencil=self.build_stencil(order),
+                ftype=ftype, itype=itype, 
+                data_storage=storage, 
+                vectorize=vectorize_u,
                 components=dim,
+                extra_inputs  = extra_inputs,
                 scalar_inputs = scalar_inputs,
                 vector_inputs = vector_inputs, 
                 vector_suffixes = xyz,
                 op=op,
+                custom_id=custom_id,
                 multipliers={'inv_dx':'ftype'},
-                known_args={'stride':1})
+                known_args=known_args)
         reqs['apply_stencil'] = apply_stencil
 
         return reqs
@@ -159,6 +188,7 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
         order = s.order
 
         is_conservative = s.is_conservative
+        is_periodic = s.is_periodic
 
         apply_stencil = s.reqs['apply_stencil']
         
@@ -167,10 +197,13 @@ class DirectionalStretchingRhsFunction(OpenClFunctionCodeGenerator):
         W = used_variables['W']
         xyz = used_variables['components']
 
-        offset = s.args['offset']
-
         fargs = {}
-        fargs['offset'] = offset
+        if is_periodic and not cached:
+            fargs['base']  = s.args['base']
+            fargs['offset'] = s.args['offset']
+            fargs['width'] = s.args['width']
+        else:
+            fargs['offset'] = s.args['offset']
         fargs['inv_dx'] = s.args['inv_dx']
         if vectorize_u:
             fargs[U] = s.args[U]
diff --git a/hysop/codegen/kernels/directional_stretching.py b/hysop/codegen/kernels/directional_stretching.py
index 1c7287f03..6be81ae7a 100644
--- a/hysop/codegen/kernels/directional_stretching.py
+++ b/hysop/codegen/kernels/directional_stretching.py
@@ -135,11 +135,8 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
         assert (local_work_size[1] == 1) and (local_work_size[2]==1)
         assert local_work_size[0] > 2*cache_ghosts
 
-        local_work = local_work_size[0] - 2*cache_ghosts
-        Gx = ((work_size[0]+local_work-1)/local_work) * local_work_size[0]
-
         global_size = work_size.copy()
-        global_size[0] = Gx
+        global_size[0] = local_work_size[0]
         return global_size
     
     #return a tuple of required (static,dynamic) cache bytes per workgroup
@@ -191,7 +188,7 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
          
         stretching_rhs = DirectionalStretchingRhsFunction(typegen=typegen, dim=work_dim, 
                 ftype=ftype, cached=cached,
-                order=order, direction=direction,
+                order=order, direction=direction, boundary=boundary,
                 formulation=formulation,
                 restrict=True, vectorize_u=False,
                 itype='int')
@@ -419,6 +416,19 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
 
                 init = compute_index(idx=global_id, size=grid_size)
                 s.append('{} = {};'.format(global_index(), init))
+
+                cond='{}==0 && {}==0 && {}==0'.format(global_id[2],global_id[1],local_id[0])
+                with s._if_(cond):
+                    code = 'printf(\"G=(%lu,%lu,%lu)\\n\", get_global_size(0), get_global_size(1), get_global_size(2));'
+                    s.append(code)
+                    code = 'printf(\"L=(%lu,%lu,%lu)\\n\", get_local_size(0), get_local_size(1), get_local_size(2));'
+                    s.append(code)
+                    code = 'printf(\"gid=(%u,%u,%u)\\n\", get_global_id(0), get_global_id(1), get_global_id(2));'
+                    s.append(code)
+                    code = 'printf(\"lid=(%u,%u,%u)\\n\", get_local_id(0), get_local_id(1), get_local_id(2));'
+                    s.append(code)
+                    code = 'printf(\"k=%i, gidx=%i, GID=%i \\n\", k,gid.x,GID);'
+                    s.append(code)
                     
                 winit, uinit = '',''
                 for i in xrange(work_dim):
@@ -435,9 +445,13 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                     s.append('{} = {};'.format(U(), uinit))
                     s.append('{} = {};'.format(W(), winit))
                     if is_periodic:
-                        with s._if_('{} < {}'.format(local_id[0],cache_ghosts())):
-                            s.append('{} = {};'.format(Ul[local_id[0]], U()))
-                            s.append('{} = {};'.format(Wl[local_id[0]], W()))
+                        with s._if_('{gid}>=0 && {gid}<{ghosts}'.format(gid=global_id[0],ghosts=cache_ghosts())):
+                            s.append('{} = {};'.format(Ul[global_id[0]], U()))
+                            s.append('{} = {};'.format(Wl[global_id[0]], W()))
+                            cond='{}==0 && {}==0'.format(global_id[2],global_id[1])
+                            with s._if_(cond):
+                                code = 'printf(\"AAAAA  k=%i, gidx=%i, lidx=%i, GID=%i\\n\", k,gid.x,lid.x,GID);'
+                                s.append(code)
                     s.append('{} = false;'.format(first()))
                 with s._else_():
                     if is_periodic:
@@ -445,6 +459,11 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                             _id = '{}-{}'.format(global_id[0],compute_grid_size[0])
                             s.append('{} = {};'.format(U(), Ul[_id]))
                             s.append('{} = {};'.format(W(), Wl[_id]))
+                            
+                            cond='{}==0 && {}==0'.format(global_id[2],global_id[1])
+                            with s._if_(cond):
+                                code = 'printf(\"BBBBB k=%i, gidx=%i, lidx=%i, offset=%i \\n\", k,gid.x,lid.x, gid.x-compute_grid_size.x);'
+                                s.append(code)
                         with s._elif_('{} < 2*{}'.format(local_id[0],cache_ghosts())):
                             s.append('{} = {};'.format(U(), Ur[local_id[0]]))
                             s.append('{} = {};'.format(W(), Wr[local_id[0]]))
@@ -476,11 +495,19 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                 s.jumpline()
             
 
-                rk_argnames = runge_kutta.args.keys()
-                rk_args={'offset': local_id[0] if cached else global_index,
-                         'dt': dt,
+                rk_args={'dt': dt,
                          'inv_dx': inv_dx,
                          'W': W}
+                if is_periodic and (not cached):
+                    base = CodegenVariable('base','int',typegen=tg,const=True)
+                    base.declare(s,init='({}/{}) * {}'.format(global_index(),grid_size[0],grid_size[0]))
+                    offset = CodegenVariable('offset','int',typegen=tg,const=True)
+                    offset.declare(s,init='{}-{}'.format(global_index(),base()))
+                    rk_args['base']   = base
+                    rk_args['offset'] = offset
+                    rk_args['width']  = grid_size[0]
+                else:
+                    rk_args['offset'] = local_id[0] if cached else global_index
                 if is_conservative:
                     rk_args['Lx']  = local_size[0]
                     rk_args['lidx'] = local_id[0]
@@ -499,8 +526,8 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                         Wd  = s.vars[Wd_name]
                     rk_args[Wd_name] = Wd
                 call = runge_kutta(**rk_args)
-                # code = '{} = {};'.format(W(), call)
-                code = '{} += 42;'.format(W(), call)
+                code = '{} = {};'.format(W(), call)
+                #code = '{} += 42;'.format(W(), call)
                 s.append(code)
                 s.jumpline()
                 
@@ -515,6 +542,17 @@ class DirectionalStretchingKernel(KernelCodeGenerator):
                         Wi = s.vars[Wi]
                         code='{} = {};'.format(Wi[global_index()], W[i])
                         s.append(code)
+                    cond='{}==0 && {}==0'.format(global_id[2],global_id[1])
+                    if is_periodic and (not cached):
+                        with s._if_(cond):
+                            with s._if_('{0} == 0 || {0} == 49'.format(global_id[0])):
+                                code = 'printf(\"DDDDD k=%i, gidx=%i, lidx=%i, offset=%i, GID=%i, base=%i, offset=%i, width=%i\\n\", k,gid.x,lid.x, gid.x-compute_grid_size.x,GID,base,offset,grid_size.x);'
+                                s.append(code)
+
+                cond='{}==0 && {}==0'.format(global_id[2],global_id[1])
+                with s._elif_(cond):
+                    code = 'printf(\"CCCCC k=%i, gidx=%i, lidx=%i, offset=%i \\n\", k,gid.x,lid.x, gid.x-compute_grid_size.x);'
+                    s.append(code)
                 
                 s.barrier(_global=True,_local=True)
                 
diff --git a/hysop/codegen/kernels/tests/test_directional_stretching.py b/hysop/codegen/kernels/tests/test_directional_stretching.py
index cad0b6465..d272ff4c8 100644
--- a/hysop/codegen/kernels/tests/test_directional_stretching.py
+++ b/hysop/codegen/kernels/tests/test_directional_stretching.py
@@ -11,12 +11,14 @@ class TestDirectionalStretching(object):
 
     @classmethod
     def setup_class(cls,enable_error_plots=False):
-        typegen = _test_typegen('float','dec')
+        typegen = _test_typegen('double','dec')
+        dtype = np.float64
+
         queue = cl.CommandQueue(typegen.context)
         ctx = typegen.context
 
-        grid_size = np.asarray([40,40,40])
-        compute_grid_ghosts = np.asarray([12,12,12])
+        grid_size = np.asarray([50,50,50])
+        compute_grid_ghosts = np.asarray([2,2,2])
         compute_grid_size   = grid_size + 2*compute_grid_ghosts
         
         (A,grid_mesh_info)         = _test_mesh_info(typegen,3,0,grid_size)
@@ -32,20 +34,20 @@ class TestDirectionalStretching(object):
 
         host_buffers_init = {
                 'no_ghosts': {
-                    'ux': np.random.rand(*grid_size).astype(np.float32),
-                    'uy': np.random.rand(*grid_size).astype(np.float32),
-                    'uz': np.random.rand(*grid_size).astype(np.float32),
-                    'wx': np.random.rand(*grid_size).astype(np.float32),
-                    'wy': np.random.rand(*grid_size).astype(np.float32),
-                    'wz': np.random.rand(*grid_size).astype(np.float32)
+                    'ux': np.random.rand(*grid_size).astype(dtype),
+                    'uy': np.random.rand(*grid_size).astype(dtype),
+                    'uz': np.random.rand(*grid_size).astype(dtype),
+                    'wx': np.random.rand(*grid_size).astype(dtype),
+                    'wy': np.random.rand(*grid_size).astype(dtype),
+                    'wz': np.random.rand(*grid_size).astype(dtype)
                 },
                 'with_ghosts': {
-                    'ux': np.random.rand(*compute_grid_size).astype(np.float32),
-                    'uy': np.random.rand(*compute_grid_size).astype(np.float32),
-                    'uz': np.random.rand(*compute_grid_size).astype(np.float32),
-                    'wx': np.random.rand(*compute_grid_size).astype(np.float32),
-                    'wy': np.random.rand(*compute_grid_size).astype(np.float32),
-                    'wz': np.random.rand(*compute_grid_size).astype(np.float32)
+                    'ux': np.random.rand(*compute_grid_size).astype(dtype),
+                    'uy': np.random.rand(*compute_grid_size).astype(dtype),
+                    'uz': np.random.rand(*compute_grid_size).astype(dtype),
+                    'wx': np.random.rand(*compute_grid_size).astype(dtype),
+                    'wy': np.random.rand(*compute_grid_size).astype(dtype),
+                    'wz': np.random.rand(*compute_grid_size).astype(dtype)
                 }
         }
 
@@ -101,8 +103,8 @@ class TestDirectionalStretching(object):
         cls.host_buffers_gpu       = host_buffers_gpu
         cls.device_buffers         = device_buffers
 
-        cls.dt = np.float32(0.5)
-        cls.local_work_size = np.asarray([32,1,1])
+        cls.dt = dtype(0.5)
+        cls.local_work_size = np.asarray([16,1,1])
         cls.inv_dx = inv_dx
 
         cls.enable_error_plots = enable_error_plots
@@ -135,6 +137,7 @@ class TestDirectionalStretching(object):
             view = [slice(ghosts[0],grid_size[0]+ghosts[0]),
                     slice(ghosts[1],grid_size[1]+ghosts[1]),
                     slice(ghosts[2],grid_size[2]+ghosts[2])]
+            print view
         else:
             raise ValueError()
         
@@ -148,7 +151,7 @@ class TestDirectionalStretching(object):
         def deriv(field):
             res = np.zeros_like(field)
             for i in xrange(-order/2,order/2+1,1):
-                res += stencil[i+order/2]*np.roll(field,i)
+                res += stencil[i+order/2]*np.roll(field,-i)
             return res*self.inv_dx[0]
 
         host_init_buffers      = self.host_buffers_init[target] 
@@ -164,10 +167,8 @@ class TestDirectionalStretching(object):
             if formulation==StretchingFormulation.GRAD_UW:
                 for i in xrange(3):
                     if i!=direction:
-                        # W[i] += dt*dUdx[i]*W[direction]
-                        W[i] += 42
-                # W[direction] += dt*dUdx[direction]*W[direction]
-                W[direction] += 42
+                        W[i] += dt*dUdx[i]*W[direction]
+                W[direction] += dt*dUdx[direction]*W[direction]
             else:
                 msg = 'Unknown stretching formulation scheme {}.'.format(
                         StretchingFormulation.svalue(formulation))
@@ -239,8 +240,6 @@ class TestDirectionalStretching(object):
             boundary=boundary,
             known_vars=known_vars)
 
-        dsk.edit()
-
         global_work_size = dsk.get_global_work_size(work_size,local_work_size)
         (static_shared_bytes, dynamic_shared_bytes) = \
                 dsk.required_workgroup_cache_size(local_work_size)
@@ -282,38 +281,43 @@ class TestDirectionalStretching(object):
         
         buffers = [(varname,host_buffers_reference[varname],host_buffers_gpu[varname]) 
                         for varname in vorticity]
-        self._cmp_buffers(buffers,view)
+        self._cmp_buffers(buffers,view,dsk)
     
-    def _cmp_buffers(self,buffers,view):
+    def _cmp_buffers(self,buffers,view,dsk):
         good = True
         err_buffers = []
 
         for (name,host,dev) in buffers:
             (l1,l2,linf) = self._distances(host,dev,view)
             print '\t{} -> l1={}  l2={}  linf={}'.format(name,l1,l2,linf)
-            if linf>0:
+            if linf>1e-12:
                 err_buffers.append(name)
                 good = False
         if not good:
             msg = '\n[FAIL] Buffer comparisson failed for buffers {}.\n'.format(err_buffers)
             print msg
+            dsk.edit()
             if self.enable_error_plots:
                 from matplotlib import pyplot as plt
                 for (name,host,dev) in buffers:
                     if name in err_buffers:
-                        fig = plt.figure()
+                        fig,axes = plt.subplots(2,2)
 
-                        host = host[view][0,:,:]
-                        dev  =  dev[view][0,:,:]
+                        host = host[view]
+                        dev  = dev[view]
 
                         d = (dev-host)*(dev-host)
                         d -= np.mean(d)
                         
                         plt.title(name)
-                        plt.imshow(d,interpolation='nearest')
+
+                        axes[0][0].imshow(np.sum(d,axis=0),interpolation='nearest')
+                        axes[0][1].imshow(np.sum(d,axis=1),interpolation='nearest')
+                        axes[1][0].imshow(np.sum(d,axis=2),interpolation='nearest')
+                        axes[1][1].imshow(np.sum(d,axis=(0,1))[np.newaxis,:],interpolation='nearest')
 
                         fig.show()
-                raw_input('Press enter to close windows.')
+                raw_input('Press enter to close all windows.')
             raise RuntimeError(msg)
 
     def _distances(self,lhs,rhs,view):
@@ -330,16 +334,19 @@ class TestDirectionalStretching(object):
         assert formulation in StretchingFormulation.entries()
         assert isinstance(rk_scheme, ExplicitRungeKutta)
 
-        order=6
         direction=0
-        boundary=BoundaryCondition.PERIODIC
         cached=False
 
-        self._do_compute_cpu(order=order, direction=direction, boundary=boundary,
-                formulation=formulation, rk_scheme=rk_scheme)
+        orders=[4,6]
+        boundaries=[BoundaryCondition.NONE,BoundaryCondition.PERIODIC]
         
-        self._do_compute_gpu_and_check(order=order, direction=direction, boundary=boundary,
-                formulation=formulation, rk_scheme=rk_scheme, cached=cached)
+        for boundary in boundaries:
+            for order in orders:
+                self._do_compute_cpu(order=order, direction=direction, boundary=boundary,
+                        formulation=formulation, rk_scheme=rk_scheme)
+                
+                self._do_compute_gpu_and_check(order=order, direction=direction, boundary=boundary,
+                        formulation=formulation, rk_scheme=rk_scheme, cached=cached)
    
 
     #########
-- 
GitLab