Commit b2163dfb authored by Loic Huder's avatar Loic Huder
Browse files

Merge remote-tracking branch 'origin/master'

parents 63175989 457d68e0
......@@ -5,21 +5,21 @@
%% Cell type:markdown id: tags:
Algorithm on series (*e.g.* time series, character strings, ...)
A series can be either a one dimensional ordered series of items (e.g. an numerical array, a string).
A series is a one dimensional ordered series of items (e.g. an numerical array, a string).
We want to compute a dissimilarity measure between series. The measure can either apply to series of
same length or not, and can be a metric (i.e. symmetric, $d(x, y) = 0 \iff x = y$, triangular inequality).
We consider two dissimilarity measures with different features.
We consider two (dis)similarity measures with different features.
S1 and S2 two series of length |S1| and |S2|
- **dtw**: dynamic time wrapping, complexity O(|S1|*|S2|)
- **cort**: normalized cosine distances between derivatives, complexity O(|S1| + |S2|)
- **dtw**: similarity measure: dynamic time wrapping, complexity O(|S1|*|S2|)
- **cort**: normalized cosine similarity measure between derivatives, complexity O(|S1| + |S2|)
%% Cell type:markdown id: tags:
......@@ -160,34 +160,22 @@
Cort
-------
**Input**: two series S1 and S2 *of same length*
**Output:** a dissimilarity measure
**Output:** a similarity measure
**Complexity:** O(|S1|+|S2|)
**Metric:** yes
What do we compute:
-------------------------------
The cosine distance between derivatives of the series.
normalize
num = 0.0
sum_square_x = 0.0
sum_square_y = 0.0
for t in range(len(s1) - 1):
slope_1 = s1[t + 1] - s1[t]
slope_2 = s2[t + 1] - s2[t]
num += slope_1 * slope_2
sum_square_x += slope_1 * slope_1
sum_square_y += slope_2 * slope_2
return num / (np.sqrt(sum_square_x * sum_square_y))
The cosine similarity measure between derivatives of the series.
%% Cell type:markdown id: tags:
......@@ -195,25 +183,16 @@
------------------------------
$$ \begin{eqnarray}
cort(A, B) &=& \cos(dA, dB) \\
&=& \frac{dA \cdot dB}{\Vert dA\Vert \Vert dB\Vert} \\
&=& \frac{\sum_{i=0}^T dA_i dB_i}{\Vert dA\Vert \Vert dB\Vert} \\
&=& \frac{\sum_{i=0}^T (A_{i+1}-A_i) (B_{i+1}-B_i)}{\sqrt{\sum_{i=0}^T (A_{i+1}-A_i)^2} \sqrt{\sum_{i=0}^T (B_{i+1}-B_i)^2}}
&=& \frac{\sum_{i=0}^{T} dA_i dB_i}{\Vert dA\Vert \Vert dB\Vert} \\
&=& \frac{\sum_{i=0}^{T-1} (A_{i+1}-A_i) (B_{i+1}-B_i)}{\sqrt{\sum_{i=0}^{T-1} (A_{i+1}-A_i)^2} \sqrt{\sum_{i=0}^{T-1} (B_{i+1}-B_i)^2}}
\end{eqnarray} $$
%% Cell type:markdown id: tags:
What do we compute:
------------------------------
<div align="middle">
<img src="./fig/cort_formule.png" style="width: 75%">
</div>
%% Cell type:markdown id: tags:
Easy enough to implement:
---------------------------------------
%% Cell type:code id: tags:
......@@ -246,11 +225,7 @@
``` python
x = [1, 2, 3, 5, 5, 6]
y = [1, 1, 2, 2, 3, 5]
cort(x,y)
print(f"cort(x,2*x)={cort(x, 2*x)} cort([1,2], [2,1])={cort([1,2], [2,1])}")
```
%%%% Output: execute_result
0.4629100498862757
......
all:
python3 setup.py develop
clean:
rm -f cdtw.*.so
rm -rf build dtw_cort_dist_mat.egg-info
#include <stdlib.h>
#include <stdio.h>
#include <float.h>
#include <math.h>
#include <float.h> /* for max_float */
#include "cdtw.h"
double min3(const double x, const double y, const double z){
if (x < y)
if (x < z) return x;
if (y < z) return y;
return z;
}
/*
Returns the minimum-distance warp path
between multivariate time series x and y.
*/
double dtw(double* x, double* y, int size_x, int size_y){
int i, j;
double* temp;
double *costP = malloc(size_x*sizeof(double));
if (! costP) return -1;
double *costC = malloc(size_x*sizeof(double));
if (! costC){
free(costP);
return -1; /* TODO: better handling of the error */
}
costP[0] = fabs(x[0] - y[0]);
for (i=1; i < size_x; i++){
costP[i] = fabs(x[i] - y[0]) + costP[i-1];
}
for (j=1; j< size_y; j++){
costC[0] = costP[0] + fabs(x[0] - y[j]);
for (i=1; i < size_x; i++){
costC[i] = min3(costC[i-1], costP[i-1], costP[i]) + fabs(x[i] - y[j]);
}
temp = costP;
costP = costC;
costC = temp;
}
/* result is read in costP because switching between costP and costC is done */
double ret = costP[size_x-1];
free(costP);
free(costC);
return ret;
}
double cort(double *x, double *y, int size){
int t;
double slopex, slopey;
double num;
double sumSquareXslope, sumSquareYslope;
num = sumSquareXslope = sumSquareYslope = 0.0;
for (t = 0; t < size-1; t++){
slopex = x[t+1] - x[t];
slopey = y[t+1] - y[t];
num += slopex * slopey;
sumSquareXslope += slopex * slopex;
sumSquareYslope += slopey * slopey;
}
return num/sqrt(sumSquareXslope*sumSquareYslope);
}
double dtw(double* x, double* y, int size_x, int size_y);
double cort(double *x, double *y, int size);
from setuptools import setup, Extension
version = "0.1"
module_distance = Extension(
name="cdtw",
sources=["cdtw.c"],
extra_compile_args=["-O3", "-mavx2"],
extra_link_args=["-O3"],
)
setup(
name="dtw_cort_dist_mat",
version=version,
description="data scientist tool for time series",
long_description="data scientist tool for time series",
classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
author="IKATS project",
author_email="Franck.Thollard@imag.fr",
url="http://ikats.imag.fr",
license="GPL",
include_package_data=True,
zip_safe=False,
install_requires=["cffi", "numpy", "setuptools"],
entry_points="",
ext_modules=[module_distance],
)
from pathlib import Path
from invoke import task
import numpy as np
here = Path(__file__).parent.absolute()
......@@ -11,7 +12,10 @@ def clean(c):
for path in directories:
if (path / "Makefile").exists():
c.run(f"cd {path} && make clean", echo=True)
for algo in ("dtw", "cort"):
f = path / "ref_{algo}.npy"
if f.exists():
f.unlink()
@task
def build(c):
......@@ -19,6 +23,18 @@ def build(c):
if (path / "Makefile").exists():
c.run(f"cd {path} && make", echo=True)
@task
def check(c):
ref_dtw = np.load("ref_small_dtw.npy")
ref_cort = np.load("ref_small_cort.npy")
for path in directories:
c.run(f"cd {path} && ./dtw_cort_dist_mat.py -s res", echo=True)
calc_dtw = np.load(f"{path / 'res_dtw.npy'}")
print(f"{path.name} dtw : allclose {np.allclose(ref_dtw, calc_dtw)}")
calc_cort = np.load(f"{path / 'res_cort.npy'}")
print(f"{path.name} cort: allclose {np.allclose(ref_cort, calc_cort)}")
@task
def bench(c, medium=False):
......
......@@ -33,6 +33,9 @@ def main(compute, only_init=False):
parser.add_argument(
"-n", type=int, default=None, help="number of series (default all 200)"
)
parser.add_argument(
"-s", type=str, default=None, help="save the matrices (provide prefixes)"
)
parser.add_argument(
"-v", action="store_true", default=None, help="visualize the matrix"
)
......@@ -55,12 +58,18 @@ def main(compute, only_init=False):
_dist_mat_dtw, _dist_mat_cort = compute(series, nb_series)
print("elapsed time = {:.3f} s".format(time() - t0))
if args.s:
dtw_mat = "{}_dtw".format(args.s)
np.save(dtw_mat, _dist_mat_dtw)
cort_mat = "{}_cort".format(args.s)
np.save(cort_mat, _dist_mat_cort)
if args.v:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2)
cax0 = axes[0].imshow(_dist_mat_dtw, cmap=plt.cm.gray)
axes[0].imshow(_dist_mat_dtw, cmap=plt.cm.gray)
axes[0].set_title("dtw")
cax1 = axes[1].imshow(_dist_mat_cort, cmap=plt.cm.gray)
axes[1].imshow(_dist_mat_cort, cmap=plt.cm.gray)
axes[1].set_title("cort")
plt.show()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment