From e3b2f5aa133fff581e1dd662a6f8c46ada50cdf2 Mon Sep 17 00:00:00 2001
From: "Jean-Luc Parouty Jean-Luc.Parouty@simap.grenoble-inp.fr"
 <paroutyj@f-dahu.u-ga.fr>
Date: Fri, 31 Jan 2020 12:45:51 +0100
Subject: [PATCH] Update LSTM

Former-commit-id: 3e5e007d7909ab1fb8b064db23825fd2af3601d8
---
 IMDB/.~03-LSTM-Keras.ipynb    | 679 ++++++++++++++++++++++++++++++++++
 IMDB/01-Embedding-Keras.ipynb | 105 +-----
 IMDB/03-LSTM-Keras.ipynb      | 440 ----------------------
 3 files changed, 691 insertions(+), 533 deletions(-)
 create mode 100644 IMDB/.~03-LSTM-Keras.ipynb

diff --git a/IMDB/.~03-LSTM-Keras.ipynb b/IMDB/.~03-LSTM-Keras.ipynb
new file mode 100644
index 0000000..db8bfb1
--- /dev/null
+++ b/IMDB/.~03-LSTM-Keras.ipynb
@@ -0,0 +1,679 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Text Embedding - IMDB dataset\n",
+    "=============================\n",
+    "---\n",
+    "Introduction au Deep Learning  (IDLE) - S. Arias, E. Maldonado, JL. Parouty - CNRS/SARI/DEVLOG - 2020  \n",
+    "\n",
+    "## Text classification using **Text embedding** :\n",
+    "\n",
+    "The objective is to guess whether film reviews are **positive or negative** based on the analysis of the text. \n",
+    "\n",
+    "The original dataset can be found **[there](http://ai.stanford.edu/~amaas/data/sentiment/)**  \n",
+    "Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)  \n",
+    "For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)\n",
+    "\n",
+    "What we're going to do:\n",
+    "\n",
+    " - Retrieve data\n",
+    " - Prepare the data\n",
+    " - Build a model\n",
+    " - Train the model\n",
+    " - Evaluate the result\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1 - Init python stuff"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "IDLE 2020 - Practical Work Module\n",
+      "  Version            : 0.2.4\n",
+      "  Run time           : Wednesday 29 January 2020, 13:22:54\n",
+      "  Matplotlib style   : fidle/talk.mplstyle\n",
+      "  TensorFlow version : 2.0.0\n",
+      "  Keras version      : 2.2.4-tf\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import tensorflow.keras as keras\n",
+    "import tensorflow.keras.datasets.imdb as imdb\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib\n",
+    "import seaborn as sns\n",
+    "\n",
+    "import os,h5py,json\n",
+    "\n",
+    "import fidle.pwk as ooo\n",
+    "from importlib import reload\n",
+    "\n",
+    "ooo.init()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2 - Retrieve data\n",
+    "\n",
+    "**From Keras :**\n",
+    "This IMDb dataset can be retrieved directly from [Keras datasets](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)  \n",
+    "\n",
+    "Due to their nature, textual data can be somewhat complex.\n",
+    "\n",
+    "### 2.1 - Data structure :  \n",
+    "The dataset is composed of 2 parts: **reviews** and **opinions** (positive/negative),  with a **dictionary**\n",
+    "\n",
+    "  - dataset = (reviews, opinions)\n",
+    "    - reviews = \\[ review_0, review_1, ...\\]\n",
+    "      - review_i = [ int1, int2, ...] where int_i is the index of the word in the dictionary.\n",
+    "    - opinions = \\[ int0, int1, ...\\] where int_j == 0 if opinion is negative or 1 if opinion is positive.\n",
+    "  - dictionary = \\[ word1:int1, word2:int2, ... ]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.2 - Get dataset\n",
+    "For simplicity, we will use a pre-formatted dataset.  \n",
+    "See : https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data  \n",
+    "\n",
+    "However, Keras offers some useful tools for formatting textual data.  \n",
+    "See : https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocab_size = 10000\n",
+    "\n",
+    "# ----- Retrieve x,y\n",
+    "#\n",
+    "(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words  = vocab_size,\n",
+    "                                                       skip_top   = 0,\n",
+    "                                                       maxlen     = None,\n",
+    "                                                       seed       = 42,\n",
+    "                                                       start_char = 1,\n",
+    "                                                       oov_char   = 2,\n",
+    "                                                       index_from = 3, )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Max(x_train,x_test)  :  9999\n",
+      "  x_train : (25000,)  y_train : (25000,)\n",
+      "  x_test  : (25000,)  y_test  : (25000,)\n",
+      "\n",
+      "Review example (x_train[12]) :\n",
+      "\n",
+      " [1, 14, 22, 1367, 53, 206, 159, 4, 636, 898, 74, 26, 11, 436, 363, 108, 7, 14, 432, 14, 22, 9, 1055, 34, 8599, 2, 5, 381, 3705, 4509, 14, 768, 47, 839, 25, 111, 1517, 2579, 1991, 438, 2663, 587, 4, 280, 725, 6, 58, 11, 2714, 201, 4, 206, 16, 702, 5, 5176, 19, 480, 5920, 157, 13, 64, 219, 4, 2, 11, 107, 665, 1212, 39, 4, 206, 4, 65, 410, 16, 565, 5, 24, 43, 343, 17, 5602, 8, 169, 101, 85, 206, 108, 8, 3008, 14, 25, 215, 168, 18, 6, 2579, 1991, 438, 2, 11, 129, 1609, 36, 26, 66, 290, 3303, 46, 5, 633, 115, 4363]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"  Max(x_train,x_test)  : \", ooo.rmax([x_train,x_test]) )\n",
+    "print(\"  x_train : {}  y_train : {}\".format(x_train.shape, y_train.shape))\n",
+    "print(\"  x_test  : {}  y_test  : {}\".format(x_test.shape,  y_test.shape))\n",
+    "\n",
+    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 - Have a look for humans (optional)\n",
+    "When we loaded the dataset, we asked to use \\<start\\> as 1 and \\<unknown word\\> as 2  \n",
+    "So, we shifted the dataset by 3 with the parameter index_from=3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Retrieve dictionary {word:index}, and encode it in ascii\n",
+    "\n",
+    "word_index = imdb.get_word_index()\n",
+    "\n",
+    "# ---- Shift the dictionary from +3\n",
+    "\n",
+    "word_index = {w:(i+3) for w,i in word_index.items()}\n",
+    "\n",
+    "# ---- Add <pad>, <start> and unknown tags\n",
+    "\n",
+    "word_index.update( {'<pad>':0, '<start>':1, '<unknown>':2} )\n",
+    "\n",
+    "# ---- Create a reverse dictionary : {index:word}\n",
+    "\n",
+    "index_word = {index:word for word,index in word_index.items()} \n",
+    "\n",
+    "# ---- Add a nice function to transpose :\n",
+    "#\n",
+    "def dataset2text(review):\n",
+    "    return ' '.join([index_word.get(i, '?') for i in review])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Dictionary size     :  88587\n",
+      "\n",
+      "Review example (x_train[12]) :\n",
+      "\n",
+      " [1, 14, 22, 1367, 53, 206, 159, 4, 636, 898, 74, 26, 11, 436, 363, 108, 7, 14, 432, 14, 22, 9, 1055, 34, 8599, 2, 5, 381, 3705, 4509, 14, 768, 47, 839, 25, 111, 1517, 2579, 1991, 438, 2663, 587, 4, 280, 725, 6, 58, 11, 2714, 201, 4, 206, 16, 702, 5, 5176, 19, 480, 5920, 157, 13, 64, 219, 4, 2, 11, 107, 665, 1212, 39, 4, 206, 4, 65, 410, 16, 565, 5, 24, 43, 343, 17, 5602, 8, 169, 101, 85, 206, 108, 8, 3008, 14, 25, 215, 168, 18, 6, 2579, 1991, 438, 2, 11, 129, 1609, 36, 26, 66, 290, 3303, 46, 5, 633, 115, 4363]\n",
+      "\n",
+      "In real words :\n",
+      "\n",
+      " <start> this film contains more action before the opening credits than are in entire hollywood films of this sort this film is produced by tsui <unknown> and stars jet li this team has brought you many worthy hong kong cinema productions including the once upon a time in china series the action was fast and furious with amazing wire work i only saw the <unknown> in two shots aside from the action the story itself was strong and not just used as filler to find any other action films to rival this you must look for a hong kong cinema <unknown> in your area they are really worth checking out and usually never disappoint\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('\\nDictionary size     : ', len(word_index))\n",
+    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])\n",
+    "print('\\nIn real words :\\n\\n', dataset2text(x_train[12]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4 - Have a look for neurons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 864x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "plt.figure(figsize=(12, 6))\n",
+    "ax=sns.distplot([len(i) for i in x_train],bins=60)\n",
+    "ax.set_title('Distribution of reviews by size')\n",
+    "plt.xlabel(\"Review's sizes\")\n",
+    "plt.ylabel('Density')\n",
+    "ax.set_xlim(0, 1500)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3 - Preprocess the data\n",
+    "In order to be processed by an NN, all entries must have the same length.  \n",
+    "We chose a review length of **review_len**  \n",
+    "We will therefore complete shorter reviews with padding (\\<pad\\>) and truncate longer ones  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Review example (x_train[12]) :\n",
+      "\n",
+      " [   1   14   22 1367   53  206  159    4  636  898   74   26   11  436\n",
+      "  363  108    7   14  432   14   22    9 1055   34 8599    2    5  381\n",
+      " 3705 4509   14  768   47  839   25  111 1517 2579 1991  438 2663  587\n",
+      "    4  280  725    6   58   11 2714  201    4  206   16  702    5 5176\n",
+      "   19  480 5920  157   13   64  219    4    2   11  107  665 1212   39\n",
+      "    4  206    4   65  410   16  565    5   24   43  343   17 5602    8\n",
+      "  169  101   85  206  108    8 3008   14   25  215  168   18    6 2579\n",
+      " 1991  438    2   11  129 1609   36   26   66  290 3303   46    5  633\n",
+      "  115 4363    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0    0    0    0    0    0    0    0    0    0    0\n",
+      "    0    0    0    0]\n",
+      "\n",
+      "In real words :\n",
+      "\n",
+      " <start> this film contains more action before the opening credits than are in entire hollywood films of this sort this film is produced by tsui <unknown> and stars jet li this team has brought you many worthy hong kong cinema productions including the once upon a time in china series the action was fast and furious with amazing wire work i only saw the <unknown> in two shots aside from the action the story itself was strong and not just used as filler to find any other action films to rival this you must look for a hong kong cinema <unknown> in your area they are really worth checking out and usually never disappoint <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>\n"
+     ]
+    }
+   ],
+   "source": [
+    "review_len = 256\n",
+    "\n",
+    "x_train = keras.preprocessing.sequence.pad_sequences(x_train,\n",
+    "                                                     value   = 0,\n",
+    "                                                     padding = 'post',\n",
+    "                                                     maxlen  = review_len)\n",
+    "\n",
+    "x_test  = keras.preprocessing.sequence.pad_sequences(x_test,\n",
+    "                                                     value   = 0 ,\n",
+    "                                                     padding = 'post',\n",
+    "                                                     maxlen  = review_len)\n",
+    "\n",
+    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])\n",
+    "print('\\nIn real words :\\n\\n', dataset2text(x_train[12]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save dataset and dictionary (can be useful)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved.\n"
+     ]
+    }
+   ],
+   "source": [
+    "os.makedirs('./data',   mode=0o750, exist_ok=True)\n",
+    "\n",
+    "with h5py.File('./data/dataset_imdb.h5', 'w') as f:\n",
+    "    f.create_dataset(\"x_train\",    data=x_train)\n",
+    "    f.create_dataset(\"y_train\",    data=y_train)\n",
+    "    f.create_dataset(\"x_test\",     data=x_test)\n",
+    "    f.create_dataset(\"y_test\",     data=y_test)\n",
+    "\n",
+    "with open('./data/word_index.json', 'w') as fp:\n",
+    "    json.dump(word_index, fp)\n",
+    "\n",
+    "with open('./data/index_word.json', 'w') as fp:\n",
+    "    json.dump(index_word, fp)\n",
+    "\n",
+    "print('Saved.')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4 - Build the model\n",
+    "A few remarks :\n",
+    "1. We'll choose a dense vector size for the embedding output with **dense_vector_size**\n",
+    "2. **GlobalAveragePooling1D** does a pooling over the sequence dimension : (None, lx, ly) -> (None, ly)  \n",
+    "In other words: we average the set of vectors/words of a sentence\n",
+    "3. The Keras embedding layer is trained in a supervised way. It is a layer going from *vocab_size* neurons to *n_neurons* that maintains a table of vectors (the weights are the vectors). This layer does not compute an output the way ordinary layers do, but returns the values of the vectors. n words => n vectors (then stacked by the pooling)  \n",
+    "See : https://stats.stackexchange.com/questions/324992/how-the-embedding-layer-is-trained-in-keras-embedding-layer\n",
+    "\n",
+    "TO FOLLOW UP : https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks\n",
+    "### 4.1 - Build\n",
+    "More documentation about :\n",
+    " - [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)\n",
+    " - [GlobalAveragePooling1D](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalAveragePooling1D)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_model(dense_vector_size=128):\n",
+    "    \n",
+    "    model = keras.Sequential()\n",
+    "    model.add(keras.layers.Embedding(input_dim    = vocab_size, \n",
+    "                                     output_dim   = dense_vector_size, \n",
+    "                                     input_length = review_len))\n",
+    "    model.add(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n",
+    "    model.add(keras.layers.Dense(1,                 activation='sigmoid'))\n",
+    "\n",
+    "    model.compile(optimizer = 'adam',\n",
+    "                  loss      = 'binary_crossentropy',\n",
+    "                  metrics   = ['accuracy'])\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 5 - Train the model\n",
+    "### 5.1 - Get it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model: \"sequential_1\"\n",
+      "_________________________________________________________________\n",
+      "Layer (type)                 Output Shape              Param #   \n",
+      "=================================================================\n",
+      "embedding_1 (Embedding)      (None, 256, 128)          1280000   \n",
+      "_________________________________________________________________\n",
+      "lstm_1 (LSTM)                (None, 128)               131584    \n",
+      "_________________________________________________________________\n",
+      "dense_1 (Dense)              (None, 1)                 129       \n",
+      "=================================================================\n",
+      "Total params: 1,411,713\n",
+      "Trainable params: 1,411,713\n",
+      "Non-trainable params: 0\n",
+      "_________________________________________________________________\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = get_model()\n",
+    "\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.2 - Add callback"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.makedirs('./run/models',   mode=0o750, exist_ok=True)\n",
+    "save_dir = \"./run/models/best_model.h5\"\n",
+    "savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, save_best_only=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.3 - Train it\n",
+    "GPU : batch_size=512 : 305s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train on 25000 samples, validate on 25000 samples\n",
+      "Epoch 1/10\n",
+      "25000/25000 [==============================] - 206s 8ms/sample - loss: 0.6684 - accuracy: 0.5654 - val_loss: 0.6667 - val_accuracy: 0.5606\n",
+      "Epoch 2/10\n",
+      " 9120/25000 [=========>....................] - ETA: 1:42 - loss: 0.6228 - accuracy: 0.5968"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "n_epochs   = 10\n",
+    "batch_size = 32\n",
+    "\n",
+    "history = model.fit(x_train,\n",
+    "                    y_train,\n",
+    "                    epochs          = n_epochs,\n",
+    "                    batch_size      = batch_size,\n",
+    "                    validation_data = (x_test, y_test),\n",
+    "                    verbose         = 1,\n",
+    "                    callbacks       = [savemodel_callback])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 6 - Evaluate\n",
+    "### 6.1 - Training history"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 576x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 576x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "ooo.plot_history(history)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6.2 - Reload and evaluate best model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x_test / loss      : 0.4452\n",
+      "x_test / accuracy  : 0.8282\n"
+     ]
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "#### Accuracy donut is :"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 432x432 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "#### Confusion matrix is :"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style  type=\"text/css\" >\n",
+       "    #T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow0_col0 {\n",
+       "            background-color:  #ffa500;\n",
+       "            color:  #000000;\n",
+       "            font-size:  20pt;\n",
+       "        }    #T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow0_col1 {\n",
+       "            background-color:  #fff6e5;\n",
+       "            color:  #000000;\n",
+       "            font-size:  20pt;\n",
+       "        }    #T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow1_col0 {\n",
+       "            background-color:  #fff6e5;\n",
+       "            color:  #000000;\n",
+       "            font-size:  20pt;\n",
+       "        }    #T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow1_col1 {\n",
+       "            background-color:  #ffa500;\n",
+       "            color:  #000000;\n",
+       "            font-size:  20pt;\n",
+       "        }</style><table id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668d\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >0</th>        <th class=\"col_heading level0 col1\" >1</th>    </tr></thead><tbody>\n",
+       "                <tr>\n",
+       "                        <th id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668dlevel0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
+       "                        <td id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow0_col0\" class=\"data row0 col0\" >0.81</td>\n",
+       "                        <td id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow0_col1\" class=\"data row0 col1\" >0.19</td>\n",
+       "            </tr>\n",
+       "            <tr>\n",
+       "                        <th id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668dlevel0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
+       "                        <td id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow1_col0\" class=\"data row1 col0\" >0.15</td>\n",
+       "                        <td id=\"T_7ebbbdc8_4293_11ea_ab68_4df1eb20668drow1_col1\" class=\"data row1 col1\" >0.85</td>\n",
+       "            </tr>\n",
+       "    </tbody></table>"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x7f879daa7750>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "model = keras.models.load_model('./run/models/best_model.h5')\n",
+    "\n",
+    "# ---- Evaluate\n",
+    "reload(ooo)\n",
+    "score  = model.evaluate(x_test, y_test, verbose=0)\n",
+    "\n",
+    "print('x_test / loss      : {:5.4f}'.format(score[0]))\n",
+    "print('x_test / accuracy  : {:5.4f}'.format(score[1]))\n",
+    "\n",
+    "values=[score[1], 1-score[1]]\n",
+    "ooo.plot_donut(values,[\"Accuracy\",\"Errors\"], title=\"#### Accuracy donut is :\")\n",
+    "\n",
+    "# ---- Confusion matrix\n",
+    "\n",
+    "y_pred   = model.predict_classes(x_test)\n",
+    "\n",
+    "ooo.display_confusion_matrix(y_test,y_pred,labels=range(2),color='orange',font_size='20pt')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/IMDB/01-Embedding-Keras.ipynb b/IMDB/01-Embedding-Keras.ipynb
index 1d5df5e..b5cd9d3 100644
--- a/IMDB/01-Embedding-Keras.ipynb
+++ b/IMDB/01-Embedding-Keras.ipynb
@@ -44,7 +44,7 @@
      "text": [
       "IDLE 2020 - Practical Work Module\n",
       "  Version            : 0.2.4\n",
-      "  Run time           : Monday 27 January 2020, 22:38:12\n",
+      "  Run time           : Monday 27 January 2020, 23:33:47\n",
       "  Matplotlib style   : fidle/talk.mplstyle\n",
       "  TensorFlow version : 2.0.0\n",
       "  Keras version      : 2.2.4-tf\n"
@@ -403,32 +403,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Model: \"sequential\"\n",
-      "_________________________________________________________________\n",
-      "Layer (type)                 Output Shape              Param #   \n",
-      "=================================================================\n",
-      "embedding (Embedding)        (None, 256, 32)           320000    \n",
-      "_________________________________________________________________\n",
-      "global_average_pooling1d (Gl (None, 32)                0         \n",
-      "_________________________________________________________________\n",
-      "dense (Dense)                (None, 32)                1056      \n",
-      "_________________________________________________________________\n",
-      "dense_1 (Dense)              (None, 1)                 33        \n",
-      "=================================================================\n",
-      "Total params: 321,089\n",
-      "Trainable params: 321,089\n",
-      "Non-trainable params: 0\n",
-      "_________________________________________________________________\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "model = get_model(32)\n",
     "\n",
@@ -462,76 +439,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train on 25000 samples, validate on 25000 samples\n",
-      "Epoch 1/30\n",
-      "25000/25000 [==============================] - 2s 62us/sample - loss: 0.6889 - accuracy: 0.5997 - val_loss: 0.6801 - val_accuracy: 0.7119\n",
-      "Epoch 2/30\n",
-      "25000/25000 [==============================] - 1s 31us/sample - loss: 0.6539 - accuracy: 0.7605 - val_loss: 0.6206 - val_accuracy: 0.7505\n",
-      "Epoch 3/30\n",
-      "25000/25000 [==============================] - 1s 31us/sample - loss: 0.5606 - accuracy: 0.8069 - val_loss: 0.5124 - val_accuracy: 0.8144\n",
-      "Epoch 4/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.4439 - accuracy: 0.8508 - val_loss: 0.4171 - val_accuracy: 0.8472\n",
-      "Epoch 5/30\n",
-      "25000/25000 [==============================] - 1s 31us/sample - loss: 0.3577 - accuracy: 0.8750 - val_loss: 0.3616 - val_accuracy: 0.8614\n",
-      "Epoch 6/30\n",
-      "25000/25000 [==============================] - 1s 31us/sample - loss: 0.3053 - accuracy: 0.8904 - val_loss: 0.3297 - val_accuracy: 0.8683\n",
-      "Epoch 7/30\n",
-      "25000/25000 [==============================] - 1s 31us/sample - loss: 0.2697 - accuracy: 0.9019 - val_loss: 0.3115 - val_accuracy: 0.8736\n",
-      "Epoch 8/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.2441 - accuracy: 0.9113 - val_loss: 0.2999 - val_accuracy: 0.8768\n",
-      "Epoch 9/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.2235 - accuracy: 0.9193 - val_loss: 0.2926 - val_accuracy: 0.8797\n",
-      "Epoch 10/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.2069 - accuracy: 0.9260 - val_loss: 0.2889 - val_accuracy: 0.8823\n",
-      "Epoch 11/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.1922 - accuracy: 0.9318 - val_loss: 0.2869 - val_accuracy: 0.8834\n",
-      "Epoch 12/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1799 - accuracy: 0.9368 - val_loss: 0.2901 - val_accuracy: 0.8822\n",
-      "Epoch 13/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1693 - accuracy: 0.9406 - val_loss: 0.2905 - val_accuracy: 0.8809\n",
-      "Epoch 14/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1596 - accuracy: 0.9451 - val_loss: 0.2918 - val_accuracy: 0.8826\n",
-      "Epoch 15/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.1502 - accuracy: 0.9492 - val_loss: 0.2994 - val_accuracy: 0.8790\n",
-      "Epoch 16/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.1422 - accuracy: 0.9526 - val_loss: 0.3014 - val_accuracy: 0.8800\n",
-      "Epoch 17/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1342 - accuracy: 0.9551 - val_loss: 0.3079 - val_accuracy: 0.8788\n",
-      "Epoch 18/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1276 - accuracy: 0.9582 - val_loss: 0.3154 - val_accuracy: 0.8771\n",
-      "Epoch 19/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1207 - accuracy: 0.9611 - val_loss: 0.3205 - val_accuracy: 0.8779\n",
-      "Epoch 20/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1146 - accuracy: 0.9636 - val_loss: 0.3294 - val_accuracy: 0.8745\n",
-      "Epoch 21/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.1093 - accuracy: 0.9658 - val_loss: 0.3361 - val_accuracy: 0.8749\n",
-      "Epoch 22/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.1036 - accuracy: 0.9683 - val_loss: 0.3463 - val_accuracy: 0.8710\n",
-      "Epoch 23/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.0993 - accuracy: 0.9702 - val_loss: 0.3546 - val_accuracy: 0.8722\n",
-      "Epoch 24/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.0941 - accuracy: 0.9718 - val_loss: 0.3643 - val_accuracy: 0.8704\n",
-      "Epoch 25/30\n",
-      "25000/25000 [==============================] - 1s 31us/sample - loss: 0.0891 - accuracy: 0.9749 - val_loss: 0.3783 - val_accuracy: 0.8669\n",
-      "Epoch 26/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.0850 - accuracy: 0.9761 - val_loss: 0.3877 - val_accuracy: 0.8680\n",
-      "Epoch 27/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.0807 - accuracy: 0.9772 - val_loss: 0.4038 - val_accuracy: 0.8640\n",
-      "Epoch 28/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.0774 - accuracy: 0.9791 - val_loss: 0.4123 - val_accuracy: 0.8642\n",
-      "Epoch 29/30\n",
-      "25000/25000 [==============================] - 1s 30us/sample - loss: 0.0733 - accuracy: 0.9811 - val_loss: 0.4195 - val_accuracy: 0.8640\n",
-      "Epoch 30/30\n",
-      "25000/25000 [==============================] - 1s 29us/sample - loss: 0.0693 - accuracy: 0.9823 - val_loss: 0.4328 - val_accuracy: 0.8625\n",
-      "CPU times: user 1min 35s, sys: 4.22 s, total: 1min 39s\n",
-      "Wall time: 23.2 s\n"
+     "ename": "NameError",
+     "evalue": "name 'model' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<timed exec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
      ]
     }
    ],
diff --git a/IMDB/03-LSTM-Keras.ipynb b/IMDB/03-LSTM-Keras.ipynb
index 2f9dadc..e69de29 100644
--- a/IMDB/03-LSTM-Keras.ipynb
+++ b/IMDB/03-LSTM-Keras.ipynb
@@ -1,440 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Text Embedding - IMDB dataset\n",
-    "=============================\n",
-    "---\n",
-    "Introduction au Deep Learning  (IDLE) - S. Arias, E. Maldonado, JL. Parouty - CNRS/SARI/DEVLOG - 2020  \n",
-    "\n",
-    "## Text classification using **Text embedding** :\n",
-    "\n",
-    "The objective is to guess whether film reviews are **positive or negative** based on the analysis of the text. \n",
-    "\n",
-    "The original dataset can be found **[there](http://ai.stanford.edu/~amaas/data/sentiment/)**  \n",
-    "Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)  \n",
-    "For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)\n",
-    "\n",
-    "What we're going to do:\n",
-    "\n",
-    " - Retrieve data\n",
-    " - Preparing the data\n",
-    " - Build a model\n",
-    " - Train the model\n",
-    " - Evaluate the result\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 1 - Init python stuff"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "\n",
-    "import tensorflow as tf\n",
-    "import tensorflow.keras as keras\n",
-    "import tensorflow.keras.datasets.imdb as imdb\n",
-    "\n",
-    "import matplotlib.pyplot as plt\n",
-    "import matplotlib\n",
-    "import seaborn as sns\n",
-    "\n",
-    "import os,h5py,json\n",
-    "\n",
-    "import fidle.pwk as ooo\n",
-    "from importlib import reload\n",
-    "\n",
-    "ooo.init()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 2 - Retrieve data\n",
-    "\n",
-    "**From Keras :**\n",
-    "This IMDb dataset can be loaded directly from [Keras datasets](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)  \n",
-    "\n",
-    "Due to their nature, textual data can be somewhat complex.\n",
-    "\n",
-    "### 2.1 - Data structure :  \n",
-    "The dataset is composed of 2 parts: **reviews** and **opinions** (positive/negative),  with a **dictionary**\n",
-    "\n",
-    "  - dataset = (reviews, opinions)\n",
-    "    - reviews = \\[ review_0, review_1, ...\\]\n",
-    "      - review_i = [ int1, int2, ...] where int_i is the index of the word in the dictionary.\n",
-    "    - opinions = \\[ int0, int1, ...\\] where int_j == 0 if opinion is negative or 1 if opinion is positive.\n",
-    "  - dictionary = \\[ word1:int1, word2:int2, ... ]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.2 - Get dataset\n",
-    "For simplicity, we will use a pre-formatted dataset.  \n",
-    "See : https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data  \n",
-    "\n",
-    "However, Keras offers some useful tools for formatting textual data.  \n",
-    "See : https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vocab_size = 10000\n",
-    "\n",
-    "# ----- Retrieve x,y\n",
-    "#\n",
-    "(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words  = vocab_size,\n",
-    "                                                       skip_top   = 0,\n",
-    "                                                       maxlen     = None,\n",
-    "                                                       seed       = 42,\n",
-    "                                                       start_char = 1,\n",
-    "                                                       oov_char   = 2,\n",
-    "                                                       index_from = 3, )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"  Max(x_train,x_test)  : \", ooo.rmax([x_train,x_test]) )\n",
-    "print(\"  x_train : {}  y_train : {}\".format(x_train.shape, y_train.shape))\n",
-    "print(\"  x_test  : {}  y_test  : {}\".format(x_test.shape,  y_test.shape))\n",
-    "\n",
-    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.3 - Have a look for humans (optional)\n",
-    "When we loaded the dataset, we asked to use \\<start\\> as 1 and \\<unknown word\\> as 2  \n",
-    "So, we shifted the word indices by 3 with the parameter index_from=3"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ---- Retrieve dictionary {word:index}, and encode it in ascii\n",
-    "\n",
-    "word_index = imdb.get_word_index()\n",
-    "\n",
-    "# ---- Shift the dictionary from +3\n",
-    "\n",
-    "word_index = {w:(i+3) for w,i in word_index.items()}\n",
-    "\n",
-    "# ---- Add <pad>, <start> and unknown tags\n",
-    "\n",
-    "word_index.update( {'<pad>':0, '<start>':1, '<unknown>':2} )\n",
-    "\n",
-    "# ---- Create a reverse dictionary : {index:word}\n",
-    "\n",
-    "index_word = {index:word for word,index in word_index.items()} \n",
-    "\n",
-    "# ---- Add a nice function to transpose :\n",
-    "#\n",
-    "def dataset2text(review):\n",
-    "    return ' '.join([index_word.get(i, '?') for i in review])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print('\\nDictionary size     : ', len(word_index))\n",
-    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])\n",
-    "print('\\nIn real words :\\n\\n', dataset2text(x_train[12]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.4 - Have a look for neurons"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.figure(figsize=(12, 6))\n",
-    "ax=sns.distplot([len(i) for i in x_train],bins=60)\n",
-    "ax.set_title('Distribution of reviews by size')\n",
-    "plt.xlabel(\"Review's sizes\")\n",
-    "plt.ylabel('Density')\n",
-    "ax.set_xlim(0, 1500)\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 3 - Preprocess the data\n",
-    "In order to be processed by an NN, all entries must have the same length.  \n",
-    "We chose a review length of **review_len**  \n",
-    "We will therefore complete them with a padding (of \\<pad\\>\\)  "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "review_len = 256\n",
-    "\n",
-    "x_train = keras.preprocessing.sequence.pad_sequences(x_train,\n",
-    "                                                     value   = 0,\n",
-    "                                                     padding = 'post',\n",
-    "                                                     maxlen  = review_len)\n",
-    "\n",
-    "x_test  = keras.preprocessing.sequence.pad_sequences(x_test,\n",
-    "                                                     value   = 0 ,\n",
-    "                                                     padding = 'post',\n",
-    "                                                     maxlen  = review_len)\n",
-    "\n",
-    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])\n",
-    "print('\\nIn real words :\\n\\n', dataset2text(x_train[12]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Save dataset and dictionary (can be useful)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.makedirs('./data',   mode=0o750, exist_ok=True)\n",
-    "\n",
-    "with h5py.File('./data/dataset_imdb.h5', 'w') as f:\n",
-    "    f.create_dataset(\"x_train\",    data=x_train)\n",
-    "    f.create_dataset(\"y_train\",    data=y_train)\n",
-    "    f.create_dataset(\"x_test\",     data=x_test)\n",
-    "    f.create_dataset(\"y_test\",     data=y_test)\n",
-    "\n",
-    "with open('./data/word_index.json', 'w') as fp:\n",
-    "    json.dump(word_index, fp)\n",
-    "\n",
-    "with open('./data/index_word.json', 'w') as fp:\n",
-    "    json.dump(index_word, fp)\n",
-    "\n",
-    "print('Saved.')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 4 - Build the model\n",
-    "Few remarks :\n",
-    "1. We'll choose a dense vector size for the embedding output with **dense_vector_size**\n",
-    "2. **GlobalAveragePooling1D** averages over the sequence dimension : (None, lx, ly) -> (None, ly)  \n",
-    "In other words: we average the set of vectors/words of a sentence\n",
-    "3. The Keras embedding layer is trained in a supervised way. It is a layer going from *vocab_size* neurons to *n_neurons* that maintains a table of vectors (the weights are the vectors). This layer does not compute an output the way ordinary layers do, but returns the values of the vectors. n words => n vectors (then stacked by the pooling)  \n",
-    "See : https://stats.stackexchange.com/questions/324992/how-the-embedding-layer-is-trained-in-keras-embedding-layer\n",
-    "\n",
-    "TO FOLLOW UP : https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks\n",
-    "### 4.1 - Build\n",
-    "More documentation about :\n",
-    " - [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)\n",
-    " - [GlobalAveragePooling1D](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalAveragePooling1D)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_model(dense_vector_size=16):\n",
-    "    \n",
-    "    model = keras.Sequential()\n",
-    "    model.add(keras.layers.Embedding(input_dim    = vocab_size, \n",
-    "                                     output_dim   = dense_vector_size, \n",
-    "                                     input_length = review_len))\n",
-    "    model.add(keras.layers.LSTM(100))\n",
-    "    model.add(keras.layers.Dense(16, activation='relu'))\n",
-    "    model.add(keras.layers.Dense(1,                 activation='sigmoid'))\n",
-    "\n",
-    "    model.compile(optimizer = 'adam',\n",
-    "                  loss      = 'binary_crossentropy',\n",
-    "                  metrics   = ['accuracy'])\n",
-    "    return model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 5 - Train the model\n",
-    "### 5.1 - Get it"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = get_model()\n",
-    "\n",
-    "model.summary()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 5.2 - Add callback"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.makedirs('./run/models',   mode=0o750, exist_ok=True)\n",
-    "save_dir = \"./run/models/best_model.h5\"\n",
-    "savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, save_best_only=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 5.3 - Train it"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%time\n",
-    "\n",
-    "n_epochs   = 5\n",
-    "batch_size = 512\n",
-    "\n",
-    "history = model.fit(x_train,\n",
-    "                    y_train,\n",
-    "                    epochs          = n_epochs,\n",
-    "                    batch_size      = batch_size,\n",
-    "                    validation_data = (x_test, y_test),\n",
-    "                    verbose         = 1,\n",
-    "                    callbacks       = [savemodel_callback])\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 6 - Evaluate\n",
-    "### 6.1 - Training history"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ooo.plot_history(history)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 6.2 - Reload and evaluate best model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = keras.models.load_model('./run/models/best_model.h5')\n",
-    "\n",
-    "# ---- Evaluate\n",
-    "reload(ooo)\n",
-    "score  = model.evaluate(x_test, y_test, verbose=0)\n",
-    "\n",
-    "print('x_test / loss      : {:5.4f}'.format(score[0]))\n",
-    "print('x_test / accuracy  : {:5.4f}'.format(score[1]))\n",
-    "\n",
-    "values=[score[1], 1-score[1]]\n",
-    "ooo.plot_donut(values,[\"Accuracy\",\"Errors\"], title=\"#### Accuracy donut is :\")\n",
-    "\n",
-    "# ---- Confusion matrix\n",
-    "\n",
-    "y_pred   = model.predict_classes(x_test)\n",
-    "\n",
-    "ooo.display_confusion_matrix(y_test,y_pred,labels=range(2),color='orange',font_size='20pt')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
-- 
GitLab