Commit e322b67c authored by Jean-Luc Parouty

Add / Change Embedding notebooks

parent 29406c69
%% Cell type:markdown id: tags:
<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>
# <!-- TITLE --> [IMDB1] - Sentiment analysis with one-hot encoding
<!-- DESC --> A basic example of sentiment analysis with sparse encoding, using a dataset from Internet Movie Database (IMDB)
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->
## Objectives :
- The objective is to guess whether film reviews are **positive or negative** based on the analysis of the text.
- Understand the management of **textual data** and **sentiment analysis**
The original dataset can be found **[here](http://ai.stanford.edu/~amaas/data/sentiment/)**
Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)
For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)
## What we're going to do :
- Retrieve data
- Prepare the data
- Build a model
- Train the model
- Evaluate the result
%% Cell type:markdown id: tags:
## Step 1 - Import and init
### 1.1 - Python stuff
%% Cell type:code id: tags:
``` python
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets.imdb as imdb
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import os,sys,h5py,json
from importlib import reload
sys.path.append('..')
import fidle.pwk as pwk
run_dir = './run/IMDB1'
datasets_dir = pwk.init('IMDB1', run_dir)
```
%% Cell type:markdown id: tags:
### 1.2 - Parameters
The words in the vocabulary are classified from the most frequent to the rarest.
`vocab_size` is the number of words we will remember in our vocabulary (the other words will be considered as unknown).
`hide_most_frequently` is the number of most frequent words that will be ignored
%% Cell type:code id: tags:
``` python
vocab_size = 10000
hide_most_frequently = 0
epochs = 10
batch_size = 512
```
%% Cell type:markdown id: tags:
Override parameters (batch mode) - Just forget this cell
%% Cell type:code id: tags:
``` python
pwk.override('vocab_size', 'hide_most_frequently', 'batch_size', 'epochs')
```
%% Cell type:markdown id: tags:
## Step 2 - Understanding one-hot encoding
#### We have a **sentence** and a **dictionary** :
%% Cell type:code id: tags:
``` python
sentence = "I've never seen a movie like this before"
dictionary = {"a":0, "before":1, "fantastic":2, "i've":3, "is":4, "like":5, "movie":6, "never":7, "seen":8, "this":9}
```
%% Cell type:markdown id: tags:
#### We encode our sentence as a **numerical vector** :
%% Cell type:code id: tags:
``` python
sentence_words = sentence.lower().split()
sentence_vect = [ dictionary[w] for w in sentence_words ]
print('Sentence words are : ', sentence_words)
print('Our vectorized sentence is : ', sentence_vect)
```
%% Cell type:markdown id: tags:
#### Next, we **one-hot** encode our vectorized sentence as a tensor :
%% Cell type:code id: tags:
``` python
# ---- We get a (dictionary size x sentence length) matrix of zeros
#
onehot = np.zeros( (10,8) )
# ---- We set some 1 for each word
#
for i,w in enumerate(sentence_vect):
    onehot[w,i]=1
# --- Show it
#
print('In a basic way :\n\n', onehot, '\n\nWith a pandas view :\n')
data={ f'{sentence_words[i]:.^10}':onehot[:,i] for i,w in enumerate(sentence_vect) }
df=pd.DataFrame(data)
df.index=dictionary.keys()
df.style.set_precision(0).highlight_max(axis=0).set_properties(**{'text-align': 'center'})
```
%% Cell type:markdown id: tags:
## Step 3 - Retrieve data
The IMDB dataset can be retrieved directly from Keras - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)
Note : Due to their nature, textual data can be somewhat complex.
### 3.1 - Data structure :
The dataset is composed of 2 parts:
- **reviews**, this will be our **x**
- **opinions** (positive/negative), this will be our **y**
There is also a **dictionary**, because the words in the reviews are indexed
```
<dataset>     = (<reviews>, <opinions>)
with :  <reviews>   = [ <review1>, <review2>, ... ]
        <opinions>  = [ <rate1>,   <rate2>,   ... ]
where : <reviewi>   = [ <w1>, <w2>, ... ]   with <wi> = index (int) of the word in the dictionary
        <ratei>     = int : 0 for a negative opinion, 1 for a positive one
<dictionary>  = { <word1>:<w1>, <word2>:<w2>, ... }
with :  <wordi>     = word (str)
        <wi>        = index (int)
```
%% Cell type:markdown id: tags:
### 3.2 - Load dataset
For simplicity, we will use a pre-formatted dataset - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data)
However, Keras offers some useful tools for formatting textual data - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text)
By default :
- The start of a sequence is marked with : 1
- Out-of-vocabulary words are coded : 2
- The first word of the vocabulary starts at index : 3
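As a small illustration of this +3 shift (the word ranks below are hypothetical; only the shift itself matters) :
%% Cell type:code id: tags:
``` python
# ---- With index_from=3, a word whose raw rank is r in imdb.get_word_index()
#      appears as r+3 in the encoded reviews; 0, 1 and 2 stay reserved
#      for <pad>, <start> and <unknown>.
raw_rank = {'the': 1, 'and': 2}                  # hypothetical extract of the raw word index
encoded  = {w: r+3 for w, r in raw_rank.items()}
print(encoded)                                   # -> {'the': 4, 'and': 5}
```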
%% Cell type:code id: tags:
``` python
# ----- Retrieve x,y
#
(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words=vocab_size, skip_top=hide_most_frequently)
y_train = np.asarray(y_train).astype('float32')
y_test = np.asarray(y_test ).astype('float32')
# ---- About
#
print("Max(x_train,x_test) : ", pwk.rmax([x_train,x_test]) )
print("Min(x_train,x_test) : ", pwk.rmin([x_train,x_test]) )
print("x_train : {} y_train : {}".format(x_train.shape, y_train.shape))
print("x_test : {} y_test : {}".format(x_test.shape, y_test.shape))
```
%% Cell type:markdown id: tags:
## Step 4 - About our dataset
When we loaded the dataset, we asked for \<start\> to be coded as 1 and \<unknown word\> as 2.
The word indices are therefore shifted by 3 (parameter index_from=3).
### 4.1 - Sentences encoding
%% Cell type:code id: tags:
``` python
print('\nReview example (x_train[12]) :\n\n',x_train[12])
print('\nOpinions (y_train) :\n\n',y_train)
```
%% Cell type:markdown id: tags:
### 4.2 - Load dictionary
%% Cell type:code id: tags:
``` python
# ---- Retrieve dictionary {word:index}, and encode it in ascii
#
word_index = imdb.get_word_index()
# ---- Shift the dictionary by +3
#
word_index = {w:(i+3) for w,i in word_index.items()}
# ---- Add <pad>, <start>, <unknown> and <undef> tags
#
word_index.update( {'<pad>':0, '<start>':1, '<unknown>':2, '<undef>':3} )
# ---- Create a reverse dictionary : {index:word}
#
index_word = {index:word for word,index in word_index.items()}
# ---- About dictionary
#
print('\nDictionary size : ', len(word_index))
print('\nSmall extract :\n')
for k in range(440,455):print(f' {k:2d} : {index_word[k]}' )
# ---- Add a nice function to transpose :
#
def dataset2text(review):
    return ' '.join([index_word.get(i, '?') for i in review])
```
%% Cell type:markdown id: tags:
### 4.3 - Have a look, for humans
%% Cell type:code id: tags:
``` python
pwk.subtitle('Review example :')
print(x_train[12])
pwk.subtitle('After translation :')
print(dataset2text(x_train[12]))
```
%% Cell type:markdown id: tags:
### 4.4 - A few statistics
%% Cell type:code id: tags:
``` python
sizes=[len(i) for i in x_train]
plt.figure(figsize=(16,6))
plt.hist(sizes, bins=400)
plt.gca().set(title='Distribution of reviews by size - [{:5.2f}, {:5.2f}]'.format(min(sizes),max(sizes)),
xlabel='Size', ylabel='Density', xlim=[0,1500])
pwk.save_fig('01-stats-sizes')
plt.show()
```
%% Cell type:code id: tags:
``` python
unk=[ 100*(s.count(2)/len(s)) for s in x_train]
plt.figure(figsize=(16,6))
plt.hist(unk, bins=100)
plt.gca().set(title='Percent of unknown words - [{:5.2f}, {:5.2f}]'.format(min(unk),max(unk)),
xlabel='# unknown', ylabel='Density', xlim=[0,30])
pwk.save_fig('02-stats-unknown')
plt.show()
```
%% Cell type:markdown id: tags:
## Step 5 - Basic approach with "one-hot" vector encoding
Basic approach :
Each sentence is encoded with a **vector** whose length equals the **size of the dictionary**.
The value of each component is 0 if the word is not present in the sentence, and 1 if it is.
For a sentence s=[3,4,7] and a dictionary of 10 words...
We will have a vector v=[0,0,0,1,1,0,0,1,0,0]
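As a quick sketch of this idea (toy sizes, not the notebook's real encoder) :
%% Cell type:code id: tags:
``` python
# ---- A sentence s in a 10-word dictionary, encoded as a bag-of-words vector
import numpy as np

s = [3, 4, 7]                # word indices present in the sentence
v = np.zeros(10)             # one component per word of the dictionary
v[s] = 1                     # set the components of the present words
print(v)                     # -> [0. 0. 0. 1. 1. 0. 0. 1. 0. 0.]
```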
%% Cell type:markdown id: tags:
### 5.1 - Our one-hot encoder
%% Cell type:code id: tags:
``` python
def one_hot_encoder(x, vector_size=10000):
    # ---- Set all to 0
    #
    x_encoded = np.zeros((len(x), vector_size))
    # ---- For each sentence
    #
    for i,sentence in enumerate(x):
        for word in sentence:
            x_encoded[i, word] = 1.
    return x_encoded
```
%% Cell type:markdown id: tags:
### 5.2 - Encoding
%% Cell type:code id: tags:
``` python
x_train = one_hot_encoder(x_train)
x_test = one_hot_encoder(x_test)
print("To have a look, x_train[12] became :", x_train[12] )
```
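%% Cell type:markdown id: tags:
Note that this representation is memory hungry : with the default float64, 25 000 reviews of 10 000 components each take about 2 GB. A quick, optional check (a sketch added here, not part of the original notebook) :
%% Cell type:code id: tags:
``` python
# ---- Rough memory footprint of the one-hot encoded sets
print(f'x_train : about {x_train.nbytes/1e9:.1f} GB')
print(f'x_test  : about {x_test.nbytes/1e9:.1f} GB')
```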
%% Cell type:markdown id: tags:
## Step 6 - Build the model
A few remarks :
- Each review is now a simple vector of length **vocab_size**, so a small stack of **Dense** layers is enough
- The last layer is a single **sigmoid** neuron, giving the probability that the review is positive
More documentation about the layers used :
- [Dense](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense)
%% Cell type:code id: tags:
``` python
def get_model(vector_size=10000):
    model = keras.Sequential()
    model.add(keras.layers.Input( shape=(vector_size,) ))
    model.add(keras.layers.Dense( 32, activation='relu'))
    model.add(keras.layers.Dense( 32, activation='relu'))
    model.add(keras.layers.Dense( 1,  activation='sigmoid'))
    model.compile(optimizer = 'rmsprop',
                  loss      = 'binary_crossentropy',
                  metrics   = ['accuracy'])
    return model
```
%% Cell type:markdown id: tags:
## Step 7 - Train the model
### 7.1 - Get it
%% Cell type:code id: tags:
``` python
model = get_model(vector_size=vocab_size)
model.summary()
```
%% Cell type:markdown id: tags:
### 7.2 - Add callback
%% Cell type:code id: tags:
``` python
os.makedirs(f'{run_dir}/models', mode=0o750, exist_ok=True)
save_dir = f'{run_dir}/models/best_model.h5'
savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, save_best_only=True)
```
%% Cell type:markdown id: tags:
### 7.3 - Train it
%% Cell type:code id: tags:
``` python
%%time
history = model.fit(x_train,
y_train,
epochs = epochs,
batch_size = batch_size,
validation_data = (x_test, y_test),
verbose = 1,
callbacks = [savemodel_callback])
```
%% Cell type:markdown id: tags:
## Step 8 - Evaluate
### 8.1 - Training history
%% Cell type:code id: tags:
``` python
pwk.plot_history(history, save_as='02-history')
```
%% Cell type:markdown id: tags:
### 8.2 - Reload and evaluate best model
%% Cell type:code id: tags:
``` python
model = keras.models.load_model(f'{run_dir}/models/best_model.h5')
# ---- Evaluate
score = model.evaluate(x_test, y_test, verbose=0)
print('x_test / loss : {:5.4f}'.format(score[0]))
print('x_test / accuracy : {:5.4f}'.format(score[1]))
values=[score[1], 1-score[1]]
pwk.plot_donut(values,["Accuracy","Errors"], title="#### Accuracy donut is :", save_as='03-donut')
# ---- Confusion matrix
y_sigmoid = model.predict(x_test)
y_pred = y_sigmoid.copy()
y_pred[ y_sigmoid< 0.5 ] = 0
y_pred[ y_sigmoid>=0.5 ] = 1
pwk.display_confusion_matrix(y_test,y_pred,labels=range(2))
pwk.plot_confusion_matrix(y_test,y_pred,range(2), figsize=(8, 8),normalize=False, save_as='04-confusion-matrix')
```
%% Cell type:code id: tags:
``` python
pwk.end()
```
%% Cell type:markdown id: tags:
---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>
%% Cell type:markdown id: tags:
<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>
# <!-- TITLE --> [IMDB2] - Sentiment analysis with text embedding
<!-- DESC --> A classic example of word embedding with a dataset from Internet Movie Database (IMDB)
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->
## Objectives :
- The objective is to guess whether film reviews are **positive or negative** based on the analysis of the text.
- Understand the management of **textual data** and **sentiment analysis**
The original dataset can be found **[here](http://ai.stanford.edu/~amaas/data/sentiment/)**
Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)
For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)
## What we're going to do :
- Retrieve data
- Prepare the data
- Build a model
- Train the model
- Evaluate the result
%% Cell type:markdown id: tags:
## Step 1 - Import and init
### 1.1 - Python stuff
%% Cell type:code id: tags:
``` python
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets.imdb as imdb
import matplotlib.pyplot as plt
import matplotlib
import os,sys,h5py,json
from importlib import reload
sys.path.append('..')
import fidle.pwk as pwk
run_dir = './run/IMDB2'
datasets_dir = pwk.init('IMDB2', run_dir)
```
%% Cell type:markdown id: tags:
### 1.2 - Parameters
The words in the vocabulary are classified from the most frequent to the rarest.
`vocab_size` is the number of words we will remember in our vocabulary (the other words will be considered as unknown).
`hide_most_frequently` is the number of most frequent words that will be ignored
`review_len` is the maximum length of a review (longer reviews are truncated, shorter ones are padded)
`dense_vector_size` is the size of the generated dense vectors
`output_dir` is where we will save our dataset and dictionaries (./data is a good choice)
%% Cell type:code id: tags:
``` python
vocab_size = 10000
hide_most_frequently = 0
review_len = 256
dense_vector_size = 16
epochs = 30
batch_size = 512
output_dir = './data'
```
%% Cell type:markdown id: tags:
Override parameters (batch mode) - Just forget this cell
%% Cell type:code id: tags:
``` python
pwk.override('vocab_size', 'hide_most_frequently', 'review_len', 'dense_vector_size')
pwk.override('batch_size', 'epochs', 'output_dir')
```
%% Cell type:markdown id: tags:
## Step 2 - Retrieve data
The IMDB dataset can be retrieved directly from Keras - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)
Note : Due to their nature, textual data can be somewhat complex.
For more details about the management of this dataset, see notebook [IMDB1](01-One-hot-encoding.ipynb)
%% Cell type:markdown id: tags:
### 2.1 - Get dataset
%% Cell type:code id: tags:
``` python
(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words=vocab_size, skip_top=hide_most_frequently, seed= 42,)
y_train = np.asarray(y_train).astype('float32')
y_test = np.asarray(y_test ).astype('float32')
# ---- About
#
print("Max(x_train,x_test) : ", pwk.rmax([x_train,x_test]) )
print("Min(x_train,x_test) : ", pwk.rmin([x_train,x_test]) )
print("x_train : {} y_train : {}".format(x_train.shape, y_train.shape))
print("x_test : {} y_test : {}".format(x_test.shape, y_test.shape))
```
%% Cell type:markdown id: tags:
### 2.2 - Load dictionary
Not essential, but nice if you want to take a closer look at our reviews ;-)
%% Cell type:code id: tags:
``` python
# ---- Retrieve dictionary {word:index}, and encode it in ascii
#      Shift the dictionary by +3
#      Add <pad>, <start>, <unknown> and <undef> tags
# Create a reverse dictionary : {index:word}
#
word_index = imdb.get_word_index()
word_index = {w:(i+3) for w,i in word_index.items()}
word_index.update( {'<pad>':0, '<start>':1, '<unknown>':2, '<undef>':3,} )
index_word = {index:word for word,index in word_index.items()}
# ---- A nice function to transpose :
#
def dataset2text(review):
    return ' '.join([index_word.get(i, '?') for i in review])
```
%% Cell type:markdown id: tags:
## Step 3 - Preprocess the data (padding)
In order to be processed by a neural network, all entries must have the **same length.**
We chose a review length of **review_len**.
We will therefore pad shorter reviews (with \<pad\>, index 0) and truncate longer ones.
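A tiny illustration of what pad_sequences does (toy values; the real call follows) :
%% Cell type:code id: tags:
``` python
# ---- Two toy "reviews" padded / truncated to a length of 4, with 0 (<pad>) appended at the end
toy = keras.preprocessing.sequence.pad_sequences([[1, 7, 9], [1, 5, 2, 8, 3]],
                                                 value=0, padding='post', maxlen=4)
print(toy)    # -> [[1 7 9 0]
              #     [5 2 8 3]]   (truncation removes from the start by default)
```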
%% Cell type:code id: tags:
``` python
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
value = 0,
padding = 'post',
maxlen = review_len)
x_test = keras.preprocessing.sequence.pad_sequences(x_test,
value = 0 ,
padding = 'post',
maxlen = review_len)
pwk.subtitle('After padding :')
print(x_train[12])
```
%% Cell type:markdown id: tags:
**Save dataset and dictionaries (for future use, not mandatory)**
%% Cell type:code id: tags:
``` python
# ---- Write dataset in a h5 file, could be useful
#
pwk.mkdir(output_dir)
with h5py.File(f'{output_dir}/dataset_imdb.h5', 'w') as f:
    f.create_dataset("x_train", data=x_train)
    f.create_dataset("y_train", data=y_train)
    f.create_dataset("x_test",  data=x_test)
    f.create_dataset("y_test",  data=y_test)
with open(f'{output_dir}/word_index.json', 'w') as fp:
    json.dump(word_index, fp)
with open(f'{output_dir}/index_word.json', 'w') as fp:
    json.dump(index_word, fp)
print('Saved.')
```
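%% Cell type:markdown id: tags:
These files can be reloaded later like this (a small sketch, assuming the same output_dir; it is not needed for the rest of this notebook) :
%% Cell type:code id: tags:
``` python
# ---- Reload the h5 dataset and the json dictionary saved above
with h5py.File(f'{output_dir}/dataset_imdb.h5', 'r') as f:
    x_train_reloaded = f['x_train'][:]
    y_train_reloaded = f['y_train'][:]
with open(f'{output_dir}/word_index.json') as fp:
    word_index_reloaded = json.load(fp)
print(x_train_reloaded.shape, len(word_index_reloaded))
```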
%% Cell type:markdown id: tags:
## Step 4 - Build the model
Few remarks :
- We'll choose a dense vector size for the embedding output with **dense_vector_size**
- **GlobalAveragePooling1D** performs a pooling over the sequence dimension : (None, lx, ly) -> (None, ly)
In other words: we average the set of word vectors of a sentence
- The Keras embedding layer is trained in a supervised way. It is a layer going from *vocab_size* inputs to *n_neurons* outputs that maintains a table of vectors (the weights are the vectors). It does not compute an output the way a normal layer does, but returns the values of the vectors : n words => n vectors (which are then averaged by the pooling layer)
See : [a more detailed explanation (en)](https://stats.stackexchange.com/questions/324992/how-the-embedding-layer-is-trained-in-keras-embedding-layer)
as well as : [Sentiment detection with Keras](https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks)
More documentation about the layers used in this model :
- [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)
- [GlobalAveragePooling1D](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalAveragePooling1D)
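To make the "n words => n vectors => one averaged vector" idea concrete, here is a tiny standalone sketch (toy sizes, untrained weights; not part of the original notebook) :
%% Cell type:code id: tags:
``` python
import numpy as np
import tensorflow.keras as keras

# ---- A toy embedding : 10-word vocabulary, vectors of size 4
emb  = keras.layers.Embedding(input_dim=10, output_dim=4)
pool = keras.layers.GlobalAveragePooling1D()

x = np.array([[1, 5, 2, 0, 0, 0]])      # one padded sentence of 6 word indices
vectors         = emb(x)                # shape (1, 6, 4) : one vector per word
sentence_vector = pool(vectors)         # shape (1, 4)    : mean over the 6 words
print(vectors.shape, sentence_vector.shape)
```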
%% Cell type:code id: tags:
``` python
def get_model(vocab_size=10000, dense_vector_size=32, review_len=256):
    model = keras.Sequential()
    model.add(keras.layers.Input( shape=(review_len,) ))
    model.add(keras.layers.Embedding(input_dim    = vocab_size,
                                     output_dim   = dense_vector_size,
                                     input_length = review_len))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(dense_vector_size, activation='relu'))
    model.add(keras.layers.Dense(1,                 activation='sigmoid'))
    model.compile(optimizer = 'adam',
                  loss      = 'binary_crossentropy',
                  metrics   = ['accuracy'])
    return model
```
%% Cell type:markdown id: tags:
## Step 5 - Train the model
### 5.1 - Get it
%% Cell type:code id: tags:
``` python
model = get_model(vocab_size, dense_vector_size, review_len)
model.summary()
```
%% Cell type:markdown id: tags:
### 5.2 - Add callback
%% Cell type:code id: tags:
``` python
os.makedirs(f'{run_dir}/models', mode=0o750, exist_ok=True)
save_dir = f'{run_dir}/models/best_model.h5'
savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, save_best_only=True)
```
%% Cell type:markdown id: tags:
### 5.3 - Train it
%% Cell type:code id: tags:
``` python
%%time
history = model.fit(x_train,
y_train,
epochs = epochs,
batch_size = batch_size,
validation_data = (x_test, y_test),
verbose = 1,
callbacks = [savemodel_callback])
```
%% Cell type:markdown id: tags:
## Step 6 - Evaluate
### 6.1 - Training history
%% Cell type:code id: tags:
``` python
pwk.plot_history(history, save_as='02-history')
```
%% Cell type:markdown id: tags:
### 6.2 - Reload and evaluate best model
%% Cell type:code id: tags:
``` python
model = keras.models.load_model(f'{run_dir}/models/best_model.h5')
# ---- Evaluate
score = model.evaluate(x_test, y_test, verbose=0)
print('x_test / loss : {:5.4f}'.format(score[0]))
print('x_test / accuracy : {:5.4f}'.format(score[1]))
values=[score[1], 1-score[1]]
pwk.plot_donut(values,["Accuracy","Errors"], title="#### Accuracy donut is :", save_as='03-donut')
# ---- Confusion matrix
y_sigmoid = model.predict(x_test)
y_pred = y_sigmoid.copy()
y_pred[ y_sigmoid< 0.5 ] = 0
y_pred[ y_sigmoid>=0.5 ] = 1
pwk.display_confusion_matrix(y_test,y_pred,labels=range(2))
pwk.plot_confusion_matrix(y_test,y_pred,range(2), figsize=(8, 8),normalize=False, save_as='04-confusion-matrix')
```
%% Cell type:code id: tags:
``` python
pwk.end()
```
%% Cell type:markdown id: tags:
---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>
%% Cell type:markdown id: tags:
<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>
# <!-- TITLE --> [IMDB3] - Reload and reuse a saved model
<!-- DESC --> Retrieving a saved model to perform a sentiment analysis (movie review)
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->
## Objectives :
- The objective is to guess whether our personal film reviews are **positive or negative** based on the analysis of the text.
- For this, we will use our **previously saved model**.
The original dataset can be found **[here](http://ai.stanford.edu/~amaas/data/sentiment/)**
Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)
For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)
## What we're going to do :
- Prepare our data
- Retrieve our saved model
- Evaluate the result
%% Cell type:markdown id: tags:
## Step 1 - Init python stuff
%% Cell type:code id: tags:
``` python
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets.imdb as imdb
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import os,sys,h5py,json,re
from importlib import reload
sys.path.append('..')
import fidle.pwk as pwk
run_dir = './run/IMDB2'
datasets_dir = pwk.init('IMDB3', run_dir)
```
%% Cell type:markdown id: tags:
### 1.2 - Parameters
The words in the vocabulary are classified from the most frequent to the rarest.
`vocab_size` is the number of words we will remember in our vocabulary (the other words will be considered as unknown).
`review_len` is the review length
`dictionaries_dir` is where our dictionaries were saved. (./data is a good choice)
%% Cell type:code id: tags:
``` python
vocab_size       = 10000
review_len       = 256
dictionaries_dir = './data'
```
%% Cell type:markdown id: tags:
Override parameters (batch mode) - Just forget this cell
%% Cell type:code id: tags:
``` python
pwk.override('vocab_size', 'review_len', 'dictionaries_dir')
```
%% Cell type:markdown id: tags:
## Step 2 - Preparing the data
### 2.1 - Our reviews :
%% Cell type:code id: tags:
``` python
reviews = [ "This film is particularly nice, a must see.",
"Some films are great classics and cannot be ignored.",
"This film is a great classic that cannot be ignored.",
"I don't remember ever having seen such a movie...",
"This movie is just abominable and doesn't deserve to be seen!"]
```
%% Cell type:markdown id: tags:
### 2.2 - Retrieve dictionaries
Note : These dictionaries were generated by the [01-Embedding-Keras](01-Embedding-Keras.ipynb) notebook (IMDB2).
%% Cell type:code id: tags:
``` python
with open(f'{dictionaries_dir}/word_index.json', 'r') as fp:
    word_index = json.load(fp)
word_index = { w:int(i) for w,i in word_index.items() }
print('Loaded. ', len(word_index), 'entries in word_index' )
index_word = { i:w for w,i in word_index.items() }
print('Loaded. ', len(index_word), 'entries in index_word' )
```
%% Cell type:markdown id: tags:
### 2.3 - Clean, index and pad
Sentences are split into words, punctuation is removed, sentence length is limited and padding is added...
**Note** : 1 is "start" and 2 is "unknown"
%% Cell type:code id: tags:
``` python
nb_reviews = len(reviews)
x_data     = []
# ---- For all reviews
for review in reviews:
    print('Words are : ', end='')
    # ---- First index must be <start>
    index_review=[1]
    print('1 ', end='')
    # ---- For all words
    for w in review.split(' '):
        # ---- Clean it
        w_clean = re.sub(r"[^a-zA-Z0-9]", "", w)
        # ---- Not empty ?
        if len(w_clean)>0:
            # ---- Get the index (2 if the word is unknown or beyond vocab_size)
            w_index = word_index.get(w,2)
            if w_index>vocab_size : w_index=2
            # ---- Add the index
            index_review.append(w_index)
            print(f'{w_index} ', end='')
    # ---- Add the indexed review
    x_data.append(index_review)
    print()
# ---- Padding
x_data = keras.preprocessing.sequence.pad_sequences(x_data, value = 0, padding = 'post', maxlen = review_len)
```
%% Cell type:markdown id: tags:
### 2.4 - Have a look
%% Cell type:code id: tags:
``` python
def translate(x):
    return ' '.join( [index_word.get(i,'?') for i in x] )

for i in range(nb_reviews):
    imax=np.where(x_data[i]==0)[0][0]+5
    print(f'\nText review :', reviews[i])
    print(f'x_train[{i:}] :', list(x_data[i][:imax]), '(...)')
    print( 'Translation :', translate(x_data[i][:imax]), '(...)')
```
%% Output
Text review : This film is particularly nice, a must see.
x_train[0] : [1, 2, 22, 9, 572, 2, 6, 215, 2, 0, 0, 0, 0, 0] (...)
Translation : <start> <unknown> film is particularly <unknown> a must <unknown> <pad> <pad> <pad> <pad> <pad> (...)
Text review : Some films are great classics and cannot be ignored.
x_train[1] : [1, 2, 108, 26, 87, 2239, 5, 566, 30, 2, 0, 0, 0, 0, 0] (...)
Translation : <start> <unknown> films are great classics and cannot be <unknown> <pad> <pad> <pad> <pad> <pad> (...)
Text review : This movie is just abominable and doesn't deserve to be seen!
x_train[2] : [1, 2, 20, 9, 43, 2, 5, 152, 1833, 8, 30, 2, 0, 0, 0, 0, 0] (...)
Translation : <start> <unknown> movie is just <unknown> and doesn't deserve to be <unknown> <pad> <pad> <pad> <pad> <pad> (...)
%% Cell type:markdown id: tags:
## Step 3 - Bring back the model
%% Cell type:code id: tags:
``` python
model = keras.models.load_model(f'{run_dir}/models/best_model.h5')
```
%% Cell type:markdown id: tags:
## Step 4 - Predict
%% Cell type:code id: tags:
``` python
y_pred = model.predict(x_data)
```
%% Cell type:markdown id: tags:
#### And the winner is :
%% Cell type:code id: tags:
``` python
for i,review in enumerate(reviews):
    rate    = y_pred[i][0]
    opinion = 'NEGATIVE :-(' if rate<0.5 else 'POSITIVE :-)'
    print(f'{review:<70} => {rate:.2f} - {opinion}')
```
%% Output
This film is particularly nice, a must see. => POSITIVE (0.56)
Some films are great classics and cannot be ignored. => POSITIVE (0.63)
This movie is just abominable and doesn't deserve to be seen! => NEGATIVE (0.35)
%% Cell type:code id: tags:
``` python
pwk.end()
```
%% Output
End time is : Friday 18 December 2020, 18:21:50
Duration is : 00:00:01 555ms
This notebook ends here
%% Cell type:markdown id: tags:
---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>
%% Cell type:markdown id: tags:
<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>
# <!-- TITLE --> [IMDB4] - Reload embedded vectors
<!-- DESC --> Retrieving embedded vectors from our trained model
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->
## Objectives :
- The objective is to retrieve and visualize our embedded vectors
- For this, we will use our **previously saved model**.
## What we're going to do :
- Retrieve our saved model
- Extract our vectors
%% Cell type:markdown id: tags:
## Step 1 - Init python stuff
%% Cell type:code id: tags:
``` python
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets.imdb as imdb
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import os,sys,h5py,json,re
from importlib import reload
sys.path.append('..')
import fidle.pwk as pwk
run_dir = './run/IMDB2'
datasets_dir = pwk.init('IMDB4')
```
%% Cell type:markdown id: tags:
### 1.2 - Parameters
The words in the vocabulary are classified from the most frequent to the rarest.
`vocab_size` is the number of words we will remember in our vocabulary (the other words will be considered as unknown).
`review_len` is the review length
`dictionaries_dir` is where our dictionaries were saved. (./data is a good choice)
%% Cell type:code id: tags:
``` python
vocab_size = 10000
review_len = 256
dictionaries_dir = './data'
```
%% Cell type:markdown id: tags:
Override parameters (batch mode) - Just forget this cell
%% Cell type:code id: tags:
``` python
pwk.override('vocab_size', 'review_len', 'dictionaries_dir')
```
%% Cell type:markdown id: tags:
## Step 2 - Get the embedding vectors !
%% Cell type:markdown id: tags:
### 2.1 - Load model and dictionaries
%% Cell type:code id: tags:
``` python
model = keras.models.load_model(f'{run_dir}/models/best_model.h5')
print('Model loaded.')
with open(f'{dictionaries_dir}/index_word.json', 'r') as fp:
    index_word = json.load(fp)
index_word = { int(i):w for i,w in index_word.items() }
word_index = { w:int(i) for i,w in index_word.items() }
print('Dictionary loaded.')
```
%% Cell type:markdown id: tags:
### 2.2 - Retrieve embeddings
%% Cell type:code id: tags:
``` python
embeddings = model.layers[0].get_weights()[0]
print('Shape of embeddings : ',embeddings.shape)
```
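%% Cell type:markdown id: tags:
Note : depending on how the model was built, the Embedding layer may not be layers[0]. A slightly more defensive variant (a sketch, same result here) :
%% Cell type:code id: tags:
``` python
# ---- Find the Embedding layer by type rather than by position
emb_layer  = next(l for l in model.layers if isinstance(l, keras.layers.Embedding))
embeddings = emb_layer.get_weights()[0]
print('Shape of embeddings : ', embeddings.shape)
```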
%% Cell type:markdown id: tags:
### 2.3 - Build a nice dictionary
%% Cell type:code id: tags:
``` python
word_embedding = { index_word[i]:embeddings[i] for i in range(vocab_size) }
```
%% Cell type:markdown id: tags:
## Step 3 - Have a look !
%% Cell type:code id: tags:
``` python
word_embedding['nice']
```
%% Cell type:code id: tags:
``` python
def l2w(w1,w2):
    v1=word_embedding[w1]
    v2=word_embedding[w2]
    return np.linalg.norm(v2-v1)

def show_l2(w1,w2):
    print(f'\nL2 between [{w1}] and [{w2}] : ',l2w(w1,w2))

def neighbors(w1):
    v1=word_embedding[w1]
    dd={}
    for i in range(4, 1000):
        w2=index_word[i]
        dd[w2]=l2w(w1,w2)
    dd= {k: v for k, v in sorted(dd.items(), key=lambda item: item[1])}
    print(f'\nNeighbors of [{w1}] : ', list(dd.keys())[1:15])
```
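%% Cell type:markdown id: tags:
As a design note, cosine similarity is often preferred to the L2 distance for comparing embeddings, since it ignores vector length. A possible variant (a sketch, not in the original notebook) :
%% Cell type:code id: tags:
``` python
def cosine(w1, w2):
    # ---- Cosine similarity between the embeddings of two words
    v1, v2 = word_embedding[w1], word_embedding[w2]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

print('cos(nice, pleasant) :', cosine('nice', 'pleasant'))
print('cos(nice, horrible) :', cosine('nice', 'horrible'))
```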
%% Cell type:code id: tags:
``` python
show_l2('nice', 'pleasant')
show_l2('nice', 'horrible')
neighbors('horrible')
neighbors('great')
```
%% Cell type:code id: tags:
``` python
pwk.end()
```
%% Cell type:markdown id: tags:
---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>