Add detection of dataset dir

71bae5ca · Soraya Arias · 304dc413 · 71bae5ca
Commit 71bae5ca authored 5 years ago by Soraya Arias
--- a/IMDB/03-LSTM-Keras.ipynb
+++ b/IMDB/03-LSTM-Keras.ipynb
@@ -112,6 +112,9 @@
    "                                                       oov_char   = 2,\n",
    "                                                       index_from = 3, )\n",
    "# Or you can use the same pre-loaded dataset if at GRICAD or IDRIS\n",
+    "#place, dataset_dir = ooo.good_place( { 'GRICAD' : f'{os.getenv(\"SCRATCH_DIR\",\"\")}/PROJECTS/pr-fidle/datasets/IMDB',\n",
+    "#                                       'IDRIS'  : f'{os.getenv(\"WORK\",\"\")}/datasets/IMDB',\n",
+    "#                                       'HOME'   : f'{os.getenv(\"HOME\",\"\")}/datasets/IMDB'} )\n",
    "#with  h5py.File(f'{dataset_dir}/dataset_imdb.h5','r') as f:\n",
    "#        x_train = f['x_train'][:]\n",
    "#        y_train = f['y_train'][:]\n",

 %% Cell type:markdown id: tags:

 <img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>

 # <!-- TITLE --> [IMDB3] - Text embedding/LSTM model with IMDB
 <!-- DESC --> Still the same problem, but with a network combining embedding and LSTM
 <!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->

 ## Objectives :
 - The objective is to guess whether film reviews are **positive or negative** based on the analysis of the text.
 - Use of a model combining embedding and LSTM

 Original dataset can be found **[there](http://ai.stanford.edu/~amaas/data/sentiment/)**
 Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)
 For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)

 ## What we're going to do :

 - Retrieve data
 - Preparing the data
 - Build an Embedding/LSTM model
 - Train the model
 - Evaluate the result

 %% Cell type:markdown id: tags:

 ## Step 1 - Init python stuff

 %% Cell type:code id: tags:

 ``` python
 import numpy as np

 import tensorflow as tf
 import tensorflow.keras as keras
 import tensorflow.keras.datasets.imdb as imdb

 import matplotlib.pyplot as plt
 import matplotlib
 import seaborn as sns

 import os,sys,h5py,json

 from importlib import reload

 sys.path.append('..')
 import fidle.pwk as ooo

 ooo.init()
 ```

 %% Cell type:markdown id: tags:

 ## Step 2 - Retrieve data

 **From Keras :**
 This IMDb dataset can be retrieved directly from [Keras datasets](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)

 Due to their nature, textual data can be somewhat complex.

 ### 2.1 - Data structure :
 The dataset is composed of 2 parts: **reviews** and **opinions** (positive/negative),  with a **dictionary**

  - dataset = (reviews, opinions)
    - reviews = \[ review_0, review_1, ...\]
      - review_i = [ int1, int2, ...] where int_i is the index of the word in the dictionary.
    - opinions = \[ int0, int1, ...\] where int_j == 0 if opinion is negative or 1 if opinion is positive.
  - dictionary = \[ word1:int1, word2:int2, ... ]

 %% Cell type:markdown id: tags:

 ### 2.2 - Get dataset
 For simplicity, we will use a pre-formatted dataset.
 See : https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data

 However, Keras offers some useful tools for formatting textual data.
 See : https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text

 %% Cell type:code id: tags:

 ``` python
 # ---- Vocabulary size, passed to Keras as num_words; the same value is
 #      used as the Embedding input_dim when the model is built
 vocab_size = 10000

 # ----- Retrieve x,y
 #
 # Default choice : load the dataset directly from Keras (small size <20M)
 # Reserved indices : 1 = <start>, 2 = <unknown word>, and every word index
 # is shifted by 3 (index_from)
 (x_train, y_train), (x_test, y_test) = imdb.load_data( num_words  = vocab_size,
                                                        skip_top   = 0,
                                                        maxlen     = None,
                                                        seed       = 42,
                                                        start_char = 1,
                                                        oov_char   = 2,
                                                        index_from = 3, )
 # Or you can use the same pre-loaded dataset if at GRICAD or IDRIS
+#place, dataset_dir = ooo.good_place( { 'GRICAD' : f'{os.getenv("SCRATCH_DIR","")}/PROJECTS/pr-fidle/datasets/IMDB',
+#                                       'IDRIS'  : f'{os.getenv("WORK","")}/datasets/IMDB',
+#                                       'HOME'   : f'{os.getenv("HOME","")}/datasets/IMDB'} )
 #with  h5py.File(f'{dataset_dir}/dataset_imdb.h5','r') as f:
 #        x_train = f['x_train'][:]
 #        y_train = f['y_train'][:]
 #        x_test  = f['x_test'][:]
 #        y_test  = f['y_test'][:]
 ```

 %% Cell type:code id: tags:

 ``` python
 print("  Max(x_train,x_test)  : ", ooo.rmax([x_train,x_test]) )
 print("  x_train : {}  y_train : {}".format(x_train.shape, y_train.shape))
 print("  x_test  : {}  y_test  : {}".format(x_test.shape,  y_test.shape))

 print('\nReview example (x_train[12]) :\n\n',x_train[12])
 ```

 %% Cell type:markdown id: tags:

 ### 2.3 - Have a look for humans (optional)
 When we loaded the dataset, we asked for using \<start\> as 1, \<unknown word\> as 2
 So, we shifted the dataset by 3 with the parameter index_from=3

 %% Cell type:code id: tags:

 ``` python
 # ---- Retrieve the Keras IMDB dictionary {word:index}

 word_index = imdb.get_word_index()

 # ---- Shift every index by +3 (same offset as index_from=3 used at load time)

 word_index = dict( (word, rank+3) for word,rank in word_index.items() )

 # ---- Reserve indices 0, 1 and 2 for the <pad>, <start> and <unknown> tags

 word_index['<pad>']     = 0
 word_index['<start>']   = 1
 word_index['<unknown>'] = 2

 # ---- Create the reverse dictionary : {index:word}

 index_word = dict( (rank, word) for word,rank in word_index.items() )

 # ---- Add a nice function to transpose :
 #
 def dataset2text(review):
    """Translate a review given as a sequence of word indices into a string.

    Each index is looked up in the notebook-level `index_word` dictionary;
    an index missing from it is rendered as '?'. Words are joined with spaces.
    """
    words = (index_word.get(token, '?') for token in review)
    return ' '.join(words)
 ```

 %% Cell type:code id: tags:

 ``` python
 print('\nDictionary size     : ', len(word_index))
 print('\nReview example (x_train[12]) :\n\n',x_train[12])
 print('\nIn real words :\n\n', dataset2text(x_train[12]))
 ```

 %% Cell type:markdown id: tags:

 ### 2.4 - Have a look for neurons

 %% Cell type:code id: tags:

 ``` python
 # ---- Histogram (60 bins) of the number of word indices per training review
 plt.figure(figsize=(12, 6))
 # NOTE(review): sns.distplot is deprecated in recent seaborn releases
 # (histplot/displot are the suggested replacements) - confirm against the
 # installed seaborn version before switching
 ax=sns.distplot([len(i) for i in x_train],bins=60)
 ax.set_title('Distribution of reviews by size')
 plt.xlabel("Review's sizes")
 plt.ylabel('Density')
 # Restrict the x axis to sizes between 0 and 1500
 ax.set_xlim(0, 1500)
 plt.show()
 ```

 %% Cell type:markdown id: tags:

 ## Step 3 - Preprocess the data
 In order to be processed by an NN, all entries must have the same length.
 We chose a review length of **review_len**
 We will therefore complete them with a padding (of \<pad\>)

 %% Cell type:code id: tags:

 ``` python
 review_len = 256

 # ---- Same rules for both sets : every review is brought to review_len
 #      entries, the padding value being 0 (the <pad> index), added at the end
 pad_options = dict(value=0, padding='post', maxlen=review_len)

 x_train = keras.preprocessing.sequence.pad_sequences(x_train, **pad_options)
 x_test  = keras.preprocessing.sequence.pad_sequences(x_test,  **pad_options)

 print('\nReview example (x_train[12]) :\n\n',x_train[12])
 print('\nIn real words :\n\n', dataset2text(x_train[12]))
 ```

 %% Cell type:markdown id: tags:

 ### Save dataset and dictionary (can be useful but not mandatory if at GRICAD or IDRIS)

 %% Cell type:code id: tags:

 ``` python
 # ---- Make sure the output directory exists
 os.makedirs('./data',   mode=0o750, exist_ok=True)

 # ---- Reviews and labels, all in one HDF5 file
 with h5py.File('./data/dataset_imdb.h5', 'w') as f:
    for name, array in [("x_train", x_train), ("y_train", y_train),
                        ("x_test",  x_test),  ("y_test",  y_test)]:
        f.create_dataset(name, data=array)

 # ---- Both dictionaries, one JSON file each
 for filename, dictionary in [('./data/word_index.json', word_index),
                              ('./data/index_word.json', index_word)]:
    with open(filename, 'w') as fp:
        json.dump(dictionary, fp)

 print('Saved.')
 ```

 %% Cell type:markdown id: tags:

 ## Step 4 - Build the model
 Few remarks :
 1. We'll choose a dense vector size for the embedding output with **dense_vector_size**
 2. **GlobalAveragePooling1D** averages over the sequence dimension : (None, lx, ly) -> (None, ly)
 In other words: we average the set of vectors/words of a sentence
 3. The Keras embedding layer is trained in a supervised way. It is a layer going from *vocab_size* neurons to *n_neurons*, which maintains a table of vectors (the weights are the vectors). This layer does not compute an output the way regular layers do, but returns the values of the vectors. n words => n vectors (then stacked by the pooling)
 See : https://stats.stackexchange.com/questions/324992/how-the-embedding-layer-is-trained-in-keras-embedding-layer

 To follow up : https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks
 ### 4.1 - Build
 More documentation about :
 - [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)
 - [GlobalAveragePooling1D](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalAveragePooling1D)

 %% Cell type:code id: tags:

 ``` python
 def get_model(dense_vector_size=128):
    """Build and compile the Embedding + LSTM binary classifier.

    Reads the notebook-level `vocab_size` and `review_len` to size the
    Embedding layer.

    Args:
        dense_vector_size: output dimension of the Embedding layer.

    Returns:
        A compiled keras.Sequential model (adam optimizer,
        binary_crossentropy loss, accuracy metric).
    """
    embedding  = keras.layers.Embedding(input_dim    = vocab_size,
                                        output_dim   = dense_vector_size,
                                        input_length = review_len)
    recurrent  = keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)
    classifier = keras.layers.Dense(1, activation='sigmoid')

    model = keras.Sequential([embedding, recurrent, classifier])
    model.compile(optimizer = 'adam',
                  loss      = 'binary_crossentropy',
                  metrics   = ['accuracy'])
    return model
 ```

 %% Cell type:markdown id: tags:

 ## Step 5 - Train the model
 ### 5.1 - Get it

 %% Cell type:code id: tags:

 ``` python
 model = get_model()

 model.summary()
 ```

 %% Cell type:markdown id: tags:

 ### 5.2 - Add callback

 %% Cell type:code id: tags:

 ``` python
 os.makedirs('./run/models',   mode=0o750, exist_ok=True)
 save_dir = "./run/models/best_model.h5"
 savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, save_best_only=True)
 ```

 %% Cell type:markdown id: tags:

 ### 5.3 - Train it
 GPU : batch_size=512 : 305s

 %% Cell type:code id: tags:

 ``` python
 %%time

 n_epochs   = 10
 batch_size = 32

 history = model.fit(x_train,
                    y_train,
                    epochs          = n_epochs,
                    batch_size      = batch_size,
                    validation_data = (x_test, y_test),
                    verbose         = 1,
                    callbacks       = [savemodel_callback])
 ```

 %% Cell type:markdown id: tags:

 ## Step 6 - Evaluate
 ### 6.1 - Training history

 %% Cell type:code id: tags:

 ``` python
 ooo.plot_history(history)
 ```

 %% Cell type:markdown id: tags:

 ### 6.2 - Reload and evaluate best model

 %% Cell type:code id: tags:

 ``` python
 # ---- Reload the best checkpoint written by the ModelCheckpoint callback
 model = keras.models.load_model('./run/models/best_model.h5')

 # ---- Evaluate
 reload(ooo)
 score  = model.evaluate(x_test, y_test, verbose=0)

 print('x_test / loss      : {:5.4f}'.format(score[0]))
 print('x_test / accuracy  : {:5.4f}'.format(score[1]))

 values=[score[1], 1-score[1]]
 ooo.plot_donut(values,["Accuracy","Errors"], title="#### Accuracy donut is :")

 # ---- Confusion matrix
 #
 # Sequential.predict_classes() is deprecated and no longer exists in recent
 # TensorFlow releases. For a single sigmoid output it was equivalent to
 # thresholding the predicted probability at 0.5, which works with every version.

 y_pred   = (model.predict(x_test) > 0.5).astype('int32')

 ooo.display_confusion_matrix(y_test,y_pred,labels=range(2),color='orange',font_size='20pt')
 ```

 %% Cell type:markdown id: tags:

 ---
 <img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>