Commit 770b04de authored by Soraya Arias

Add path to word index json file

parent ce31be12
%% Cell type:markdown id: tags:
<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>
# <!-- TITLE --> [IMDB2] - Text embedding with IMDB - Reloaded
<!-- DESC --> Example of reusing a previously saved model
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->
## Objectives :
- The objective is to guess whether film reviews are **positive or negative** based on the analysis of the text.
- For this, we will use our **previously saved model**.
The original dataset can be found **[here](http://ai.stanford.edu/~amaas/data/sentiment/)**.
Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/).
For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets) (a short loading sketch is given after the outline below).
## What we're going to do :
- Preparing the data
- Retrieve our saved model
- Evaluate the result
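%% Cell type:markdown id: tags:
For reference, here is a minimal sketch of how the Keras-embedded IMDB dataset and its word index can be loaded. This notebook does not rerun that step: we reload a previously saved word index and model instead.
%% Cell type:code id: tags:
``` python
# ---- Illustration only : loading the Keras-embedded IMDB dataset
#      (this notebook reloads a saved word index and model instead)
import tensorflow.keras.datasets.imdb as imdb

# Reviews come back already indexed; num_words limits the vocabulary size
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# The raw word -> index dictionary used by Keras
raw_word_index = imdb.get_word_index()
```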
%% Cell type:markdown id: tags:
## Step 1 - Init python stuff
%% Cell type:code id: tags:
``` python
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets.imdb as imdb
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import os,sys,h5py,json,re
from importlib import reload
sys.path.append('..')
import fidle.pwk as ooo
ooo.init()
```
%% Output
FIDLE 2020 - Practical Work Module
Version : 0.2.9
Run time : Wednesday 19 February 2020, 22:08:28
TensorFlow version : 2.0.0
Keras version : 2.2.4-tf
%% Cell type:markdown id: tags:
## Step 2 - Preparing the data
### 2.1 - Our reviews :
%% Cell type:code id: tags:
``` python
reviews = [ "This film is particularly nice, a must see.",
"Some films are great classics and cannot be ignored.",
"This movie is just abominable and doesn't deserve to be seen!"]
```
%% Cell type:markdown id: tags:
### 2.2 - Retrieve dictionaries
%% Cell type:code id: tags:
``` python
place, dataset_dir = ooo.good_place( { 'GRICAD' : f'{os.getenv("SCRATCH_DIR","")}/PROJECTS/pr-fidle/datasets/IMDB',
                                       'IDRIS'  : f'{os.getenv("WORK","")}/datasets/IMDB',
                                       'HOME'   : f'{os.getenv("HOME","")}/datasets/IMDB'} )

with open(dataset_dir+'/word_index.json', 'r') as fp:
    word_index = json.load(fp)

index_word = {index:word for word,index in word_index.items()}
```
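%% Cell type:markdown id: tags:
As a quick optional sanity check, we can look up a few entries, assuming the saved index includes the reserved tokens `<pad>`, `<start>` and `<unknown>` used for translation below:
%% Cell type:code id: tags:
``` python
# ---- Optional sanity check on the reloaded dictionaries
#      (assumes the reserved tokens <pad>=0, <start>=1, <unknown>=2 are present)
for token in ['<pad>', '<start>', '<unknown>', 'film', 'great']:
    print(f'{token:<12} -> {word_index.get(token, "not found")}')
print('Dictionary size :', len(word_index))
```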
%% Cell type:markdown id: tags:
### 2.3 - Clean, index and pad
%% Cell type:code id: tags:
``` python
max_len    = 256
vocab_size = 10000

nb_reviews = len(reviews)
x_data     = []

# ---- For all reviews
for review in reviews:
    # ---- First index must be <start>
    index_review = [1]
    # ---- For all words
    for w in review.split(' '):
        # ---- Clean it
        w_clean = re.sub(r"[^a-zA-Z0-9]", "", w)
        # ---- Not empty ?
        if len(w_clean)>0:
            # ---- Get the index (out-of-vocabulary words get the <unknown> index 2)
            w_index = word_index.get(w,2)
            if w_index>vocab_size : w_index=2
            # ---- Add the index
            index_review.append(w_index)
    # ---- Add the indexed review
    x_data.append(index_review)

# ---- Padding
x_data = keras.preprocessing.sequence.pad_sequences(x_data, value=0, padding='post', maxlen=max_len)
```
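%% Cell type:markdown id: tags:
To illustrate what the padding step does, here is a tiny standalone example (the values are arbitrary, for illustration only): sequences shorter than `maxlen` are completed with `value` on the right (`padding='post'`), longer ones are truncated.
%% Cell type:code id: tags:
``` python
# ---- Illustration only : effect of pad_sequences on two arbitrary sequences
demo = keras.preprocessing.sequence.pad_sequences([[1, 7, 3], [1, 5]],
                                                  value=0, padding='post', maxlen=5)
print(demo)
# [[1 7 3 0 0]
#  [1 5 0 0 0]]
```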
%% Cell type:markdown id: tags:
### 2.4 - Have a look
%% Cell type:code id: tags:
``` python
def translate(x):
    return ' '.join( [index_word.get(i,'?') for i in x] )

for i in range(nb_reviews):
    imax = np.where(x_data[i]==0)[0][0]+5
    print(f'\nText review :', reviews[i])
    print(f'x_train[{i:}] :', list(x_data[i][:imax]), '(...)')
    print( 'Translation :', translate(x_data[i][:imax]), '(...)')
```
%% Output
Text review : This film is particularly nice, a must see.
x_train[0] : [1, 2, 22, 9, 572, 2, 6, 215, 2, 0, 0, 0, 0, 0] (...)
Translation : <start> <unknown> film is particularly <unknown> a must <unknown> <pad> <pad> <pad> <pad> <pad> (...)
Text review : Some films are great classics and cannot be ignored.
x_train[1] : [1, 2, 108, 26, 87, 2239, 5, 566, 30, 2, 0, 0, 0, 0, 0] (...)
Translation : <start> <unknown> films are great classics and cannot be <unknown> <pad> <pad> <pad> <pad> <pad> (...)
Text review : This movie is just abominable and doesn't deserve to be seen!
x_train[2] : [1, 2, 20, 9, 43, 2, 5, 152, 1833, 8, 30, 2, 0, 0, 0, 0, 0] (...)
Translation : <start> <unknown> movie is just <unknown> and doesn't deserve to be <unknown> <pad> <pad> <pad> <pad> <pad> (...)
%% Cell type:markdown id: tags:
## Step 3 - Bring back the model
%% Cell type:code id: tags:
``` python
model = keras.models.load_model('./run/models/best_model.h5')
```
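%% Cell type:markdown id: tags:
Optionally, we can check that the reloaded model is the expected one (a quick inspection, not part of the original run):
%% Cell type:code id: tags:
``` python
# ---- Optional check : display the architecture of the reloaded model
model.summary()
```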
%% Cell type:markdown id: tags:
## Step 4 - Predict
%% Cell type:code id: tags:
``` python
y_pred = model.predict(x_data)
```
%% Cell type:markdown id: tags:
#### And the winner is :
%% Cell type:code id: tags:
``` python
for i in range(nb_reviews):
    print(f'\n{reviews[i]:<70} =>', ('NEGATIVE' if y_pred[i][0]<0.5 else 'POSITIVE'), f'({y_pred[i][0]:.2f})')
```
%% Output
This film is particularly nice, a must see. => POSITIVE (0.54)
Some films are great classics and cannot be ignored. => POSITIVE (0.61)
This movie is just abominable and doesn't deserve to be seen! => NEGATIVE (0.33)
%% Cell type:markdown id: tags:
---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>