From 29406c698dfe8f68a32f303f147a23524e2862b0 Mon Sep 17 00:00:00 2001
From: Jean-Luc Parouty <Jean-Luc.Parouty@grenoble-inp.fr>
Date: Wed, 24 Feb 2021 08:56:34 +0100
Subject: [PATCH] Add one-hot encoding notebook

---
 IMDB/01-One-hot-encoding.ipynb | 617 ++++++++++++++++++
 ...g-Keras.ipynb => 02-Embedding-Keras.ipynb} | 0
 ...ipynb => 02-Embedding-Keras==done==.ipynb} | 0
 ...2-Prediction.ipynb => 03-Prediction.ipynb} | 0
 ...ne==.ipynb => 03-Prediction==done==.ipynb} | 0
 ...3-LSTM-Keras.ipynb => 04-LSTM-Keras.ipynb} | 0
 ...ne==.ipynb => 04-LSTM-Keras==done==.ipynb} | 0
 7 files changed, 617 insertions(+)
 create mode 100644 IMDB/01-One-hot-encoding.ipynb
 rename IMDB/{01-Embedding-Keras.ipynb => 02-Embedding-Keras.ipynb} (100%)
 rename IMDB/{01-Embedding-Keras==done==.ipynb => 02-Embedding-Keras==done==.ipynb} (100%)
 rename IMDB/{02-Prediction.ipynb => 03-Prediction.ipynb} (100%)
 rename IMDB/{02-Prediction==done==.ipynb => 03-Prediction==done==.ipynb} (100%)
 rename IMDB/{03-LSTM-Keras.ipynb => 04-LSTM-Keras.ipynb} (100%)
 rename IMDB/{03-LSTM-Keras==done==.ipynb => 04-LSTM-Keras==done==.ipynb} (100%)

diff --git a/IMDB/01-One-hot-encoding.ipynb b/IMDB/01-One-hot-encoding.ipynb
new file mode 100644
index 0000000..71e1d9b
--- /dev/null
+++ b/IMDB/01-One-hot-encoding.ipynb
@@ -0,0 +1,617 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img width=\"800px\" src=\"../fidle/img/00-Fidle-header-01.svg\"></img>\n",
+    "\n",
+    "# <!-- TITLE --> [IMDB1] - Sentiment analysis with one-hot encoding\n",
+    "<!-- DESC --> A basic example of sentiment analysis with sparse encoding, using a dataset from the Internet Movie Database (IMDB)\n",
+    "<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->\n",
+    "\n",
+    "## Objectives :\n",
+    " - Guess whether film reviews are **positive or negative**, based on the analysis of their text  \n",
+    " - Understand the management of **textual data** and **sentiment analysis**\n",
+    "\n",
+    "The original dataset can be found **[here](http://ai.stanford.edu/~amaas/data/sentiment/)**  \n",
+    "Note that [IMDb.com](https://imdb.com) offers several easy-to-use [datasets](https://www.imdb.com/interfaces/)  \n",
+    "For simplicity's sake, we'll use the dataset directly [embedded in Keras](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)\n",
+    "\n",
+    "## What we're going to do :\n",
+    "\n",
+    " - Retrieve the data\n",
+    " - Prepare the data\n",
+    " - Build a model\n",
+    " - Train the model\n",
+    " - Evaluate the result\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1 - Import and init\n",
+    "### 1.1 - Python stuff"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import tensorflow.keras as keras\n",
+    "import tensorflow.keras.datasets.imdb as imdb\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "import os,sys,h5py,json\n",
+    "from importlib import reload\n",
+    "\n",
+    "sys.path.append('..')\n",
+    "import fidle.pwk as pwk\n",
+    "\n",
+    "run_dir = './run/IMDB1'\n",
+    "datasets_dir = pwk.init('IMDB1', run_dir)"
+   ]
+  },
\n", + "`vocab_size` is the number of words we will remember in our vocabulary (the other words will be considered as unknown). \n", + "`hide_most_frequently` is the number of ignored words, among the most common ones " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_size = 10000\n", + "hide_most_frequently = 0\n", + "\n", + "epochs = 10\n", + "batch_size = 512" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Override parameters (batch mode) - Just forget this cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pwk.override('vocab_size', 'hide_most_frequently', 'batch_size', 'epochs')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2 - Understanding hot-one encoding\n", + "#### We have a **sentence** and a **dictionary** :" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"I've never seen a movie like this before\"\n", + "\n", + "dictionary = {\"a\":0, \"before\":1, \"fantastic\":2, \"i've\":3, \"is\":4, \"like\":5, \"movie\":6, \"never\":7, \"seen\":8, \"this\":9}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### We encode our sentence as a **numerical vector** :" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence_words = sentence.lower().split()\n", + "\n", + "sentence_vect = [ dictionary[w] for w in sentence_words ]\n", + "\n", + "print('Words sentence are : ', sentence_words)\n", + "print('Our vectorized sentence is : ', sentence_vect)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Next, we **one-hot** encode our vectorized sentence as a tensor :" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ---- We get a (sentence length x vector size) matrix of zeros\n", + "#\n", + "onehot = np.zeros( (10,8) )\n", + "\n", + "# ---- We set some 1 for each word\n", + "#\n", + "for i,w in enumerate(sentence_vect):\n", + " onehot[w,i]=1\n", + "\n", + "# --- Show it\n", + "#\n", + "print('In a basic way :\\n\\n', onehot, '\\n\\nWith a pandas wiew :\\n')\n", + "data={ f'{sentence_words[i]:.^10}':onehot[:,i] for i,w in enumerate(sentence_vect) }\n", + "df=pd.DataFrame(data)\n", + "df.index=dictionary.keys()\n", + "df.style.set_precision(0).highlight_max(axis=0).set_properties(**{'text-align': 'center'})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2 - Retrieve data\n", + "\n", + "IMDb dataset can bet get directly from Keras - see [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets) \n", + "Note : Due to their nature, textual data can be somewhat complex.\n", + "\n", + "### 2.1 - Data structure : \n", + "The dataset is composed of 2 parts: \n", + "\n", + " - **reviews**, this will be our **x**\n", + " - **opinions** (positive/negative), this will be our **y**\n", + "\n", + "There are also a **dictionary**, because words are indexed in reviews\n", + "\n", + "```\n", + "<dataset> = (<reviews>, <opinions>)\n", + "\n", + "with : <reviews> = [ <review1>, <review2>, ... ]\n", + " <opinions> = [ <rate1>, <rate2>, ... ] where <ratei> = integer\n", + "\n", + "where : <reviewi> = [ <w1>, <w2>, ...] 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3 - Retrieve data\n",
+    "\n",
+    "The IMDB dataset can be retrieved directly from Keras - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets)  \n",
+    "Note : Due to their nature, textual data can be somewhat complex.\n",
+    "\n",
+    "### 3.1 - Data structure :\n",
+    "The dataset is composed of 2 parts: \n",
+    "\n",
+    " - **reviews**, this will be our **x**\n",
+    " - **opinions** (positive/negative), this will be our **y**\n",
+    "\n",
+    "There is also a **dictionary**, because words are indexed in the reviews :\n",
+    "\n",
+    "```\n",
+    "<dataset> = (<reviews>, <opinions>)\n",
+    "\n",
+    "with :  <reviews>  = [ <review1>, <review2>, ... ]\n",
+    "        <opinions> = [ <rate1>, <rate2>, ... ]\n",
+    "\n",
+    "where : <reviewi> = [ <w1>, <w2>, ... ]  with <wi> = index (int) of the word in the dictionary\n",
+    "        <ratei>   = int, 0 for a negative opinion, 1 for a positive one\n",
+    "\n",
+    "<dictionary> = { <word1>:<w1>, <word2>:<w2>, ... }\n",
+    "\n",
+    "with :  <wordi> = word (str)\n",
+    "        <wi>    = index (int)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.2 - Load dataset\n",
+    "For simplicity, we will use a pre-formatted dataset - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data)  \n",
+    "However, Keras offers some useful tools for formatting textual data - see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text)  \n",
+    "\n",
+    "By default : \n",
+    " - The start of a sequence is marked with : 1\n",
+    " - Out of vocabulary words are encoded as : 2\n",
+    " - The first word index is : 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Retrieve x,y\n",
+    "#\n",
+    "(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words=vocab_size, skip_top=hide_most_frequently)\n",
+    "\n",
+    "y_train = np.asarray(y_train).astype('float32')\n",
+    "y_test  = np.asarray(y_test ).astype('float32')\n",
+    "\n",
+    "# ---- About\n",
+    "#\n",
+    "print(\"Max(x_train,x_test)  : \", pwk.rmax([x_train,x_test]) )\n",
+    "print(\"Min(x_train,x_test)  : \", pwk.rmin([x_train,x_test]) )\n",
+    "print(\"x_train : {}  y_train : {}\".format(x_train.shape, y_train.shape))\n",
+    "print(\"x_test  : {}  y_test  : {}\".format(x_test.shape,  y_test.shape))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4 - About our dataset\n",
+    "When we loaded the dataset, we asked for \\<start\\> to be encoded as 1 and \\<unknown word\\> as 2.  \n",
+    "The word indices are therefore shifted by 3, which is the default value of the `index_from` parameter.\n",
+    "\n",
+    "### 4.1 - Sentences encoding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('\\nReview example (x_train[12]) :\\n\\n',x_train[12])\n",
+    "print('\\nOpinions (y_train) :\\n\\n',y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.2 - Load dictionary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Retrieve the dictionary {word:index}\n",
+    "#\n",
+    "word_index = imdb.get_word_index()\n",
+    "\n",
+    "# ---- Shift the dictionary by +3\n",
+    "#\n",
+    "word_index = {w:(i+3) for w,i in word_index.items()}\n",
+    "\n",
+    "# ---- Add <pad>, <start> and <unknown> tags\n",
+    "#\n",
+    "word_index.update( {'<pad>':0, '<start>':1, '<unknown>':2} )\n",
+    "\n",
+    "# ---- Create a reverse dictionary : {index:word}\n",
+    "#\n",
+    "index_word = {index:word for word,index in word_index.items()} \n",
+    "\n",
+    "# ---- About dictionary\n",
+    "#\n",
+    "print('\\nDictionary size : ', len(word_index))\n",
+    "print('\\nSmall extract :\\n')\n",
+    "for k in range(440,455):print(f'    {k:2d} : {index_word[k]}' )\n",
+    "\n",
+    "# ---- Add a nice function to decode a review :\n",
+    "#\n",
+    "def dataset2text(review):\n",
+    "    return ' '.join([index_word.get(i, '?') for i in review])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.3 - Have a look (for humans)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pwk.subtitle('Review example :')\n",
+    "print(x_train[12])\n",
+    "pwk.subtitle('After translation :')\n",
+    "print(dataset2text(x_train[12]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.4 - Few statistics"
+   ]
+  },
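+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, the class balance - a quick sanity check, nothing below depends on it :"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Count reviews per label (0=negative, 1=positive)\n",
+    "#\n",
+    "labels, counts = np.unique(y_train, return_counts=True)\n",
+    "for l,c in zip(labels, counts):\n",
+    "    print(f'Label {l:.0f} : {c} reviews ({100*c/len(y_train):.1f}%)')"
+   ]
+  },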
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sizes=[len(i) for i in x_train]\n",
+    "plt.figure(figsize=(16,6))\n",
+    "plt.hist(sizes, bins=400)\n",
+    "plt.gca().set(title='Distribution of reviews by size - [{:d}, {:d}]'.format(min(sizes),max(sizes)), \n",
+    "              xlabel='Size', ylabel='Count', xlim=[0,1500])\n",
+    "pwk.save_fig('01-stats-sizes')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unk=[ 100*(s.count(2)/len(s)) for s in x_train]\n",
+    "plt.figure(figsize=(16,6))\n",
+    "plt.hist(unk, bins=100)\n",
+    "plt.gca().set(title='Percent of unknown words - [{:5.2f}, {:5.2f}]'.format(min(unk),max(unk)), \n",
+    "              xlabel='% unknown', ylabel='Count', xlim=[0,30])\n",
+    "pwk.save_fig('02-stats-unknown')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 5 - Basic approach with \"one-hot\" vector encoding\n",
+    "In this basic approach, each sentence is encoded with a **vector** of length equal to the **size of the dictionary**.  \n",
+    "\n",
+    "The value of each component is 1 if the word is present in the sentence, and 0 otherwise.\n",
+    "\n",
+    "For a sentence s=[3,4,7] and a dictionary of 10 words...  \n",
+    "We will have the vector v=[0,0,0,1,1,0,0,1,0,0]\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.1 - Our one-hot encoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def one_hot_encoder(x, vector_size=10000):\n",
+    "    \n",
+    "    # ---- Set all to 0\n",
+    "    #\n",
+    "    x_encoded = np.zeros((len(x), vector_size))\n",
+    "    \n",
+    "    # ---- For each sentence, set a 1 at each word index\n",
+    "    #\n",
+    "    for i,sentence in enumerate(x):\n",
+    "        for word in sentence:\n",
+    "            x_encoded[i, word] = 1.\n",
+    "\n",
+    "    return x_encoded"
+   ]
+  },
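+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can check it on the small example above (s=[3,4,7], a 10-word dictionary) :"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Encode the toy sentence s=[3,4,7] with vector_size=10\n",
+    "#\n",
+    "v = one_hot_encoder( [ [3,4,7] ], vector_size=10 )\n",
+    "\n",
+    "print('v =', v[0])"
+   ]
+  },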
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.2 - Encoding the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_train = one_hot_encoder(x_train)\n",
+    "x_test  = one_hot_encoder(x_test)\n",
+    "\n",
+    "print(\"To have a look, x_train[12] became :\", x_train[12] )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 6 - Build the model\n",
+    "A few remarks :\n",
+    " - Each review is now a vector of size *vocab_size*, so the input layer expects vectors of that size\n",
+    " - Two small **Dense** layers with **relu** activation are enough for this bag-of-words representation\n",
+    " - The output is a single **sigmoid** neuron giving the probability that the review is positive, hence the **binary_crossentropy** loss\n",
+    "\n",
+    "More sophisticated approaches (trainable embeddings, LSTM) are covered in the next notebooks.  \n",
+    "See also : [Sentiment detection with Keras](https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks)\n",
+    "\n",
+    "More documentation about the layers used :\n",
+    " - [Dense](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_model(vector_size=10000):\n",
+    "    \n",
+    "    model = keras.Sequential()\n",
+    "    model.add(keras.layers.Dense(32, activation='relu', input_shape=(vector_size,)))\n",
+    "    model.add(keras.layers.Dense(32, activation='relu'))\n",
+    "    model.add(keras.layers.Dense(1,  activation='sigmoid'))\n",
+    "    \n",
+    "    model.compile(optimizer = 'rmsprop',\n",
+    "                  loss      = 'binary_crossentropy',\n",
+    "                  metrics   = ['accuracy'])\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 7 - Train the model\n",
+    "### 7.1 - Get it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = get_model(vector_size=vocab_size)\n",
+    "\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7.2 - Add callback"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.makedirs('./run/models', mode=0o750, exist_ok=True)\n",
+    "save_dir = \"./run/models/best_model.h5\"\n",
+    "savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath=save_dir, verbose=0, save_best_only=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7.3 - Train it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "history = model.fit(x_train,\n",
+    "                    y_train,\n",
+    "                    epochs          = epochs,\n",
+    "                    batch_size      = batch_size,\n",
+    "                    validation_data = (x_test, y_test),\n",
+    "                    verbose         = 1,\n",
+    "                    callbacks       = [savemodel_callback])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 8 - Evaluate\n",
+    "### 8.1 - Training history"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pwk.plot_history(history, save_as='02-history')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 8.2 - Reload and evaluate best model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = keras.models.load_model('./run/models/best_model.h5')\n",
+    "\n",
+    "# ---- Evaluate\n",
+    "score = model.evaluate(x_test, y_test, verbose=0)\n",
+    "\n",
+    "print('x_test / loss     : {:5.4f}'.format(score[0]))\n",
+    "print('x_test / accuracy : {:5.4f}'.format(score[1]))\n",
+    "\n",
+    "values=[score[1], 1-score[1]]\n",
+    "pwk.plot_donut(values,[\"Accuracy\",\"Errors\"], title=\"#### Accuracy donut is :\", save_as='03-donut')\n",
+    "\n",
+    "# ---- Confusion matrix\n",
+    "\n",
+    "y_sigmoid = model.predict(x_test)\n",
+    "\n",
+    "y_pred = y_sigmoid.copy()\n",
+    "y_pred[ y_sigmoid< 0.5 ] = 0\n",
+    "y_pred[ y_sigmoid>=0.5 ] = 1\n",
+    "\n",
+    "pwk.display_confusion_matrix(y_test,y_pred,labels=range(2))\n",
+    "pwk.plot_confusion_matrix(y_test,y_pred,range(2), figsize=(8, 8),normalize=False, save_as='04-confusion-matrix')"
+   ]
+  },
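+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 8.3 - Try it on a review of our own\n",
+    "A minimal sketch : `my_review` below is an invented example, encoded the same way as the dataset (a \\<start\\> tag, then word indices, with out-of-vocabulary words mapped to 2)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- An invented review\n",
+    "#\n",
+    "my_review = \"this film is fantastic i have never seen anything like it before\"\n",
+    "\n",
+    "# ---- Encode it like the dataset : <start>, then word indices (unknown or beyond vocab_size -> 2)\n",
+    "#\n",
+    "indices = [ word_index.get(w,2) for w in my_review.lower().split() ]\n",
+    "encoded = [1] + [ i if i<vocab_size else 2 for i in indices ]\n",
+    "\n",
+    "# ---- One-hot encode and predict\n",
+    "#\n",
+    "x = one_hot_encoder([encoded], vector_size=vocab_size)\n",
+    "y = model.predict(x)\n",
+    "\n",
+    "print('Review     : ', my_review)\n",
+    "print('Prediction : {:5.4f}  ({})'.format(y[0][0], 'positive' if y[0][0]>0.5 else 'negative'))"
+   ]
+  },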
\n", + "\n", + "pwk.display_confusion_matrix(y_test,y_pred,labels=range(2))\n", + "pwk.plot_confusion_matrix(y_test,y_pred,range(2), figsize=(8, 8),normalize=False, save_as='04-confusion-matrix')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pwk.end()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "<img width=\"80px\" src=\"../fidle/img/00-Fidle-logo-01.svg\"></img>" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/IMDB/01-Embedding-Keras.ipynb b/IMDB/02-Embedding-Keras.ipynb similarity index 100% rename from IMDB/01-Embedding-Keras.ipynb rename to IMDB/02-Embedding-Keras.ipynb diff --git a/IMDB/01-Embedding-Keras==done==.ipynb b/IMDB/02-Embedding-Keras==done==.ipynb similarity index 100% rename from IMDB/01-Embedding-Keras==done==.ipynb rename to IMDB/02-Embedding-Keras==done==.ipynb diff --git a/IMDB/02-Prediction.ipynb b/IMDB/03-Prediction.ipynb similarity index 100% rename from IMDB/02-Prediction.ipynb rename to IMDB/03-Prediction.ipynb diff --git a/IMDB/02-Prediction==done==.ipynb b/IMDB/03-Prediction==done==.ipynb similarity index 100% rename from IMDB/02-Prediction==done==.ipynb rename to IMDB/03-Prediction==done==.ipynb diff --git a/IMDB/03-LSTM-Keras.ipynb b/IMDB/04-LSTM-Keras.ipynb similarity index 100% rename from IMDB/03-LSTM-Keras.ipynb rename to IMDB/04-LSTM-Keras.ipynb diff --git a/IMDB/03-LSTM-Keras==done==.ipynb b/IMDB/04-LSTM-Keras==done==.ipynb similarity index 100% rename from IMDB/03-LSTM-Keras==done==.ipynb rename to IMDB/04-LSTM-Keras==done==.ipynb -- GitLab