From 0814d09760fdc51e7d93ad91174f3488568ce41a Mon Sep 17 00:00:00 2001
From: Jean-Luc Parouty <Jean-Luc.Parouty@simap.grenoble-inp.fr>
Date: Wed, 26 Jan 2022 15:24:54 +0100
Subject: [PATCH] Add Transformers :-)

---
 DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb   |  18 +-
 DCGAN/modules/models/DCGAN.py          |  23 +-
 README.ipynb                           |  20 +-
 README.md                              |  10 +-
 Transformers/01-Distilbert.ipynb       | 529 +++++++++++++++++++++++++
 Transformers/02-distilbert_colab.ipynb | 487 +++++++++++++++++++++++
 fidle/01-update-index.ipynb            |   4 +-
 fidle/ci/default.yml                   |  14 +-
 fidle/config.py                        |   2 +-
 fidle/logs/catalog.json                |  20 +-
 10 files changed, 1083 insertions(+), 44 deletions(-)
 create mode 100755 Transformers/01-Distilbert.ipynb
 create mode 100755 Transformers/02-distilbert_colab.ipynb

diff --git a/DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb b/DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb
index e726589..b6470ec 100644
--- a/DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb
+++ b/DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "<img width=\"800px\" src=\"../fidle/img/00-Fidle-header-01.svg\"></img>\n",
     "\n",
-    "# <!-- TITLE --> [DCGAN01] - A first DCGAN to Draw a Sheep\n",
+    "# <!-- TITLE --> [SHEEP1] - A first DCGAN to Draw a Sheep\n",
     "<!-- DESC --> Episode 1 : Draw me a sheep, revisited with a DCGAN\n",
     "<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->\n",
     "\n",
@@ -198,17 +198,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# inputs = keras.Input(shape=(latent_dim,))\n",
-    "# x = layers.Dense(7 * 7 * 64)(inputs)\n",
-    "# x = layers.Reshape((7, 7, 64))(x)\n",
-    "# x = layers.Conv2DTranspose(128, kernel_size=3, strides=2, padding=\"same\")(x)\n",
-    "# x = layers.LeakyReLU(alpha=0.2)(x)\n",
-    "# x = layers.Conv2DTranspose(256, kernel_size=3, strides=2, padding=\"same\")(x)\n",
-    "# x = layers.LeakyReLU(alpha=0.2)(x)\n",
-    "# outputs = layers.Conv2D(1, kernel_size=5, padding=\"same\", activation=\"sigmoid\")(x)\n",
-    "\n",
-    "# ---- Using upsampling2D give better images ;-)\n",
-    "\n",
     "inputs = keras.Input(shape=(latent_dim,))\n",
     "x = layers.Dense(7 * 7 * 64)(inputs)\n",
     "x = layers.Reshape((7, 7, 64))(x)\n",
@@ -219,8 +208,7 @@
     "outputs = layers.Conv2D(1, kernel_size=5, strides=1, padding=\"same\", activation=\"sigmoid\")(x)\n",
     "\n",
     "generator = keras.Model(inputs, outputs, name=\"generator\")\n",
-    "generator.summary()\n",
-    "\n"
+    "generator.summary()"
    ]
   },
  {
@@ -237,7 +225,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!rm $run_dir/images/*.jpg >/dev/null 2>&1 "
+    "# !rm $run_dir/images/*.jpg >/dev/null 2>&1 "
    ]
   },
  {
diff --git a/DCGAN/modules/models/DCGAN.py b/DCGAN/modules/models/DCGAN.py
index a7bcbab..17a1bef 100644
--- a/DCGAN/modules/models/DCGAN.py
+++ b/DCGAN/modules/models/DCGAN.py
@@ -111,13 +111,13 @@ class DCGAN(keras.Model):
         generated_images = self.generator(random_latent_vectors)
 
         # Combine them with real images
-        combined_images = tf.concat([generated_images, real_images], axis=0)
+        combined_images = tf.concat( [generated_images, real_images], axis=0)
 
         # Creation of labels corresponding to real or fake images
-        labels = tf.concat(
-            [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0
-        )
-        # Add random noise to the labels - important trick!
+        # 1 is generated, 0 is real
+        labels = tf.concat( [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0)
+
+        # Add random noise to the labels - important trick!
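+        # (the noise keeps the discriminator from becoming over-confident,
+        #  which would otherwise starve the generator of useful gradients)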
         labels += 0.05 * tf.random.uniform(tf.shape(labels))
 
         # ---- Train the discriminator -----------------------------
@@ -153,6 +153,8 @@ class DCGAN(keras.Model):
         # ---- Train the generator ---------------------------------
         # ----------------------------------------------------------
         # We should *not* update the weights of the discriminator!
+        # We will train our generator so that the generated images
+        # are classified as real by the discriminator
         #
         # ---- Forward pass
         # Run the forward pass and record operations with the GradientTape.
@@ -186,14 +188,7 @@ class DCGAN(keras.Model):
             "g_loss": self.g_loss_metric.result(),
         }
 
-
-    # def predict(self,inputs):
-    #     '''Our predict function...'''
-    #     z_mean, z_var, z = self.encoder.predict(inputs)
-    #     outputs = self.decoder.predict(z)
-    #     return outputs
-
-
+    
     def save(self,filename):
         '''Save model in 2 part'''
         save_dir = os.path.dirname(filename)
@@ -217,7 +212,7 @@ class DCGAN(keras.Model):
     @classmethod
     def about(cls):
         '''Basic whoami method'''
-        display(Markdown('<br>**FIDLE 2021 - DCGAN**'))
+        display(Markdown('<br>**FIDLE 2022 - DCGAN**'))
         print('Version :', cls.version)
         print('TensorFlow version :', tf.__version__)
         print('Keras version :', tf.keras.__version__)
diff --git a/README.ipynb b/README.ipynb
index 2cf30cf..f7e4054 100644
--- a/README.ipynb
+++ b/README.ipynb
@@ -3,13 +3,13 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-  "id": "45cecdf8",
+  "id": "3f8b217d",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2022-01-17T09:57:30.406301Z",
-    "iopub.status.busy": "2022-01-17T09:57:30.402848Z",
-    "iopub.status.idle": "2022-01-17T09:57:30.418558Z",
-    "shell.execute_reply": "2022-01-17T09:57:30.418866Z"
+    "iopub.execute_input": "2022-01-26T14:19:05.708967Z",
+    "iopub.status.busy": "2022-01-26T14:19:05.705058Z",
+    "iopub.status.idle": "2022-01-26T14:19:05.717400Z",
+    "shell.execute_reply": "2022-01-26T14:19:05.717043Z"
    },
    "jupyter": {
     "source_hidden": true
@@ -52,7 +52,7 @@
    "[<img width=\"200px\" style=\"vertical-align:middle\" src=\"fidle/img/00-Mail_contact.svg\"></img>](#top)\n",
    "\n",
    "Current Version : <!-- VERSION_BEGIN -->\n",
-    "**2.0.32**\n",
+    "**2.0.33**\n",
    "<!-- VERSION_END -->\n",
    "\n",
    "\n",
@@ -137,6 +137,12 @@
    "- **[SYNOP3](SYNOP/SYNOP3-12h-predictions.ipynb)** - [12h predictions](SYNOP/SYNOP3-12h-predictions.ipynb) \n",
    "Episode 3: Attempt to predict in a more longer term \n",
    "\n",
+    "### Sentiment analysis with transformers\n",
+    "- **[TRANS1](Transformers/01-Distilbert.ipynb)** - [IMDB, Sentiment analysis with Transformers ](Transformers/01-Distilbert.ipynb) \n",
+    "Using a Transformer to perform a sentiment analysis (IMDB) - Jean Zay version\n",
+    "- **[TRANS2](Transformers/02-distilbert_colab.ipynb)** - [IMDB, Sentiment analysis with Transformers ](Transformers/02-distilbert_colab.ipynb) \n",
+    "Using a Transformer to perform a sentiment analysis (IMDB) - Colab version\n",
+    "\n",
    "### Unsupervised learning with an autoencoder neural network (AE)\n",
    "- **[AE1](AE/01-Prepare-MNIST-dataset.ipynb)** - [Prepare a noisy MNIST dataset](AE/01-Prepare-MNIST-dataset.ipynb) \n",
    "Episode 1: Preparation of a noisy MNIST dataset\n",
@@ -170,7 +176,7 @@
    "Bash script for SLURM batch submission of VAE8 notebooks \n",
    "\n",
    "### Generative Adversarial Networks (GANs)\n",
-    "- **[DCGAN01](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb)** - [A first DCGAN to Draw a Sheep](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb) \n",
+    "- **[SHEEP1](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb)** - [A first DCGAN to Draw a Sheep](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb) \n",
    "Episode 1 : Draw me a sheep, revisited with a DCGAN\n",
    "\n",
    "### Miscellaneous\n",
diff --git a/README.md b/README.md
index 305ba7d..5c48bf6 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ For more information, you can contact us at :
 [<img width="200px" style="vertical-align:middle" src="fidle/img/00-Mail_contact.svg"></img>](#top)
 
 Current Version : <!-- VERSION_BEGIN -->
-**2.0.32**
+**2.0.33**
 <!-- VERSION_END -->
 
 
@@ -116,6 +116,12 @@ Episode 2 : RNN training session for weather prediction attempt at 3h
 - **[SYNOP3](SYNOP/SYNOP3-12h-predictions.ipynb)** - [12h predictions](SYNOP/SYNOP3-12h-predictions.ipynb) 
 Episode 3: Attempt to predict in a more longer term 
 
+### Sentiment analysis with transformers
+- **[TRANS1](Transformers/01-Distilbert.ipynb)** - [IMDB, Sentiment analysis with Transformers ](Transformers/01-Distilbert.ipynb) 
+Using a Transformer to perform a sentiment analysis (IMDB) - Jean Zay version
+- **[TRANS2](Transformers/02-distilbert_colab.ipynb)** - [IMDB, Sentiment analysis with Transformers ](Transformers/02-distilbert_colab.ipynb) 
+Using a Transformer to perform a sentiment analysis (IMDB) - Colab version
+
 ### Unsupervised learning with an autoencoder neural network (AE)
 - **[AE1](AE/01-Prepare-MNIST-dataset.ipynb)** - [Prepare a noisy MNIST dataset](AE/01-Prepare-MNIST-dataset.ipynb) 
 Episode 1: Preparation of a noisy MNIST dataset
@@ -149,7 +155,7 @@ Episode 5 : Exploring latent space to generate new data
 Bash script for SLURM batch submission of VAE8 notebooks 
 
 ### Generative Adversarial Networks (GANs)
-- **[DCGAN01](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb)** - [A first DCGAN to Draw a Sheep](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb) 
+- **[SHEEP1](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb)** - [A first DCGAN to Draw a Sheep](DCGAN/01-DCGAN-Draw-me-a-sheep.ipynb) 
 Episode 1 : Draw me a sheep, revisited with a DCGAN
 
 ### Miscellaneous
diff --git a/Transformers/01-Distilbert.ipynb b/Transformers/01-Distilbert.ipynb
new file mode 100755
index 0000000..2a598f8
--- /dev/null
+++ b/Transformers/01-Distilbert.ipynb
@@ -0,0 +1,529 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img width=\"800px\" src=\"../fidle/img/00-Fidle-header-01.svg\"></img>\n",
+    "\n",
+    "# <!-- TITLE --> [TRANS1] - IMDB, Sentiment analysis with Transformers \n",
+    "<!-- DESC --> Using a Transformer to perform a sentiment analysis (IMDB) - Jean Zay version\n",
+    "<!-- AUTHOR : Hatim Bourfoune (IDRIS) and Nathan Cassereau (IDRIS) -->\n",
+    "\n",
+    "By : Hatim Bourfoune (IDRIS) and Nathan Cassereau (IDRIS)\n",
+    "\n",
+    "\n",
+    "## Objectives :\n",
+    " - Fine-tune a pretrained Transformer to perform a sentiment analysis\n",
+    " - Understand the use of a pre-trained transformer\n",
+    "\n",
+    "This task is exactly the same as the sentiment analysis with text embedding. Only this time, \n",
+    "we are going to exploit the strength of transformers. Considering how computation-heavy transformer \n",
+    "pretraining is, we are going to use a pretrained BERT model from HuggingFace. \n",
+    "This notebook performs the fine-tuning process. 
If possible, try to use a GPU to speed up \n",
+    "the training, as transformers are difficult to train on CPU.\n",
+    "\n",
+    "## What we are going to do:\n",
+    "\n",
+    "* Retrieve the dataset\n",
+    "* Prepare the dataset\n",
+    "* Fetch a pretrained BERT model from HuggingFace's platform (https://huggingface.co/models)\n",
+    "* Fine-tune the model on a sequence classification task: the sentiment analysis of the IMDB dataset\n",
+    "* Evaluate the result\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Installations\n",
+    "\n",
+    "**IMPORTANT :** We will need to use the library `transformers` created by HuggingFace.\n",
+    "\n",
+    "The next line only applies on Jean Zay; it allows us to load a very specific environment, which contains Tensorflow with GPU support. Ignore that line if this notebook is not executed on Jean Zay."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "50QKMUQzPv3n",
+    "outputId": "3ac2016d-596d-4f9a-c2ec-738c939c49a0"
+   },
+   "outputs": [],
+   "source": [
+    "#!pip install transformers\n",
+    "!module load tensorflow-gpu/py3/2.6.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports and initialisation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ZrV8ONYZPi8L",
+    "outputId": "ad10d385-3e1f-4ecf-80f2-87dccc286db7"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import tensorflow.keras as keras\n",
+    "import tensorflow.keras.datasets.imdb as imdb\n",
+    "from tensorflow.keras.layers import Dense, Dropout\n",
+    "from tensorflow.keras.optimizers import Adam\n",
+    "from tensorflow.keras.losses import SparseCategoricalCrossentropy\n",
+    "from tensorflow.keras.metrics import SparseCategoricalAccuracy\n",
+    "from tensorflow.keras import mixed_precision\n",
+    "\n",
+    "from transformers import (\n",
+    "    DistilBertTokenizer,\n",
+    "    TFDistilBertModel,\n",
+    "    DataCollatorWithPadding,\n",
+    "    BertTokenizer,\n",
+    "    TFBertModel\n",
+    ")\n",
+    "\n",
+    "import pickle\n",
+    "import multiprocessing\n",
+    "import itertools\n",
+    "import os\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "print(\"Tensorflow \", tf.__version__)\n",
+    "n_gpus = len(tf.config.list_physical_devices('GPU'))\n",
+    "print(\"#GPUs: \", n_gpus)\n",
+    "if n_gpus > 0:\n",
+    "    !nvidia-smi -L\n",
+    "policy = mixed_precision.Policy('mixed_float16')\n",
+    "mixed_precision.set_global_policy(policy)\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"\n",
+    "\n",
+    "np.random.seed(987654321)\n",
+    "tf.random.set_seed(987654321)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Parameters\n",
+    "\n",
+    "* `vocab_size` refers to the number of words which will be remembered in our vocabulary.\n",
+    "* `hide_most_frequently` is the number of ignored words, among the most common ones.\n",
+    "* `review_len` is the maximum review length, in tokens.\n",
+    "* `n_cpus` is the number of CPUs that will be used for data preprocessing.\n",
+    "* `distil` refers to whether we are going to use a DistilBert model or a regular Bert model.\n",
+    "* `load_locally` will fetch data locally if True; otherwise it will be downloaded from the Internet (requires an Internet connection, which is not possible on Jean Zay)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "FhIuZkS2PnTE"
+   },
+   "outputs": [],
+   "source": [
+    "vocab_size = 30000\n",
+    "hide_most_frequently = 0\n",
+    "\n",
+    "review_len = 512\n",
+    "\n",
+    "epochs = 1\n",
+    "batch_size = 32\n",
+    "\n",
+    "fit_verbosity = 1\n",
+    "scale = 1\n",
+    "\n",
+    "n_cpus = 6\n",
+    "distil = True\n",
+    "load_locally = True # if set to False, will fetch data from the internet (requires an internet connection)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Retrieve the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "qaRtDy9wQinS",
+    "outputId": "9d2d9e12-74fb-4eee-9d9b-d4a7148e2dc2"
+   },
+   "outputs": [],
+   "source": [
+    "if load_locally:\n",
+    "    with open(\"dataset\", \"rb\") as file_:\n",
+    "        (x_train, y_train), (x_test, y_test) = pickle.load(file_)\n",
+    "else:\n",
+    "    (x_train, y_train), (x_test, y_test) = imdb.load_data(\n",
+    "        num_words=vocab_size,\n",
+    "        skip_top=hide_most_frequently,\n",
+    "        seed=123456789,\n",
+    "    )\n",
+    "    with open(\"dataset\", \"wb\") as file_:\n",
+    "        pickle.dump(((x_train, y_train), (x_test, y_test)), file_)\n",
+    "\n",
+    "\n",
+    "y_train = np.asarray(y_train).astype('float32')\n",
+    "y_test = np.asarray(y_test ).astype('float32')\n",
+    "\n",
+    "n1 = int(scale * len(x_train))\n",
+    "n2 = int(scale * len(x_test))\n",
+    "x_train, y_train = x_train[:n1], y_train[:n1]\n",
+    "x_test, y_test = x_test[:n2], y_test[:n2]\n",
+    "\n",
+    "print(\"x_train : {} y_train : {}\".format(x_train.shape, y_train.shape))\n",
+    "print(\"x_test : {} y_test : {}\".format(x_test.shape, y_test.shape))\n",
+    "print('\\nReview sample (x_train[12]) :\\n\\n',x_train[12])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "nbF1uktpRdXy"
+   },
+   "outputs": [],
+   "source": [
+    "if load_locally:\n",
+    "    with open(\"word_index\", \"rb\") as file_:\n",
+    "        word_index = pickle.load(file_)\n",
+    "else:\n",
+    "    word_index = imdb.get_word_index()\n",
+    "    with open(\"word_index\", \"wb\") as file_:\n",
+    "        pickle.dump(word_index, file_)\n",
+    "\n",
+    "word_index = {w:(i+3) for w,i in word_index.items()}\n",
+    "word_index.update({'[PAD]':0, '[CLS]':1, '[UNK]':2})\n",
+    "index_word = {index:word for word,index in word_index.items()} \n",
+    "\n",
+    "# A handy function to decode a review back into text:\n",
+    "def dataset2text(review):\n",
+    "    return ' '.join([index_word.get(i, \"?\") for i in review[1:]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "DifmZNsKR38n",
+    "outputId": "cb5f9819-1930-478d-f3f2-45f06e04c5d4"
+   },
+   "outputs": [],
+   "source": [
+    "print(dataset2text(x_train[12]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fetch the model from HuggingFace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_model(distil, load_locally):\n",
+    "    if load_locally:\n",
+    "        if distil:\n",
+    "            bert_model = TFDistilBertModel.from_pretrained(\"distilbert_model\")\n",
+    "            tokenizer = DistilBertTokenizer(\"distilbert_vocab.txt\", do_lower_case=True)\n",
+    "        else:\n",
+    "            bert_model = TFBertModel.from_pretrained(\"bert_model\")\n",
+    "            tokenizer = BertTokenizer(\"bert_vocab.txt\", do_lower_case=True)\n",
+    "        return bert_model, tokenizer\n",
+    "\n",
+    "    if distil:\n",
+    "        bert_model = TFDistilBertModel.from_pretrained(\"distilbert-base-uncased\")\n",
+    "        tokenizer = 
DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", + " bert_model.save_pretrained(\"distilbert_model\")\n", + " tokenizer.save_vocabulary(\"distilbert_vocab.txt\")\n", + " else:\n", + " bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", + " tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + " bert_model.save_pretrained(\"bert_model\")\n", + " tokenizer.save_vocabulary(\"bert_vocab.txt\")\n", + " return bert_model, tokenizer\n", + "\n", + "bert_model, tokenizer = load_model(distil, load_locally)\n", + "bert_model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the dataset " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KKwI-RIXnWWd", + "outputId": "42dc5943-b060-4a08-9180-cd5914980527" + }, + "outputs": [], + "source": [ + "def tokenize_sample(sample, tokenizer):\n", + " return tokenizer(dataset2text(sample), truncation=True, max_length=review_len)\n", + "\n", + "def distributed_tokenize_dataset(dataset):\n", + " ds = list(dataset)\n", + " with multiprocessing.Pool(n_cpus) as pool:\n", + " tokenized_ds = pool.starmap(\n", + " tokenize_sample,\n", + " zip(ds, itertools.repeat(tokenizer, len(ds)))\n", + " )\n", + " return tokenized_ds\n", + "\n", + "tokenized_x_train = distributed_tokenize_dataset(x_train)\n", + "tokenized_x_test = distributed_tokenize_dataset(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TivZYh8vnZlS" + }, + "outputs": [], + "source": [ + "data_collator = DataCollatorWithPadding(tokenizer, return_tensors=\"tf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7Up0APYtwFm7", + "outputId": "37cb98bd-a0d3-47c2-9f91-96f94abf4b2e" + }, + "outputs": [], + "source": [ + "data_collator(tokenized_x_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "U6Lhjfh6maIF", + "outputId": "f4798e7f-bc69-47fe-e2a7-c0155a99cca7" + }, + "outputs": [], + "source": [ + "def make_dataset(x, y):\n", + " collated = data_collator(x)\n", + " dataset = tf.data.Dataset.from_tensor_slices(\n", + " (collated['input_ids'], collated['attention_mask'], y)\n", + " )\n", + " transformed_dataset = (\n", + " dataset\n", + " .map(\n", + " lambda x, y, z: ((x, y), z)\n", + " )\n", + " .shuffle(25000)\n", + " .batch(batch_size)\n", + " )\n", + " return transformed_dataset\n", + "\n", + "train_ds = make_dataset(tokenized_x_train, y_train)\n", + "test_ds = make_dataset(tokenized_x_test, y_test)\n", + "\n", + "for x, y in train_ds:\n", + " print(x)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add a new head to the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ClassificationModel(keras.Model):\n", + "\n", + " def __init__(self, bert_model):\n", + " super(ClassificationModel, self).__init__()\n", + " self.bert_model = bert_model\n", + " self.pre_classifier = Dense(768, activation='relu')\n", + " self.dropout = Dropout(0.1)\n", + " self.classifier = Dense(2)\n", + "\n", + " def call(self, x):\n", + " x = self.bert_model(x)\n", + " x = x.last_hidden_state\n", + " x = x[:, 0] # get the output of the classification token\n", + " x = 
self.pre_classifier(x)\n",
+    "        x = self.dropout(x)\n",
+    "        x = self.classifier(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = ClassificationModel(bert_model)\n",
+    "x = next(iter(train_ds))[0]\n",
+    "model(x)\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train! "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "4jDAzxxwXLT1",
+    "outputId": "bc4d5f62-9fa7-426d-a9e2-2fa4d2bdf780"
+   },
+   "outputs": [],
+   "source": [
+    "model.compile(\n",
+    "    optimizer=Adam(1e-05),\n",
+    "    loss=SparseCategoricalCrossentropy(from_logits=True),\n",
+    "    metrics=[SparseCategoricalAccuracy('accuracy')]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 419
+    },
+    "id": "KtmfFjL02Ano",
+    "outputId": "ca174c57-b8f9-4d50-a53a-03761556e492"
+   },
+   "outputs": [],
+   "source": [
+    "history = model.fit(\n",
+    "    train_ds,\n",
+    "    epochs=epochs,\n",
+    "    verbose=fit_verbosity\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, score = model.evaluate(test_ds)\n",
+    "colors = sns.color_palette('pastel')[2:]\n",
+    "accuracy_score = [score, 1 - score]\n",
+    "plt.pie(\n",
+    "    accuracy_score,\n",
+    "    labels=[\"Accurate\", \"Mistaken\"],\n",
+    "    colors=colors,\n",
+    "    autopct=lambda val: f\"{val:.2f}%\",\n",
+    "    explode=(0.0, 0.1)\n",
+    ")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "Untitled0.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/Transformers/02-distilbert_colab.ipynb b/Transformers/02-distilbert_colab.ipynb
new file mode 100755
index 0000000..5f22226
--- /dev/null
+++ b/Transformers/02-distilbert_colab.ipynb
@@ -0,0 +1,487 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img width=\"800px\" src=\"../fidle/img/00-Fidle-header-01.svg\"></img>\n",
+    "\n",
+    "# <!-- TITLE --> [TRANS2] - IMDB, Sentiment analysis with Transformers \n",
+    "<!-- DESC --> Using a Transformer to perform a sentiment analysis (IMDB) - Colab version\n",
+    "<!-- AUTHOR : Hatim Bourfoune (IDRIS) and Nathan Cassereau (IDRIS) -->\n",
+    "\n",
+    "By : Hatim Bourfoune (IDRIS) and Nathan Cassereau (IDRIS)\n",
+    "\n",
+    "\n",
+    "## Objectives :\n",
+    " - Fine-tune a pretrained Transformer to perform a sentiment analysis\n",
+    " - Understand the use of a pre-trained transformer\n",
+    "\n",
+    "This task is exactly the same as the sentiment analysis with text embedding. Only this time, \n",
+    "we are going to exploit the strength of transformers. 
Considering how computation-heavy transformer \n",
+    "pretraining is, we are going to use a pretrained BERT model from HuggingFace. \n",
+    "This notebook performs the fine-tuning process. If possible, try to use a GPU to speed up \n",
+    "the training, as transformers are difficult to train on CPU.\n",
+    "\n",
+    "## What we are going to do:\n",
+    "\n",
+    "* Retrieve the dataset\n",
+    "* Prepare the dataset\n",
+    "* Fetch a pretrained BERT model from HuggingFace's platform (https://huggingface.co/models)\n",
+    "* Fine-tune the model on a sequence classification task: the sentiment analysis of the IMDB dataset\n",
+    "* Evaluate the result\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oFKuY9XfOE8i"
+   },
+   "source": [
+    "## Installations\n",
+    "\n",
+    "**IMPORTANT :** We will need to use the library `transformers` created by HuggingFace."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "50QKMUQzPv3n"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install transformers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RbvDGdh5OE8r"
+   },
+   "source": [
+    "## Imports and initialisation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZrV8ONYZPi8L"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import tensorflow.keras as keras\n",
+    "import tensorflow.keras.datasets.imdb as imdb\n",
+    "from tensorflow.keras.layers import Dense, Dropout\n",
+    "from tensorflow.keras.optimizers import Adam\n",
+    "from tensorflow.keras.losses import SparseCategoricalCrossentropy\n",
+    "from tensorflow.keras.metrics import SparseCategoricalAccuracy\n",
+    "\n",
+    "from transformers import (\n",
+    "    DistilBertTokenizer,\n",
+    "    TFDistilBertModel,\n",
+    "    DataCollatorWithPadding,\n",
+    "    BertTokenizer,\n",
+    "    TFBertModel\n",
+    ")\n",
+    "\n",
+    "from tqdm.notebook import tqdm\n",
+    "import itertools\n",
+    "import multiprocessing\n",
+    "import os\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "print(\"Tensorflow \", tf.__version__)\n",
+    "n_gpus = len(tf.config.list_physical_devices('GPU'))\n",
+    "print(\"#GPUs: \", n_gpus)\n",
+    "if n_gpus > 0:\n",
+    "    !nvidia-smi -L\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"\n",
+    "\n",
+    "np.random.seed(987654321)\n",
+    "tf.random.set_seed(987654321)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "PKcpPVxzOE8y"
+   },
+   "source": [
+    "## Parameters\n",
+    "\n",
+    "* `vocab_size` refers to the number of words which will be remembered in our vocabulary.\n",
+    "* `hide_most_frequently` is the number of ignored words, among the most common ones.\n",
+    "* `review_len` is the maximum review length, in tokens.\n",
+    "* `n_cpus` is the number of CPUs that will be used for data preprocessing.\n",
+    "* `distil` refers to whether we are going to use a DistilBert model or a regular Bert model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "FhIuZkS2PnTE"
+   },
+   "outputs": [],
+   "source": [
+    "vocab_size = 30000\n",
+    "hide_most_frequently = 0\n",
+    "\n",
+    "review_len = 512\n",
+    "\n",
+    "epochs = 1\n",
+    "batch_size = 32\n",
+    "\n",
+    "fit_verbosity = 1\n",
+    "scale = 1\n",
+    "\n",
+    "n_cpus = 1\n",
+    "distil = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "fjGT9ymIOE83"
+   },
+   "source": [
+    "## Retrieve the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qaRtDy9wQinS"
+   },
+   "outputs": [],
+   "source": [
+    "(x_train, y_train), (x_test, y_test) = imdb.load_data(\n",
+    "    num_words=vocab_size,\n",
+    "    skip_top=hide_most_frequently,\n",
+    "    seed=123456789,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "y_train = np.asarray(y_train).astype('float32')\n",
+    "y_test = np.asarray(y_test ).astype('float32')\n",
+    "\n",
+    "n1 = int(scale * len(x_train))\n",
+    "n2 = int(scale * len(x_test))\n",
+    "x_train, y_train = x_train[:n1], y_train[:n1]\n",
+    "x_test, y_test = x_test[:n2], y_test[:n2]\n",
+    "\n",
+    "print(\"x_train : {} y_train : {}\".format(x_train.shape, y_train.shape))\n",
+    "print(\"x_test : {} y_test : {}\".format(x_test.shape, y_test.shape))\n",
+    "print('\\nReview sample (x_train[12]) :\\n\\n',x_train[12])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "nbF1uktpRdXy"
+   },
+   "outputs": [],
+   "source": [
+    "word_index = imdb.get_word_index()\n",
+    "\n",
+    "word_index = {w:(i+3) for w,i in word_index.items()}\n",
+    "word_index.update({'[PAD]':0, '[CLS]':1, '[UNK]':2})\n",
+    "index_word = {index:word for word,index in word_index.items()} \n",
+    "\n",
+    "# A handy function to decode a review back into text:\n",
+    "def dataset2text(review):\n",
+    "    return ' '.join([index_word.get(i, \"?\") for i in review[1:]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DifmZNsKR38n"
+   },
+   "outputs": [],
+   "source": [
+    "print(dataset2text(x_train[12]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "i-TNspUcOE8_"
+   },
+   "source": [
+    "## Fetch the model from HuggingFace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "q04e6x2yOE9B"
+   },
+   "outputs": [],
+   "source": [
+    "def load_model(distil):\n",
+    "    if distil:\n",
+    "        bert_model = TFDistilBertModel.from_pretrained(\"distilbert-base-uncased\")\n",
+    "        tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
+    "    else:\n",
+    "        bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n",
+    "        tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
+    "    return bert_model, tokenizer\n",
+    "\n",
+    "bert_model, tokenizer = load_model(distil)\n",
+    "bert_model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHnz-irHOE9E"
+   },
+   "source": [
+    "## Prepare the dataset "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "KKwI-RIXnWWd"
+   },
+   "outputs": [],
+   "source": [
+    "def tokenize_sample(sample):\n",
+    "    return tokenizer(dataset2text(sample), truncation=True, max_length=review_len)\n",
+    "\n",
+    "def distributed_tokenize_dataset(dataset):\n",
+    "    ds = list(dataset)\n",
+    "    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:\n",
+    "        tokenized_ds = list(tqdm(\n",
+    "            pool.imap(tokenize_sample, ds),\n",
+    "            total=len(ds)\n",
+    "        ))\n",
+    "    return tokenized_ds\n",
+    "\n",
+    "tokenized_x_train = 
distributed_tokenize_dataset(x_train)\n", + "tokenized_x_test = distributed_tokenize_dataset(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TivZYh8vnZlS" + }, + "outputs": [], + "source": [ + "data_collator = DataCollatorWithPadding(tokenizer, return_tensors=\"tf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7Up0APYtwFm7" + }, + "outputs": [], + "source": [ + "data_collator(tokenized_x_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U6Lhjfh6maIF" + }, + "outputs": [], + "source": [ + "def make_dataset(x, y):\n", + " collated = data_collator(x)\n", + " dataset = tf.data.Dataset.from_tensor_slices(\n", + " (collated['input_ids'], collated['attention_mask'], y)\n", + " )\n", + " transformed_dataset = (\n", + " dataset\n", + " .map(\n", + " lambda x, y, z: ((x, y), z)\n", + " )\n", + " .shuffle(25000)\n", + " .batch(batch_size)\n", + " )\n", + " return transformed_dataset\n", + "\n", + "train_ds = make_dataset(tokenized_x_train, y_train)\n", + "test_ds = make_dataset(tokenized_x_test, y_test)\n", + "\n", + "for x, y in train_ds:\n", + " print(x)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BQMn-htqOE9O" + }, + "source": [ + "## Add a new head to the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KUaZbGYwOE9O" + }, + "outputs": [], + "source": [ + "class ClassificationModel(keras.Model):\n", + "\n", + " def __init__(self, bert_model):\n", + " super(ClassificationModel, self).__init__()\n", + " self.bert_model = bert_model\n", + " self.pre_classifier = Dense(768, activation='relu')\n", + " self.dropout = Dropout(0.1)\n", + " self.classifier = Dense(2)\n", + "\n", + " def call(self, x):\n", + " x = self.bert_model(x)\n", + " x = x.last_hidden_state\n", + " x = x[:, 0] # get the output of the classification token\n", + " x = self.pre_classifier(x)\n", + " x = self.dropout(x)\n", + " x = self.classifier(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2dTGFu9MOE9Q" + }, + "outputs": [], + "source": [ + "model = ClassificationModel(bert_model)\n", + "x = next(iter(train_ds))[0]\n", + "model(x)\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r3IUL7wvOE9S" + }, + "source": [ + "## Train! 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4jDAzxxwXLT1" + }, + "outputs": [], + "source": [ + "model.compile(\n", + " optimizer=Adam(1e-05),\n", + " loss=SparseCategoricalCrossentropy(from_logits=True),\n", + " metrics=[SparseCategoricalAccuracy('accuracy')]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KtmfFjL02Ano" + }, + "outputs": [], + "source": [ + "history = model.fit(\n", + " train_ds,\n", + " epochs=epochs,\n", + " verbose=fit_verbosity\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pb1ruObAOE9V" + }, + "source": [ + "## Evaluation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bvI18GGUOE9W" + }, + "outputs": [], + "source": [ + "_, score = model.evaluate(test_ds)\n", + "colors = sns.color_palette('pastel')[2:]\n", + "accuracy_score = [score, 1 - score]\n", + "plt.pie(\n", + " accuracy_score,\n", + " labels=[\"Accurate\", \"Mistaken\"],\n", + " colors=colors,\n", + " autopct=lambda val: f\"{val:.2f}%\",\n", + " explode=(0.0, 0.1)\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dh0LDNq8OE9X" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "distilbert_colab.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/fidle/01-update-index.ipynb b/fidle/01-update-index.ipynb index 5011d2a..cd919a9 100644 --- a/fidle/01-update-index.ipynb +++ b/fidle/01-update-index.ipynb @@ -63,6 +63,7 @@ " 'GTSRB':'Images classification with Convolutional Neural Networks (CNN)',\n", " 'IMDB':'Sentiment analysis with word embedding',\n", " 'SYNOP':'Time series with Recurrent Neural Network (RNN)',\n", + " 'Transformers': 'Sentiment analysis with transformers',\n", " 'AE':'Unsupervised learning with an autoencoder neural network (AE)',\n", " 'VAE':'Generative network with Variational Autoencoder (VAE)',\n", " 'DCGAN':'Generative Adversarial Networks (GANs)',\n", @@ -278,7 +279,8 @@ "hash": "7822d55dc7294a4f6f06b86d8ad2ca65bd6e1ee5d72628c47c30a06bbf89aef6" }, "kernelspec": { - "display_name": "Python 3.9.7 64-bit ('fidle': conda)", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { diff --git a/fidle/ci/default.yml b/fidle/ci/default.yml index 2f6366c..be75d83 100644 --- a/fidle/ci/default.yml +++ b/fidle/ci/default.yml @@ -253,6 +253,16 @@ Nb_SYNOP3: scale: default train_prop: default sequence_len: default +Nb_TRANS1: + notebook_id: TRANS1 + notebook_dir: Transformers + notebook_src: 01-Distilbert.ipynb + notebook_tag: default +Nb_TRANS2: + notebook_id: TRANS2 + notebook_dir: Transformers + notebook_src: 02-distilbert_colab.ipynb + notebook_tag: default Nb_AE1: notebook_id: AE1 notebook_dir: AE @@ -414,8 +424,8 @@ Nb_VAE10: notebook_dir: VAE notebook_src: batch_slurm.sh notebook_tag: default -Nb_DCGAN01: - notebook_id: DCGAN01 +Nb_SHEEP1: + notebook_id: SHEEP1 notebook_dir: DCGAN notebook_src: 01-DCGAN-Draw-me-a-sheep.ipynb notebook_tag: 
default
diff --git a/fidle/config.py b/fidle/config.py
index 83cf1f3..76e4e4d 100644
--- a/fidle/config.py
+++ b/fidle/config.py
@@ -14,7 +14,7 @@
 
 # ---- Version -----------------------------------------------------
 #
-VERSION = '2.0.32'
+VERSION = '2.0.33'
 
 # ---- Default notebook name ---------------------------------------
 #
diff --git a/fidle/logs/catalog.json b/fidle/logs/catalog.json
index 3297641..73af623 100644
--- a/fidle/logs/catalog.json
+++ b/fidle/logs/catalog.json
@@ -326,6 +326,22 @@
             "sequence_len"
         ]
     },
+    "TRANS1": {
+        "id": "TRANS1",
+        "dirname": "Transformers",
+        "basename": "01-Distilbert.ipynb",
+        "title": "IMDB, Sentiment analysis with Transformers ",
+        "description": "Using a Transformer to perform a sentiment analysis (IMDB) - Jean Zay version",
+        "overrides": []
+    },
+    "TRANS2": {
+        "id": "TRANS2",
+        "dirname": "Transformers",
+        "basename": "02-distilbert_colab.ipynb",
+        "title": "IMDB, Sentiment analysis with Transformers ",
+        "description": "Using a Transformer to perform a sentiment analysis (IMDB) - Colab version",
+        "overrides": []
+    },
     "AE1": {
         "id": "AE1",
         "dirname": "AE",
@@ -532,8 +548,8 @@
         "description": "Bash script for SLURM batch submission of VAE8 notebooks ",
         "overrides": []
     },
-    "DCGAN01": {
-        "id": "DCGAN01",
+    "SHEEP1": {
+        "id": "SHEEP1",
         "dirname": "DCGAN",
         "basename": "01-DCGAN-Draw-me-a-sheep.ipynb",
         "title": "A first DCGAN to Draw a Sheep",
-- 
GitLab
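
P.S. Once TRANS1/TRANS2 have run, the fine-tuned model can be queried on a single review. A minimal inference sketch (assuming `model` and `tokenizer` from the notebooks above are in scope; the helper name `predict_sentiment` is ours, and label 1 is taken to mean "positive", as in keras.datasets.imdb):

    import tensorflow as tf

    def predict_sentiment(text):
        # Tokenize one review, truncated exactly like the training data
        enc = tokenizer(text, truncation=True, max_length=512, return_tensors="tf")
        # The model was trained on (input_ids, attention_mask) tuples
        logits = model((enc["input_ids"], enc["attention_mask"]))
        # argmax over the two logits: class 1 = positive, class 0 = negative
        return "positive" if int(tf.argmax(logits, axis=-1)[0]) == 1 else "negative"

    print(predict_sentiment("A wonderful, heartfelt movie. I loved it!"))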