From f13cacf3fdf6dcfc7c41223295a9a36372491773 Mon Sep 17 00:00:00 2001 From: Achille Mbogol Touye <achille.mbogol-touye@univ-grenoble-alpes.fr> Date: Tue, 26 Sep 2023 17:51:56 +0200 Subject: [PATCH] 03-DNN-Wine-Regression-lightning --- .../03-DNN-Wine-Regression-lightning.ipynb | 574 ++++++++++++++++++ 1 file changed, 574 insertions(+) create mode 100644 WineQuality-DNN_Reg-lightning/03-DNN-Wine-Regression-lightning.ipynb diff --git a/WineQuality-DNN_Reg-lightning/03-DNN-Wine-Regression-lightning.ipynb b/WineQuality-DNN_Reg-lightning/03-DNN-Wine-Regression-lightning.ipynb new file mode 100644 index 0000000..5a45fc1 --- /dev/null +++ b/WineQuality-DNN_Reg-lightning/03-DNN-Wine-Regression-lightning.ipynb @@ -0,0 +1,574 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<img width=\"800px\" src=\"../fidle/img/00-Fidle-header-01.svg\"></img>\n", + "\n", + "# <!-- TITLE --> [WINE1] - Wine quality prediction with a Dense Network (DNN) using Lightning\n", + " <!-- DESC --> Another example of regression, with a wine quality prediction!\n", + " <!-- AUTHOR : Achille Mbogol Touye (EFFILIA-MIAI/SIMaP) -->\n", + "\n", + "## Objectives :\n", + " - Predict the **quality of wines**, based on their analysis\n", + " - Understanding the principle and the architecture of a regression with a dense neural network with backup and restore of the trained model. \n", + "\n", + "The **[Wine Quality datasets](https://archive.ics.uci.edu/ml/datasets/wine+Quality)** are made up of analyses of a large number of wines, with an associated quality (between 0 and 10) \n", + "This dataset is provide by : \n", + "Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez \n", + "A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal, @2009 \n", + "This dataset can be retreive at [University of California Irvine (UCI)](https://archive-beta.ics.uci.edu/ml/datasets/wine+quality)\n", + "\n", + "\n", + "Due to privacy and logistic issues, only physicochemical and sensory variables are available \n", + "There is no data about grape types, wine brand, wine selling price, etc.\n", + "\n", + "- fixed acidity\n", + "- volatile acidity\n", + "- citric acid\n", + "- residual sugar\n", + "- chlorides\n", + "- free sulfur dioxide\n", + "- total sulfur dioxide\n", + "- density\n", + "- pH\n", + "- sulphates\n", + "- alcohol\n", + "- quality (score between 0 and 10)\n", + "\n", + "## What we're going to do :\n", + "\n", + " - (Retrieve data)\n", + " - (Preparing the data)\n", + " - (Build a model)\n", + " - Train and save the model\n", + " - Restore saved model\n", + " - Evaluate the model\n", + " - Make some predictions\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1 - Import and init\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import some packages\n", + "import os\n", + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import lightning.pytorch as pl\n", + "import torch.nn.functional as F\n", + "import torchvision.transforms as T\n", + "\n", + "from IPython.display import Markdown\n", + "from importlib import reload\n", + "from torch.utils.data import Dataset, DataLoader, random_split\n", + "from data_load import WineQualityDataset, Normalize, ToTensor\n", + "from lightning.pytorch.loggers.tensorboard import TensorBoardLogger\n", + "from torchmetrics.functional.regression import mean_absolute_error, mean_squared_error\n", + "\n", + "import fidle\n", + "\n", + "# Init Fidle environment\n", + "run_id, run_dir, datasets_dir = fidle.init('WINE1-Lightning')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Verbosity during training : \n", + "- 0 = silent\n", + "- 1 = progress bar\n", + "- 2 = one line per epoch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fit_verbosity = 1\n", + "dataset_name = 'winequality-red.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Override parameters (batch mode) - Just forget this cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fidle.override('fit_verbosity', 'dataset_name')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2 - Retrieve data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "csv_file_path=f'{datasets_dir}/WineQuality/origine/{dataset_name}'\n", + "datasets=WineQualityDataset(csv_file_path)\n", + "\n", + "display(datasets.data.head(5).style.format(\"{0:.2f}\"))\n", + "print('Missing Data : ',datasets.data.isna().sum().sum(), ' Shape is : ', datasets.data.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3 - Preparing the data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 - Data normalization\n", + "**Note :** \n", + " - All input features must be normalized. \n", + " - To do this we will subtract the mean and divide by the standard deviation for each input features. \n", + " - Then we convert numpy array features and target **(quality)** to torch tensor " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transforms=T.Compose([Normalize(csv_file_path), ToTensor()])\n", + "\n", + "dataset=WineQualityDataset(csv_file_path,transform=transforms)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(\"before normalization :\"))\n", + "display(datasets[:][\"features\"])\n", + "\n", + "print()\n", + "\n", + "display(Markdown(\"After normalization :\"))\n", + "display(dataset[:][\"features\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 - Split data\n", + "We will use 80% of the data for training and 20% for validation. \n", + "x will be the features data of the analysis and y the target (quality)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ---- Split => train, test\n", + "#\n", + "data_train_len = int(len(dataset)*0.8) # get 80 %\n", + "data_test_len = len(dataset) -data_train_len # test = all - train\n", + "\n", + "# ---- Split => x,y with random_split\n", + "#\n", + "data_train_subset, data_test_subset=random_split(dataset, [data_train_len, data_test_len]) \n", + "\n", + " \n", + "x_train = data_train_subset[:][\"features\"]\n", + "y_train = data_train_subset[:][\"quality\" ]\n", + "\n", + "x_test = data_test_subset [:][\"features\"]\n", + "y_test = data_test_subset [:][\"quality\" ]\n", + "\n", + "\n", + "print('Original data shape was : ',dataset.data.shape)\n", + "print('x_train : ',x_train.shape, 'y_train : ',y_train.shape)\n", + "print('x_test : ',x_test.shape, 'y_test : ',y_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.3 - For Training model use Dataloader\n", + "The Dataset retrieves our dataset’s features and labels one sample at a time. While training a model, we typically want to pass samples in minibatches, reshuffle the data at every epoch to reduce model overfitting. DataLoader is an iterable that abstracts this complexity for us in an easy API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# train bacth data\n", + "train_loader= DataLoader(\n", + " dataset=data_train_subset, \n", + " shuffle=True, \n", + " batch_size=20,\n", + " num_workers=4 \n", + ")\n", + "\n", + "\n", + "# test bacth data\n", + "test_loader= DataLoader(\n", + " dataset=data_test_subset, \n", + " shuffle=False, \n", + " batch_size=20,\n", + " num_workers=4\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4 - Build a model\n", + "More informations about : \n", + " - [Optimizer](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers)\n", + " - [Activation](https://www.tensorflow.org/api_docs/python/tf/keras/activations)\n", + " - [Loss](https://www.tensorflow.org/api_docs/python/tf/keras/losses)\n", + " - [Metrics](https://www.tensorflow.org/api_docs/python/tf/keras/metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LitRegression(pl.LightningModule):\n", + " \n", + " def __init__(self,in_features=11):\n", + " super().__init__()\n", + " self.model = nn.Sequential(\n", + " nn.Linear(in_features, 128), # hidden layer 1\n", + " nn.ReLU(), # activation function \n", + " nn.Linear(128, 128), # hidden layer 2\n", + " nn.ReLU(), # activation function\n", + " nn.Linear(128, 1)) # output layer \n", + " \n", + " def forward(self, x): # forward pass\n", + " x = self.model(x)\n", + " return x \n", + " \n", + " \n", + " # defines the train loop.\n", + " def training_step(self, batch, batch_idx): \n", + " x_features, y_target = batch[\"features\"],batch[\"quality\"]\n", + " y_pred = self.model(x_features) # forward pass\n", + " loss = F.mse_loss(y_pred, y_target) # loss function MSE\n", + " mae=mean_absolute_error(y_pred,y_target) # metrics mae\n", + " mse=mean_squared_error(y_pred,y_target) # metrics mse\n", + " metrics = {\"train_loss\": loss, \n", + " \"train_mae\" : mae, \n", + " \"train_mse\" : mse\n", + " }\n", + " # logs metrics for each training_step\n", + " self.log_dict(metrics, \n", + " on_step=False, \n", + " on_epoch=True, \n", + " logger=True,\n", + " prog_bar =True, \n", + " )\n", + " return loss \n", + " \n", + " # defines the val loop.\n", + " def validation_step(self, batch, batch_idx): \n", + " x_features, y_target = batch[\"features\"],batch[\"quality\"]\n", + " y_pred = self.model(x_features) # forward pass\n", + " loss = F.mse_loss(y_pred, y_target) # loss function MSE\n", + " mae=mean_absolute_error(y_pred,y_target) # metrics\n", + " mse=mean_squared_error(y_pred,y_target) # metrics\n", + " metrics = {\"val_loss\": loss, \n", + " \"val_mae\" : mae, \n", + " \"val_mse\" : mse\n", + " }\n", + " # logs metrics for each validation_step \n", + " self.log_dict(metrics, \n", + " on_step=False, \n", + " on_epoch=True, \n", + " logger=True,\n", + " prog_bar =True, \n", + " )\n", + "\n", + " return metrics\n", + " \n", + " # optimizer\n", + " def configure_optimizers(self): \n", + " optimizer = torch.optim.RMSprop(self.parameters(),lr=1e-4)\n", + " return optimizer " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 - Train the model\n", + "### 5.1 - Get it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model=LitRegression(in_features=11)\n", + "print(model) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2 - Add callback" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs('./run/models', exist_ok=True)\n", + "save_dir = \"./run/models/\"\n", + "\n", + "savemodel_callback = pl.callbacks.ModelCheckpoint(dirpath=save_dir, filename='best_model',save_top_k=1, verbose=False, monitor=\"val_loss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.3 - Train it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# loggers data\n", + "logger = TensorBoardLogger(save_dir='Wine_logs',name=\"history_logs\")\n", + "\n", + "# train model\n", + "trainer=pl.Trainer(accelerator='auto',max_epochs=100,logger=logger,callbacks=[savemodel_callback])\n", + "trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=test_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6 - Evaluate\n", + "### 6.1 - Model evaluation\n", + "MAE = Mean Absolute Error (between the labels and predictions) \n", + "A mae equal to 3 represents an average error in prediction of $3k." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "score=trainer.validate(model=model, dataloaders=test_loader, verbose=False)\n", + "\n", + "print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))\n", + "print('x_test / mae : {:5.4f}'.format(score[0]['val_mae']))\n", + "print('x_test / mse : {:5.4f}'.format(score[0]['val_mse']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.2 - Training history" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# launch Tensorboard \n", + "%reload_ext tensorboard\n", + "%tensorboard --logdir=Wine_logs/history_logs/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7 - Restore a model :" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.1 - Reload model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_model = model.load_from_checkpoint('./run/models/best_model.ckpt')\n", + "print(\"Loaded:\")\n", + "print(loaded_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.2 - Evaluate it :" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "score=trainer.validate(model=loaded_model, dataloaders=test_loader, verbose=False)\n", + "\n", + "print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))\n", + "print('x_test / mae : {:5.4f}'.format(score[0]['val_mae']))\n", + "print('x_test / mse : {:5.4f}'.format(score[0]['val_mse']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7.3 - Make a prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ---- Pick n entries from our test set\n", + "n = 200\n", + "ii = np.random.randint(1,len(x_test),n)\n", + "x_sample = x_test[ii]\n", + "y_sample = y_test[ii]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ---- Make a predictions\n", + "loaded_model.eval() # Sets the model in evaluation mode.\n", + "with torch.no_grad():\n", + " y_pred = loaded_model( x_sample )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ---- Show it\n", + "print('Wine Prediction Real Delta')\n", + "for i in range(n):\n", + " pred = y_pred[i][0].item()\n", + " real = y_sample[i][0].item()\n", + " delta = real-pred\n", + " print(f'{i:03d} {pred:.2f} {real} {delta:+.2f} ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fidle.end()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "<img width=\"80px\" src=\"../fidle/img/00-Fidle-logo-01.svg\"></img>" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "b3929042cc22c1274d74e3e946c52b845b57cb6d84f2d591ffe0519b38e4896d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- GitLab