{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img width=\"800px\" src=\"../fidle/img/header.svg\"></img>\n",
    "\n",
    "# <!-- TITLE --> [POLR1] - Complexity Syndrome\n",
    "<!-- DESC --> Illustration of the problem of complexity with the polynomial regression\n",
    "<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->\n",
    "\n",
    "## Objectives :\n",
    " - Visualizing and understanding under and overfitting\n",
    " \n",
    "## What we're going to do :\n",
    "\n",
    "We are looking for a polynomial function to approximate the observed series :  \n",
    "$ y = a_n\\cdot x^n + \\dots + a_i\\cdot x^i + \\dots + a_1\\cdot x + b $ \n",
    "\n",
    "\n",
    "## Step 1 - Import and init"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import math\n",
    "import random\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import sys\n",
    "import fidle\n",
    "\n",
    "# Init Fidle environment\n",
    "run_id, run_dir, datasets_dir = fidle.init('POLR1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Dataset generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---- Parameters\n",
    "\n",
    "n = 100\n",
    "\n",
    "xob_min = -5\n",
    "xob_max = 5\n",
    "\n",
    "# Degree of the generating polynomial.\n",
    "# np.poly1d builds a polynomial of degree (number of coefficients - 1),\n",
    "# so deg+1 coefficients are drawn below : deg=6 gives 7 coefficients.\n",
    "deg = 6\n",
    "a_min = -2\n",
    "a_max = 2\n",
    "\n",
    "noise = 2000\n",
    "\n",
    "# ---- Train data\n",
    "# X,Y : data\n",
    "# X_norm,Y_norm : normalized data\n",
    "\n",
    "X = np.random.uniform(xob_min,xob_max,(n,1))\n",
    "N = noise * np.random.normal(0,1,(n,1))\n",
    "\n",
    "# Coefficients a_deg ... a_1, b (highest power first, as expected by np.poly1d)\n",
    "a = np.random.uniform(a_min,a_max, (deg+1,))\n",
    "fy = np.poly1d( a )\n",
    "\n",
    "Y = fy(X) + N\n",
    "\n",
    "# ---- Data normalization\n",
    "#\n",
    "X_norm = (X - X.mean(axis=0)) / X.std(axis=0)\n",
    "Y_norm = (Y - Y.mean(axis=0)) / Y.std(axis=0)\n",
    "\n",
    "# ---- Data visualization\n",
    "\n",
    "width = 12\n",
    "height = 6\n",
    "nb_viz = min(2000,n)\n",
    "\n",
    "def vector_infos(name,V):\n",
    "    '''\n",
    "    Print the mean, standard deviation, min and max of a vector.\n",
    "    args:\n",
    "        name : label printed at the beginning of the line\n",
    "        V    : numpy array; mean and std are taken along axis 0 and must\n",
    "               reduce to a single value (.item()), i.e. one column\n",
    "    return:\n",
    "        nothing, the summary line is printed\n",
    "    '''\n",
    "    m=V.mean(axis=0).item()\n",
    "    s=V.std(axis=0).item()\n",
    "    print(\"{:8} : mean={:+12.4f} std={:+12.4f} min={:+12.4f} max={:+12.4f}\".format(name,m,s,V.min(),V.max()))\n",
    "\n",
    "\n",
    "fidle.utils.display_md('#### Generator :')\n",
    "print(f\"Number of points={n} deg={deg} bruit={noise}\")\n",
    "\n",
    "fidle.utils.display_md('#### Datasets :')\n",
    "print(f\"{nb_viz} points visibles sur {n}\")\n",
    "plt.figure(figsize=(width, height))\n",
    "plt.plot(X[:nb_viz], Y[:nb_viz], '.')\n",
    "plt.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)\n",
    "plt.xlabel('x axis')\n",
    "plt.ylabel('y axis')\n",
    "fidle.scrawler.save_fig(\"01-dataset\")\n",
    "plt.show()\n",
    "\n",
    "fidle.utils.display_md('#### Before normalization :')\n",
    "vector_infos('X',X)\n",
    "vector_infos('Y',Y)\n",
    "\n",
    "fidle.utils.display_md('#### After normalization :')\n",
    "vector_infos('X_norm',X_norm)\n",
    "vector_infos('Y_norm',Y_norm)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3 - Polynomial regression with NumPy\n",
    "### 3.1 - Underfitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def draw_reg(X_norm, Y_norm, x_hat,fy_hat, size, save_as):\n",
    "    '''\n",
    "    Draw the observations as points and the fitted curve over them,\n",
    "    then save and show the figure.\n",
    "    args:\n",
    "        X_norm, Y_norm : observations, drawn as a scatter plot\n",
    "        x_hat          : ignored, only kept so that existing calls still work;\n",
    "                         the curve is always evaluated on 100 evenly spaced\n",
    "                         abscissas between X_norm.min() and X_norm.max()\n",
    "        fy_hat         : callable evaluated on these abscissas (e.g. a np.poly1d)\n",
    "        size           : figure size, given to plt.figure(figsize=...)\n",
    "        save_as        : figure name given to fidle.scrawler.save_fig\n",
    "    return:\n",
    "        nothing\n",
    "    '''\n",
    "    plt.figure(figsize=size)\n",
    "    plt.plot(X_norm, Y_norm, '.')\n",
    "\n",
    "    # Evenly spaced (hence sorted) abscissas, so the curve is drawn as one clean line\n",
    "    x_curve = np.linspace(X_norm.min(), X_norm.max(), 100)\n",
    "\n",
    "    plt.plot(x_curve, fy_hat(x_curve))\n",
    "    plt.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)\n",
    "    plt.xlabel('x axis')\n",
    "    plt.ylabel('y axis')\n",
    "    fidle.scrawler.save_fig(save_as)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg_deg=1\n",
    "\n",
    "a_hat = np.polyfit(X_norm.reshape(-1,), Y_norm.reshape(-1,), reg_deg)\n",
    "fy_hat = np.poly1d( a_hat )\n",
    "\n",
    "print(f'Nombre de degrés : {reg_deg}')\n",
    "draw_reg(X_norm[:nb_viz],Y_norm[:nb_viz], X_norm,fy_hat, (width,height), save_as='02-underfitting')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 - Good fitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg_deg=5\n",
    "\n",
    "a_hat = np.polyfit(X_norm.reshape(-1,), Y_norm.reshape(-1,), reg_deg)\n",
    "fy_hat = np.poly1d( a_hat )\n",
    "\n",
    "print(f'Nombre de degrés : {reg_deg}')\n",
    "draw_reg(X_norm[:nb_viz],Y_norm[:nb_viz], X_norm,fy_hat, (width,height), save_as='03-good_fitting')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3 - Overfitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg_deg=24\n",
    "\n",
    "a_hat = np.polyfit(X_norm.reshape(-1,), Y_norm.reshape(-1,), reg_deg)\n",
    "fy_hat = np.poly1d( a_hat )\n",
    "\n",
    "print(f'Nombre de degrés : {reg_deg}')\n",
    "draw_reg(X_norm[:nb_viz],Y_norm[:nb_viz], X_norm,fy_hat, (width,height), save_as='04-over_fitting')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fidle.end()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "<img width=\"80px\" src=\"../fidle/img/logo-paysage.svg\"></img>"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.2 ('fidle-env')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "vscode": {
   "interpreter": {
    "hash": "b3929042cc22c1274d74e3e946c52b845b57cb6d84f2d591ffe0519b38e4896d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}