Newer
Older
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img width=\"800px\" src=\"../fidle/img/header.svg\"></img>\n",
"# <!-- TITLE --> [POLR1] - Complexity Syndrome\n",
"<!-- DESC --> Illustration of the problem of complexity with the polynomial regression\n",
"<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->\n",
" - Visualizing and understanding under and overfitting\n",
"## What we're going to do :\n",
"\n",
"We are looking for a polynomial function to approximate the observed series : \n",
"$ y = a_n\\cdot x^n + \\dots + a_i\\cdot x^i + \\dots + a_1\\cdot x + b $ \n",
"\n",
"\n",
"## Step 1 - Import and init"
]
},
{
"cell_type": "code",
"source": [
"import math\n",
"import random\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# fidle is used below (fidle.init) and in later cells (fidle.utils.display_md),\n",
"# so it has to be imported here for the notebook to run on a fresh kernel.\n",
"import fidle\n",
"\n",
"# Init Fidle environment\n",
"run_id, run_dir, datasets_dir = fidle.init('POLR1')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2 - Dataset generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ---- Parameters\n",
"\n",
"# Number of observations\n",
"n = 100\n",
"\n",
"# Interval the x observations are drawn from\n",
"xob_min = -5\n",
"xob_max = 5\n",
"\n",
"# Degree of the generating polynomial and range of its coefficients\n",
"deg = 7\n",
"a_min = -2\n",
"a_max = 2\n",
"\n",
"# Scale factor applied to the standard normal noise added to Y\n",
"noise = 2000\n",
"\n",
"# ---- Train data\n",
"# X,Y : data\n",
"# X_norm,Y_norm : normalized data\n",
"\n",
"X = np.random.uniform(xob_min,xob_max,(n,1))\n",
"# Gaussian noise; uniform alternative : np.random.uniform(-noise,noise,(n,1))\n",
"N = noise * np.random.normal(0,1,(n,1))\n",
"\n",
"# A polynomial of degree `deg` has deg+1 coefficients; np.poly1d reads them\n",
"# from the highest power down to the constant term.\n",
"a = np.random.uniform(a_min,a_max, (deg+1,))\n",
"fy = np.poly1d( a )\n",
"\n",
"Y = fy(X) + N\n",
"\n",
"# ---- Data normalization\n",
"# Center each series on its mean and scale it by its standard deviation\n",
"X_norm = (X - X.mean(axis=0)) / X.std(axis=0)\n",
"Y_norm = (Y - Y.mean(axis=0)) / Y.std(axis=0)\n",
"\n",
"# ---- Data visualization\n",
"\n",
"# Figure size (passed to matplotlib as figsize) and number of points drawn\n",
"width = 12\n",
"height = 6\n",
"nb_viz = min(2000,n)\n",
"\n",
"def vector_infos(name,V):\n",
"    '''\n",
"    Print a one-line summary (mean, std, min, max) of an array.\n",
"    args:\n",
"        name : label printed at the start of the line\n",
"        V    : array to summarize; its mean and std along axis 0 must reduce\n",
"               to a single value, since .item() is called on them\n",
"    '''\n",
"    stats = (V.mean(axis=0).item(), V.std(axis=0).item(), V.min(), V.max())\n",
"    template = \"{:8} : mean={:+12.4f} std={:+12.4f} min={:+12.4f} max={:+12.4f}\"\n",
"    print(template.format(name, *stats))\n",
"\n",
"\n",
"# Summary of the generation parameters\n",
"print(f\"Number of points={n} deg={deg} bruit={noise}\")\n",
"print(f\"{nb_viz} points visibles sur {n}\")\n",
"\n",
"# Scatter plot of the first nb_viz raw observations, ticks and tick labels hidden\n",
"plt.figure(figsize=(width, height))\n",
"plt.plot(X[:nb_viz], Y[:nb_viz], '.')\n",
"plt.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)\n",
"plt.xlabel('x axis')\n",
"plt.ylabel('y axis')\n",
"\n",
"# Statistics of the series before and after normalization\n",
"fidle.utils.display_md('#### Before normalization :')\n",
"vector_infos('X',X)\n",
"vector_infos('Y',Y)\n",
"\n",
"fidle.utils.display_md('#### After normalization :')\n",
"vector_infos('X_norm',X_norm)\n",
"vector_infos('Y_norm',Y_norm)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 3 - Polynomial regression with NumPy\n",
"### 3.1 - Underfitting"
]
},
{
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": [
"def draw_reg(X_norm, Y_norm, x_hat,fy_hat, size, save_as):\n",
"    '''\n",
"    Plot the observations as dots with the fitted curve drawn over them.\n",
"    args:\n",
"        X_norm, Y_norm : coordinates of the observations to plot\n",
"        x_hat   : not used, the curve abscissas are recomputed from X_norm\n",
"        fy_hat  : callable evaluated on the curve abscissas\n",
"        size    : figure size, passed to matplotlib as figsize\n",
"        save_as : not used by this function\n",
"    '''\n",
"    fig, ax = plt.subplots(figsize=size)\n",
"    ax.plot(X_norm, Y_norm, '.')\n",
"\n",
"    # 100 evenly spaced abscissas spanning the plotted observations\n",
"    curve_x = np.linspace(X_norm.min(), X_norm.max(), 100)\n",
"    ax.plot(curve_x, fy_hat(curve_x))\n",
"\n",
"    ax.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)\n",
"    ax.set_xlabel('x axis')\n",
"    ax.set_ylabel('y axis')\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"source": [
"# Degree of the fitted polynomial\n",
"reg_deg = 1\n",
"\n",
"# Fit on the flattened normalized series, then wrap the coefficients in a callable\n",
"a_hat  = np.polyfit(X_norm.reshape(-1), Y_norm.reshape(-1), deg=reg_deg)\n",
"fy_hat = np.poly1d(a_hat)\n",
"\n",
"print(f'Nombre de degrés : {reg_deg}')\n",
"draw_reg(X_norm[:nb_viz], Y_norm[:nb_viz], x_hat=X_norm, fy_hat=fy_hat, size=(width, height), save_as='02-underfitting')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.2 - Good fitting"
]
},
{
"cell_type": "code",
"source": [
"# Degree of the fitted polynomial\n",
"reg_deg = 5\n",
"\n",
"# Fit on the flattened normalized series, then wrap the coefficients in a callable\n",
"a_hat  = np.polyfit(X_norm.reshape(-1), Y_norm.reshape(-1), deg=reg_deg)\n",
"fy_hat = np.poly1d(a_hat)\n",
"\n",
"print(f'Nombre de degrés : {reg_deg}')\n",
"draw_reg(X_norm[:nb_viz], Y_norm[:nb_viz], x_hat=X_norm, fy_hat=fy_hat, size=(width, height), save_as='03-good_fitting')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.3 - Overfitting"
]
},
{
"cell_type": "code",
"source": [
"# Degree of the fitted polynomial\n",
"reg_deg = 24\n",
"\n",
"# Fit on the flattened normalized series, then wrap the coefficients in a callable\n",
"a_hat  = np.polyfit(X_norm.reshape(-1), Y_norm.reshape(-1), deg=reg_deg)\n",
"fy_hat = np.poly1d(a_hat)\n",
"\n",
"print(f'Nombre de degrés : {reg_deg}')\n",
"draw_reg(X_norm[:nb_viz], Y_norm[:nb_viz], x_hat=X_norm, fy_hat=fy_hat, size=(width, height), save_as='04-over_fitting')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"<img width=\"80px\" src=\"../fidle/img/logo-paysage.svg\"></img>"
]
}
],
"metadata": {
"kernelspec": {
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"vscode": {
"interpreter": {
"hash": "b3929042cc22c1274d74e3e946c52b845b57cb6d84f2d591ffe0519b38e4896d"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}