Commit 591d5e4c authored by Loic Huder
Browse files

Cosmetic changes to 12_pandas

parent 528d73a1
......@@ -21,7 +21,7 @@
}
},
"source": [
"# Pandas a Data Analysis Library\n",
"# Pandas, a Data Analysis Library\n",
"\n",
"pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.\n",
"\n",
......@@ -60,10 +60,8 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
......@@ -82,7 +80,7 @@
" \n",
"- multiple methods for convenient data filtering.\n",
"\n",
"- Toolkit utilities to perform Input/Output operations.\n",
"- toolkit utilities to perform input/output operations.\n",
"It can read data from a variety of formats such as CSV, TSV, MS Excel, etc.\n"
]
},
......@@ -102,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 2,
"metadata": {
"slideshow": {
"slide_type": "slide"
......@@ -125,12 +123,20 @@
"2,3\n",
"3,4\n",
"\n",
"0 apple\n",
"0 kiwi\n",
"1 orange\n",
"2 mango\n",
"3 pear\n",
"3 apple\n",
"dtype: object\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/huderl/.local/lib/python3.6/site-packages/ipykernel_launcher.py:12: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n",
" if sys.path[0] == '':\n"
]
}
],
"source": [
......@@ -163,7 +169,7 @@
"source": [
"# Dataframe\n",
"\n",
"A Dictionnary of series where keys are column name\n",
"A dictionnary of series where keys are column name\n",
"\n",
"<div><img style=\"float: left;margin-left : 70px\" src='fig/dataframe_type.png' height=\"800\" width=\"800\"/>\n"
]
......@@ -192,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
......@@ -212,7 +218,7 @@
"pandas.core.series.Series"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
......@@ -221,14 +227,14 @@
"## DataFrame structure\n",
"import pandas as pd \n",
" \n",
"# intialise data of lists. \n",
"data = {'Name':['John', 'Paul', 'Debby', 'Laura'], 'Sex':['Male','Male','Female','Female'],'Age':[20, 40, 19, 30]} \n",
"# Intialise data: dictionnary of lists. \n",
"data = {'Name':['John', 'Paul', 'Debby', 'Laura'], 'Sex':['Male','Male','Female','Female'], 'Age':[20, 40, 19, 30]} \n",
" \n",
"# Create DataFrame \n",
"df = pd.DataFrame(data)\n",
"print(df)\n",
"\n",
"type(df.Age)\n"
"type(df.Age)"
]
},
{
......@@ -239,12 +245,12 @@
}
},
"source": [
"## From a File"
"### From a file"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
......@@ -261,7 +267,7 @@
],
"source": [
"import pandas as pd\n",
"df_person = pd.read_csv('files/person.txt', sep = ',',encoding = \"utf-8\", header=0)\n",
"df_person = pd.read_csv('files/person.txt', sep = ',', encoding=\"utf-8\", header=0)\n",
"print(df_person)"
]
},
......@@ -273,11 +279,13 @@
}
},
"source": [
"By default, new index is created\n",
"By default, a new index is created\n",
"\n",
"If you want use a based field index, you have to specify it on the read_csv function:\n",
"If you want use a field-based index, you have to specify it in the `read_csv` function:\n",
"\n",
"df_person = pd.read_csv('files/person.txt', sep = ',',index_col='Name',encoding = \"utf-8\", header=0)\n"
"```python\n",
"df_person = pd.read_csv('files/person.txt', sep = ',', index_col='Name', encoding=\"utf-8\", header=0)\n",
"```\n"
]
},
{
......@@ -288,12 +296,12 @@
}
},
"source": [
"## Basics commands"
"## Basic commands"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -317,13 +325,13 @@
}
],
"source": [
"#display simple statistics\n",
"# display simple statistics\n",
"print(df_person.describe())"
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -339,13 +347,13 @@
}
],
"source": [
"#display the dataframe columns\n",
"# display the dataframe columns\n",
"print(df_person.columns)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -370,16 +378,16 @@
}
],
"source": [
"#query one column\n",
"# query one column\n",
"print(df_person[\"Age\"])\n",
"\n",
"# another method to query one column\n",
"print(df_person.Age)\n"
"print(df_person.Age)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -399,13 +407,13 @@
}
],
"source": [
"#query multiple column\n",
"# query multiple columns\n",
"print(df_person[['Name','Age']])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -422,12 +430,12 @@
],
"source": [
"# display unique value of a column\n",
"print(df_person.Sex.unique())\n"
"print(df_person.Sex.unique())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -464,7 +472,6 @@
}
],
"source": [
"\n",
"# display the 5 first rows\n",
"print(df_person.head())\n",
"\n",
......@@ -474,16 +481,16 @@
"# display 2 first rows\n",
"print(df_person[:2])\n",
"\n",
"# display by number position\n",
"# display by position number\n",
"print(df_person.iloc[2])\n",
"\n",
"\n",
"print(df_person.iloc[:])\n"
"print(df_person.iloc[:])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -507,7 +514,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -529,7 +536,7 @@
"source": [
"# Basic operations on columns \n",
"df_person.Age = df_person.Age + 2\n",
"print(df_person.Age)\n"
"print(df_person.Age)"
]
},
{
......@@ -540,12 +547,12 @@
}
},
"source": [
"# Add a row"
"### Add a row"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -622,13 +629,13 @@
"4 Glenn Male 10"
]
},
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_person = df_person.append({'Name':'Glenn','Sex': 'Male','Age':10},ignore_index=True)\n",
"df_person = df_person.append({'Name':'Glenn', 'Sex':'Male', 'Age':10}, ignore_index=True)\n",
"df_person"
]
},
......@@ -640,12 +647,12 @@
}
},
"source": [
"### Add some rows "
"### Add some rows"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"outputs": [
{
......@@ -746,15 +753,15 @@
"8 Ava Female 22"
]
},
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = {'Name':['Marguerite', 'Annie', 'Stephen', 'Ava'], 'Sex':['Female','Female','Male','Female'],'Age':[34, 23, 49, 22]} \n",
"data = {'Name':['Marguerite', 'Annie', 'Stephen', 'Ava'], 'Sex':['Female','Female','Male','Female'], 'Age':[34, 23, 49, 22]} \n",
"\n",
"df_person= df_person.append(pd.DataFrame(data),ignore_index=True)\n",
"df_person = df_person.append(pd.DataFrame(data), ignore_index=True)\n",
"df_person"
]
},
......@@ -766,12 +773,12 @@
}
},
"source": [
"## Add a column"
"### Add a column"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 15,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -886,7 +893,7 @@
"8 Ava Female 22 USA"
]
},
"execution_count": 13,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
......@@ -909,7 +916,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 16,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -922,7 +929,7 @@
"pandas.core.series.Series"
]
},
"execution_count": 7,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
......@@ -933,7 +940,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 17,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -953,13 +960,13 @@
],
"source": [
"## Mean\n",
"print (df_person.Age.mean())\n",
"print(df_person.Age.mean())\n",
"\n",
"## Min and Max\n",
"print (df_person.Age.min())\n",
"print(df_person.Age.min())\n",
"print(df_person.Age.max())\n",
"\n",
"print (df_person.Age.count())"
"print(df_person.Age.count())"
]
},
{
......@@ -975,7 +982,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1017,7 +1024,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 19,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1045,13 +1052,13 @@
}
],
"source": [
"# selection with one criteria\n",
"print(df_person[df_person['Sex']=='Female'])\n",
"print (\"--------------------\")\n",
"print(df_person[df_person['Age']<20])\n",
"print (\"--------------------\")\n",
"# selection with one criterion\n",
"print(df_person[df_person['Sex'] == 'Female'])\n",
"print(\"--------------------\")\n",
"print(df_person[df_person['Age'] < 20])\n",
"print(\"--------------------\")\n",
"# selection with 2 criteria\n",
"print(df_person[(df_person['Sex'] =='Male') & (df_person['Age'] > 30)])"
"print(df_person[(df_person['Sex'] == 'Male') & (df_person['Age'] > 30)])"
]
},
{
......@@ -1067,7 +1074,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "fragment"
......@@ -1102,18 +1109,18 @@
}
],
"source": [
"#change one value by index\n",
"df_person.loc[7,\"Name\"] = \"Stephane\"\n",
"# change one value by index\n",
"df_person.loc[7, \"Name\"] = \"Stephane\"\n",
"print(df_person)\n",
"\n",
"#change one value after a selection\n",
"# change one value after a selection\n",
"df_person.loc[df_person[\"Name\"] == 'Stephane', \"Name\"] = \"Eric\"\n",
"print(df_person)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1148,11 +1155,11 @@
}
],
"source": [
"##Add a Column\n",
"## Add a column\n",
"df_person[\"City\"] = \"City\"\n",
"print(df_person)\n",
"##Delete a row\n",
"df_person = df_person.drop(\"City\",axis=1)\n",
"## Delete a column\n",
"df_person = df_person.drop(\"City\", axis=1)\n",
"print(df_person)"
]
},
......@@ -1171,7 +1178,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 22,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1200,7 +1207,7 @@
}
],
"source": [
"data = {'Name':['Benedicte', 'Bernard', 'Nicolas', 'Anne'], 'Sex':['Female','Male','Male','Female'],'Age':[24, 34, 49, 42],'Nationality':['FR','FR','FR','FR']} \n",
"data = {'Name':['Benedicte', 'Bernard', 'Nicolas', 'Anne'], 'Sex':['Female','Male','Male','Female'], 'Age':[24, 34, 49, 42],'Nationality':['FR','FR','FR','FR']} \n",
"df_person_fr = pd.DataFrame(data)\n",
"list_person = [df_person,df_person_fr]\n",
"result = pd.concat(list_person)\n",
......@@ -1223,7 +1230,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 23,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1240,36 +1247,36 @@
"2 2 st georges street Charlotte\n",
"3 3 5th street San Francisco\n",
" Name Sex Age Nationality id_Address\n",
"0 John Male 22 USA 1\n",
"1 Paul Male 42 USA 3\n",
"0 John Male 22 USA 3\n",
"1 Paul Male 42 USA 2\n",
"2 Debby Female 21 USA 3\n",
"3 Laura Female 32 USA 3\n",
"4 Glenn Male 10 USA 2\n",
"5 Marguerite Female 34 USA 1\n",
"6 Annie Female 23 USA 3\n",
"7 Eric Male 49 USA 2\n",
"8 Ava Female 22 USA 2\n",
"3 Laura Female 32 USA 0\n",
"4 Glenn Male 10 USA 3\n",
"5 Marguerite Female 34 USA 0\n",
"6 Annie Female 23 USA 0\n",
"7 Eric Male 49 USA 3\n",
"8 Ava Female 22 USA 1\n",
" Name Sex Age Nationality id_Address Address \\\n",
"0 John Male 22 USA 1 aqua boulevard \n",
"1 Paul Male 42 USA 3 5th street \n",
"0 John Male 22 USA 3 5th street \n",
"1 Paul Male 42 USA 2 st georges street \n",
"2 Debby Female 21 USA 3 5th street \n",
"3 Laura Female 32 USA 3 5th street \n",
"4 Glenn Male 10 USA 2 st georges street \n",
"5 Marguerite Female 34 USA 1 aqua boulevard \n",
"6 Annie Female 23 USA 3 5th street \n",
"7 Eric Male 49 USA 2 st georges street \n",
"8 Ava Female 22 USA 2 st georges street \n",
"3 Laura Female 32 USA 0 gordon street \n",
"4 Glenn Male 10 USA 3 5th street \n",
"5 Marguerite Female 34 USA 0 gordon street \n",
"6 Annie Female 23 USA 0 gordon street \n",
"7 Eric Male 49 USA 3 5th street \n",
"8 Ava Female 22 USA 1 aqua boulevard \n",
"\n",
" City \n",
"0 Chicago \n",
"1 San Francisco \n",
"0 San Francisco \n",
"1 Charlotte \n",
"2 San Francisco \n",
"3 San Francisco \n",
"4 Charlotte \n",
"5 Chicago \n",
"6 San Francisco \n",
"7 Charlotte \n",
"8 Charlotte \n"
"3 Boston \n",
"4 San Francisco \n",
"5 Boston \n",
"6 Boston \n",
"7 San Francisco \n",
"8 Chicago \n"
]
}
],
......@@ -1277,7 +1284,7 @@
"import random\n",
"\n",
"\n",
"data = {'id_Address':[0,1,2,3],'Address':['gordon street', 'aqua boulevard', 'st georges street', '5th street'], 'City':['Boston','Chicago','Charlotte','San Francisco']} \n",
"data = {'id_Address':[0, 1, 2, 3], 'Address':['gordon street', 'aqua boulevard', 'st georges street', '5th street'], 'City':['Boston', 'Chicago', 'Charlotte', 'San Francisco']} \n",
" \n",
"# Create DataFrame \n",
"df_address = pd.DataFrame(data)\n",
......@@ -1286,13 +1293,13 @@
"df_person[\"id_Address\"] = \"\"\n",
"nb_elements = df_person.Name.count()\n",
"\n",
"cpt = 0 \n",
"while (cpt < nb_elements ):\n",
"cpt = 0\n",
"while(cpt < nb_elements):\n",
" df_person.loc[cpt,\"id_Address\"] = random.randint(0, 3)\n",
" cpt = cpt + 1\n",
"print(df_person)\n",
"\n",
"result = pd.merge(df_person, df_address, how='left',on='id_Address')\n",
"result = pd.merge(df_person, df_address, how='left', on='id_Address')\n",
"print(result)"
]
},
......@@ -1315,7 +1322,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 24,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1339,7 +1346,7 @@
],
"source": [
"print(df_person.groupby('Sex')['Sex'].count())\n",
"print(df_person.groupby('Sex')['Age'].mean())\n"
"print(df_person.groupby('Sex')['Age'].mean())"
]
},
{
......@@ -1350,12 +1357,12 @@
}
},
"source": [
"## Export Data "
"## Export data "
]
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 25,
"metadata": {
"slideshow": {
"slide_type": "subslide"
......@@ -1363,7 +1370,7 @@
},
"outputs": [],
"source": [
"export_csv = df_person.to_csv (r'./files/export_person.csv', index = None, header=True) "
"export_csv = df_person.to_csv(r'./files/export_person.csv', index=None, header=True) "
]
},
{
......@@ -1374,12 +1381,12 @@
}
},
"source": [
"## Plot Data"
"## Plot data"
]
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 26,
"metadata": {
"slideshow": {
"slide_type": "fragment"
......@@ -1389,21 +1396,23 @@
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f9b4e5cd438>"
"<matplotlib.axes._subplots.AxesSubplot at 0x7febf9fdf9b0>"
]
},
"execution_count": 22,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAEkCAYAAAAPYduFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADPVJREFUeJzt3X2MpfVZh/Hr211q6RskZWwIy3ZoQkjoGy0jSImkoH0TbBP/sRgbkzZuTLC0iYlW/UPwJdVqtIGo6aZUa61QtcXYNi2lKiipRWaRVrZLU9JQQUhYNLSgkXbh9o9zpk6XmZ1n65x5zj1zfZLNzpl59uy92ZMrz/zmd54nVYUkqY9njD2AJOn4GG5JasZwS1IzhluSmjHcktSM4ZakZgy3JDVjuCWpGcMtSc3snsWTnnLKKbW4uDiLp5akbenAgQOPVNXCkGNnEu7FxUWWl5dn8dSStC0l+frQY10qkaRmDLckNWO4JakZwy1JzRhuSWpm0K6SJPcBjwFPAkeqammWQ0mS1nc82wEvrqpHZjaJJGkQl0okqZmhZ9wFfDZJAe+vqv1HH5BkH7APYO/evZs34YwsvvtTY4+wrdz3W5eOPYK0Yww9476wql4FvBG4IslFRx9QVfuraqmqlhYWBr1rU5L0PRgU7qp6cPr7w8CNwHmzHEqStL4Nw53kOUmet/Ix8Drg7lkPJkla25A17hcCNyZZOf7Pq+ozM51KkrSuDcNdVV8DXrEFs0iSBnA7oCQ1Y7glqRnDLUnNGG5JasZwS1IzhluSmjHcktSM4ZakZgy3JDVjuCWpGcMtSc0YbklqxnBLUjOGW5KaMdyS1IzhlqRmDLckNWO4JakZwy1JzRhuSWrGcEtSM4Zbkpox3JLUjOGWpGYMtyQ1Y7glqRnDLUnNGG5JasZwS1IzhluSmjHcktSM4ZakZgaHO8muJP+S5JOzHEiSdGzHc8b9TuDQrAaRJA0zKNxJ9gCXAh+Y7TiSpI0MPeN+H/ALwFMznEWSNMDujQ5IchnwcFUdSPKaYxy3D9gHsHfv3k0bUNqRrjpp7Am2l6u+MfYEm2rIGfeFwJuS3AfcAFyS5M+OPqiq9lfVUlUtLSwsbPKYkqQVG4a7qn6pqvZU1SLwFuDvquqnZj6ZJGlN7uOWpGY2XONerapuAW6ZySSSpEE845akZgy3JDVjuCWpGcMtSc0YbklqxnBLUjOGW5KaMdyS