diff --git a/pres_numpy.ipynb b/pres_numpy.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..947987241d17bcb1b52e592ece5cc62b52de4796 --- /dev/null +++ b/pres_numpy.ipynb @@ -0,0 +1,844 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# A short introduction to NumPy\n", + "Strongly inspired by the UGA Python Introduction Course\n", + "https://gricad-gitlab.univ-grenoble-alpes.fr/python-uga/py-training-2017" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## A short introduction to NumPy\n", + "\n", + "Code using `numpy` usually starts with the import statement:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NumPy provides the type `np.ndarray`. Such arrays are multidimensional sequences of homogeneous elements. They can be created, for example, with the following commands:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([10. , 12.5, 15. , 17.5, 20. 
])" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# from a list\n", + "l = [10.0, 12.5, 15.0, 17.5, 20.0]\n", + "np.array(l)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1.27880790e-316, 0.00000000e+000, 6.91986808e-310, 1.57378525e-316])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fast, but the memory is not initialized, so the values can be anything\n", + "np.empty(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0.]])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# slower than np.empty but the values are all 0.\n", + "np.zeros([2, 6])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2, 3, 4) 24 float64\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[[1., 1., 1., 1.],\n", + " [1., 1., 1., 1.],\n", + " [1., 1., 1., 1.]],\n", + "\n", + " [[1., 1., 1., 1.],\n", + " [1., 1., 1., 1.],\n", + " [1., 1., 1., 1.]]])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# multidimensional array\n", + "a = np.ones([2, 3, 4])\n", + "print(a.shape, a.size, a.dtype)\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2, 3])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# like range, but produces a 1D NumPy array\n", + "np.arange(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/plain": [ + "array([0., 1., 2., 3.])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# np.arange can produce arrays of floats\n", + "np.arange(4.)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([10. , 12.5, 15. , 17.5, 20. ])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# another convenient function to generate 1D arrays\n", + "np.linspace(10, 20, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A NumPy array can be easily converted to a Python list." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[10.0, 12.5, 15.0, 17.5, 20.0]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = np.linspace(10, 20 ,5)\n", + "list(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[10.0, 12.5, 15.0, 17.5, 20.0]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Or even better: tolist() also converts the elements to native Python floats\n", + "a.tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Manipulating NumPy arrays" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access elements\n", + "Elements in a `numpy` array can be accessed using indexing and slicing in any dimension. NumPy also offers the same functionalities as those available in Fortran or Matlab.\n", + "\n", + "### Indexes and slices\n", + "For example, we can create an array `A` and perform any kind of selection operation on it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.89925962, 0.31519992, 0.17170063, 0.06102236, 0.6055506 ],\n", + " [0.43365108, 0.67461267, 0.34962124, 0.75648088, 0.53096922],\n", + " [0.65643503, 0.4723704 , 0.77202087, 0.50192904, 0.14067726],\n", + " [0.80709755, 0.2314217 , 0.65465368, 0.28459125, 0.54727527]])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = np.random.random([4, 5])\n", + "A" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4336510750584107" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the element from the second row, first column\n", + "A[1, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.89925962, 0.31519992, 0.17170063, 0.06102236, 0.6055506 ],\n", + " [0.43365108, 0.67461267, 0.34962124, 0.75648088, 0.53096922]])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the first two rows\n", + "A[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.6055506 , 0.53096922, 0.14067726, 0.54727527])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the last column\n", + "A[:, -1]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.89925962, 0.17170063, 0.6055506 ],\n", + " [0.43365108, 0.34962124, 0.53096922]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the 
first two rows and the columns with an even index\n", + "A[:2, ::2]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Using a mask to select elements satisfying a condition:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ True False False False True]\n", + " [False True False True True]\n", + " [ True False True True False]\n", + " [ True False True False True]]\n", + "[0.89925962 0.6055506 0.67461267 0.75648088 0.53096922 0.65643503\n", + " 0.77202087 0.50192904 0.80709755 0.65465368 0.54727527]\n" + ] + } + ], + "source": [ + "cond = A > 0.5\n", + "print(cond)\n", + "print(A[cond])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The mask is in fact a particular case of the advanced indexing capabilities provided by NumPy. For example, it is even possible to use lists for indexing:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.89925962 0.31519992 0.17170063 0.06102236 0.6055506 ]\n", + " [0.43365108 0.67461267 0.34962124 0.75648088 0.53096922]\n", + " [0.65643503 0.4723704 0.77202087 0.50192904 0.14067726]\n", + " [0.80709755 0.2314217 0.65465368 0.28459125 0.54727527]]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[0.89925962, 0.31519992, 0.6055506 ],\n", + " [0.43365108, 0.67461267, 0.53096922],\n", + " [0.65643503, 0.4723704 , 0.14067726],\n", + " [0.80709755, 0.2314217 , 0.54727527]])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Selecting only particular columns\n", + "print(A)\n", + "A[:, [0, 1, 4]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Perform array 
manipulations\n", + "### Apply arithmetic operations to whole arrays (element-wise):" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[34.80126403, 28.25135024, 26.7464874 , 25.61394735, 31.42219749],\n", + " [29.52456401, 32.20122896, 28.61844741, 33.13707212, 30.59162046],\n", + " [31.99525724, 29.94683782, 33.31622493, 30.27122313, 26.42656267],\n", + " [33.72238198, 27.36777304, 31.97510827, 27.92690466, 30.77226288]])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(A+5)**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply functions element-wise:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2.45778274, 1.37053329, 1.18732233, 1.06292268, 1.83226077],\n", + " [1.54288042, 1.9632724 , 1.41853016, 2.13076459, 1.70057974],\n", + " [1.92790714, 1.60379132, 2.16413527, 1.65190478, 1.15105309],\n", + " [2.24139301, 1.26039064, 1.92447592, 1.3292186 , 1.72853679]])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.exp(A) # With numpy arrays, use the functions from numpy !" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting parts of arrays" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0. 0.31519992 0.17170063 0.06102236 0.6055506 ]\n", + " [0. 0.67461267 0.34962124 0.75648088 0.53096922]\n", + " [0. 0.4723704 0.77202087 0.50192904 0.14067726]\n", + " [0. 
0.2314217 0.65465368 0.28459125 0.54727527]]\n" + ] + } + ], + "source": [ + "A[:, 0] = 0.\n", + "print(A)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 3.17258959 5.82409047 16.387435 1.65138967]\n", + " [ 0. 1.48233207 2.86023812 1.32191048 1.88334836]\n", + " [ 0. 2.11698277 1.29530177 1.99231351 7.10846954]\n", + " [ 0. 4.32111589 1.5275252 3.51381149 1.82723405]]\n" + ] + } + ], + "source": [ + "# BONUS: Safe element-wise inverse with masks\n", + "cond = (A != 0)\n", + "A[cond] = 1./A[cond]\n", + "print(A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Attributes and methods of `np.ndarray` (see the [doc](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html#numpy.ndarray))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['T', 'all', 'any', 'argmax', 'argmin', 'argpartition', 'argsort', 'astype', 'base', 'byteswap', 'choose', 'clip', 'compress', 'conj', 'conjugate', 'copy', 'ctypes', 'cumprod', 'cumsum', 'data', 'diagonal', 'dot', 'dtype', 'dump', 'dumps', 'fill', 'flags', 'flat', 'flatten', 'getfield', 'imag', 'item', 'itemset', 'itemsize', 'max', 'mean', 'min', 'nbytes', 'ndim', 'newbyteorder', 'nonzero', 'partition', 'prod', 'ptp', 'put', 'ravel', 'real', 'repeat', 'reshape', 'resize', 'round', 'searchsorted', 'setfield', 'setflags', 'shape', 'size', 'sort', 'squeeze', 'std', 'strides', 'sum', 'swapaxes', 'take', 'tobytes', 'tofile', 'tolist', 'tostring', 'trace', 'transpose', 'var', 'view']\n" + ] + } + ], + "source": [ + "print([s for s in dir(A) if not s.startswith('__')])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ 
+ "[[ 0. 3.17258959 5.82409047 16.387435 1.65138967]\n", + " [ 0. 1.48233207 2.86023812 1.32191048 1.88334836]\n", + " [ 0. 2.11698277 1.29530177 1.99231351 7.10846954]\n", + " [ 0. 4.32111589 1.5275252 3.51381149 1.82723405]]\n", + "Mean value 2.9143043986324475\n", + "Mean line [0. 2.77325508 2.87678889 5.80386762 3.1176104 ]\n", + "Mean column [5.40710095 1.50956581 2.50261352 2.23793733]\n" + ] + } + ], + "source": [ + "# Ex1: Get the mean through different dimensions\n", + "print(A)\n", + "print('Mean value', A.mean())\n", + "print('Mean line', A.mean(axis=0))\n", + "print('Mean column', A.mean(axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 3.17258959 5.82409047 16.387435 1.65138967]\n", + " [ 0. 1.48233207 2.86023812 1.32191048 1.88334836]\n", + " [ 0. 2.11698277 1.29530177 1.99231351 7.10846954]\n", + " [ 0. 4.32111589 1.5275252 3.51381149 1.82723405]] (4, 5)\n", + "[ 0. 3.17258959 5.82409047 16.387435 1.65138967 0.\n", + " 1.48233207 2.86023812 1.32191048 1.88334836 0. 2.11698277\n", + " 1.29530177 1.99231351 7.10846954 0. 4.32111589 1.5275252\n", + " 3.51381149 1.82723405] (20,)\n" + ] + } + ], + "source": [ + "# Ex2: Convert a 2D array in 1D keeping all elements\n", + "print(A, A.shape)\n", + "A_flat = A.flatten()\n", + "print(A_flat, A_flat.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Remark: dot product" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 
10.]\n", + "385.0\n" + ] + } + ], + "source": [ + "b = np.linspace(0, 10, 11)\n", + "c = b @ b\n", + "# before Python 3.5 (no @ operator):\n", + "# c = b.dot(b)\n", + "print(b)\n", + "print(c)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For Matlab users\n", + "\n", + "| ` ` | Matlab | NumPy |\n", + "| ------------- | ------ | ----- |\n", + "| element-wise | `.*` | `*` |\n", + "| dot product | `*` | `@` |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`numpy` arrays can also be sorted, even when they are composed of complex data, provided that the types of the columns are explicitly stated with a `dtype`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "#### NumPy and SciPy sub-packages:\n", + "\n", + "We already used `numpy.random` to generate `numpy` arrays filled with random values. This submodule also provides functions related to distributions (Poisson, Gaussian, etc.) and permutations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To perform linear algebra with dense matrices, we can use the submodule `numpy.linalg`. 
For instance, in order to compute the determinant of a random matrix, we use the function `det`:" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.47138506 0.41353868 0.09441948 0.225147 0.82335198]\n", + " [0.04490952 0.14682972 0.31792846 0.22918746 0.73823443]\n", + " [0.50485749 0.99705961 0.51896582 0.93318595 0.11375617]\n", + " [0.37148317 0.0477689 0.29061475 0.41826056 0.47950005]\n", + " [0.70324502 0.82838271 0.92172528 0.79532669 0.56698101]]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.06968780805887545" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = np.random.random([5,5])\n", + "print(A)\n", + "np.linalg.det(A)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.14682972 0.31792846]\n", + " [0.99705961 0.51896582]]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[-2.15522717, 1.32033369],\n", + " [ 4.14071576, -0.6097731 ]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "squared_subA = A[1:3, 1:3]\n", + "print(squared_subA)\n", + "np.linalg.inv(squared_subA)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Introduction to Pandas: Python Data Analysis Library\n", + "\n", + "Pandas is an open-source library providing high-performance, easy-to-use data structures and data analysis tools for Python.\n", + "\n", + "- [Pandas tutorial](https://pandas.pydata.org/pandas-docs/stable/10min.html)\n", + "- [Grenoble Python Working Session](https://github.com/iutzeler/Pres_Pandas/)\n", + "- [Pandas for SQL Users](https://hackernoon.com/pandas-cheatsheet-for-sql-people-part-1-2976894acd0)" + ] + } + ], + "metadata": { + 
"celltoolbar": "Diaporama", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}