Commit f28f4f37 authored by Jean-Luc Parouty

Update VAE07 for ci

parent a434f008
%% Cell type:markdown id: tags:

<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>

# <!-- TITLE --> [VAE6] - Preparation of the CelebA dataset
<!-- DESC --> Preparation of a clustered dataset, batchable
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->

## Objectives :
- Formatting our dataset in **cluster files**, using batch mode
- Adapting a notebook for batch use

The [CelebFaces Attributes Dataset (CelebA)](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) contains about 200,000 images (202599, 218, 178, 3).

## What we're going to do :
- Read the images
- Resize and normalize them
- Build clusters of images in npy format
%% Cell type:markdown id: tags:

## Step 1 - Import and init
### 1.1 - Import

%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from skimage import io, transform

import os, pathlib, time, sys, json, glob
import csv
import math, random

from importlib import reload

sys.path.append('..')
import fidle.pwk as pwk

datasets_dir = pwk.init('VAE6')
```
%% Output

**FIDLE 2020 - Practical Work Module**

Version : 0.6.1 DEV
Notebook id : VAE6
Run time : Saturday 2 January 2021, 17:04:58
TensorFlow version : 2.2.0
Keras version : 2.3.0-tf
Datasets dir : /home/pjluc/datasets/fidle
Run dir : ./run
CI running mode : none
Update keras cache : False
Save figs : True
Path figs : ./run/figs
%% Cell type:markdown id: tags:

### 1.2 - Directories and files :

%% Cell type:code id: tags:

``` python
dataset_csv = f'{datasets_dir}/celeba/origine/list_attr_celeba.csv'
dataset_img = f'{datasets_dir}/celeba/origine/img_align_celeba'
```
%% Cell type:markdown id: tags:

## Step 2 - Read and shuffle filenames catalog

%% Cell type:code id: tags:

``` python
dataset_desc = pd.read_csv(dataset_csv, header=0)
dataset_desc = dataset_desc.reindex(np.random.permutation(dataset_desc.index))
```
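%% Cell type:markdown id: tags:

A quick, optional look at the shuffled catalog; the only assumption here is that `list_attr_celeba.csv` contains an `image_id` column followed by the 40 binary attribute columns :

%% Cell type:code id: tags:

``` python
# ---- Optional sanity check on the catalog (illustration only)
#
print(f'Catalog length : {len(dataset_desc)}')
print(f'First columns  : {list(dataset_desc.columns[:4])}')
dataset_desc.head(3)
```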
%% Cell type:markdown id: tags:

## Step 3 - Save as clusters of n images

%% Cell type:markdown id: tags:

### 3.1 - Cooking function

%% Cell type:code id: tags:

``` python
def read_and_save( dataset_img, dataset_desc, scale=1,
                   cluster_size=1000, cluster_dir='./dataset_cluster', cluster_name='images',
                   image_size=(128,128)):
    global pwk

    def save_cluster(imgs, desc, cols, id):
        file_img  = f'{cluster_dir}/{cluster_name}-{id:03d}.npy'
        file_desc = f'{cluster_dir}/{cluster_name}-{id:03d}.csv'
        np.save(file_img, np.array(imgs))
        df = pd.DataFrame(data=desc, columns=cols)
        df.to_csv(file_desc, index=False)
        return [], [], id+1

    pwk.chrono_start()
    cols = list(dataset_desc.columns)

    # ---- Check if cluster files already exist
    #
    if os.path.isfile(f'{cluster_dir}/{cluster_name}-000.npy'):
        print('\n*** Oops. There are already clusters in the target folder!\n')
        return 0, 0
    pwk.mkdir(cluster_dir)

    # ---- Scale
    #
    n            = int(len(dataset_desc)*scale)
    dataset      = dataset_desc[:n]
    cluster_size = int(cluster_size*scale)
    pwk.subtitle('Parameters :')
    print(f'Scale is : {scale}')
    print(f'Image size is : {image_size}')
    print(f'dataset length is : {n}')
    print(f'cluster size is : {cluster_size}')
    print(f'clusters nb is : {int(n/cluster_size + 1)}')
    print(f'cluster dir is : {cluster_dir}')

    # ---- Read and save clusters
    #
    pwk.subtitle('Running...')
    imgs, desc, cluster_id = [], [], 0
    #
    for i, row in dataset.iterrows():
        #
        filename = f'{dataset_img}/{row.image_id}'
        #
        # ---- Read image, resize (and normalize)
        #
        img = io.imread(filename)
        img = transform.resize(img, image_size)
        #
        # ---- Add image and description
        #
        imgs.append( img )
        desc.append( row.values )
        #
        # ---- Progress bar
        #
        pwk.update_progress(f'Cluster {cluster_id:03d} :', len(imgs), cluster_size)
        #
        # ---- Save cluster if full
        #
        if len(imgs) == cluster_size:
            imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    # ---- Save incomplete last cluster
    #
    if len(imgs) > 0:
        imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    duration = pwk.chrono_stop()
    return cluster_id, duration
```
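%% Cell type:markdown id: tags:

Note that the normalization is implicit : for a `uint8` input, `skimage.transform.resize` returns a `float64` array with values in [0,1]. A quick, optional check on a single image (an illustrative addition) :

%% Cell type:code id: tags:

``` python
# ---- Check the implicit normalization done by transform.resize
#
sample = dataset_desc.iloc[0]
img    = io.imread(f'{dataset_img}/{sample.image_id}')
print('raw     :', img.dtype, img.shape, img.min(), img.max())   # uint8, (218, 178, 3), values in 0..255
img    = transform.resize(img, (128,128))
print('resized :', img.dtype, img.shape, img.min(), img.max())   # float64, values in [0,1]
```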
%% Cell type:markdown id: tags:

### 3.2 - Cluster building

The whole dataset will be used for training.
Reading the 200,000 images can take a long time **(>20 minutes)** and a lot of space **(>170 GB)**.

Example :
- image size 128x128 : 74 GB
- image size 192x160 : 138 GB

(These figures follow from the storage format; see the size estimate sketch below.)

You can define these parameters :
`scale` : 1 means 100% of the dataset - set 0.05 for tests
`image_size` : image size in the clusters, should be 128x128 or 192x160 (original is 218x178)
`output_dir` : where to write the clusters, could be :
- `./data`, for test purposes
- `<datasets_dir>/celeba/enhanced` to add clusters to your datasets dir.
`cluster_size` : number of images in a cluster, 10000 is fine (will be adjusted by scale)

**Note :** If the target folder is not empty, the construction is blocked.
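%% Cell type:markdown id: tags:

The size estimates above can be checked with simple arithmetic : each image is stored as height x width x 3 channels x 8 bytes, since `transform.resize` outputs `float64`. A minimal sketch (an addition, assuming float64 storage) :

%% Cell type:code id: tags:

``` python
# ---- Back-of-the-envelope size estimate, assuming float64 storage
#
n_images = 202599
for h, w in [(128,128), (192,160)]:
    size_bytes = n_images * h * w * 3 * 8        # 3 channels, 8 bytes per float64
    print(f'{h}x{w} : {size_bytes/1024**3:.0f} GB')
# -> about 74 GB and 139 GB, consistent with the figures above
```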
%% Cell type:code id: tags:

``` python
# ---- Parameters you can change -----------------------------------

# ---- Tests
#
scale      = 0.1
image_size = (128,128)
output_dir = './data'

# ---- Full clusters generation, medium size
#
# scale      = 1.
# image_size = (128,128)
# output_dir = f'{datasets_dir}/celeba/enhanced'

# ---- Full clusters generation, large size
#
# scale      = 1.
# image_size = (192,160)
# output_dir = f'{datasets_dir}/celeba/enhanced'
```
%% Cell type:code id: tags:

``` python
# ---- Used for continuous integration - just ignore these 3 lines
#
scale      = pwk.override('scale',      scale)
image_size = pwk.override('image_size', image_size)
output_dir = pwk.override('output_dir', output_dir)

# ---- Build clusters
#
cluster_size = 10000

lx, ly      = image_size
cluster_dir = f'{output_dir}/clusters-{lx}x{ly}'

cluster_nb, duration = read_and_save( dataset_img, dataset_desc,
                                      scale        = scale,
                                      cluster_size = cluster_size,
                                      cluster_dir  = cluster_dir,
                                      image_size   = image_size )

# ---- Conclusion...
#
directory = pathlib.Path(cluster_dir)
s = sum(f.stat().st_size for f in directory.glob('**/*') if f.is_file())

pwk.subtitle('Conclusion :')
print('Duration : ', pwk.hdelay(duration))
print('Size : ', pwk.hsize(s))
```
%% Output

<br>**Parameters :**

Scale is : 0.1
Image size is : (128, 128)
dataset length is : 20259
cluster size is : 1000
clusters nb is : 21
cluster dir is : ./data/clusters-128x128

<br>**Running...**

Cluster 000 : [########################################] 100.0% of 1000
Cluster 001 : [########################################] 100.0% of 1000
Cluster 002 : [########################################] 100.0% of 1000
Cluster 003 : [########################################] 100.0% of 1000
Cluster 004 : [########################################] 100.0% of 1000
Cluster 005 : [########################################] 100.0% of 1000
Cluster 006 : [########################################] 100.0% of 1000
Cluster 007 : [########################################] 100.0% of 1000
Cluster 008 : [########################################] 100.0% of 1000
Cluster 009 : [########################################] 100.0% of 1000
Cluster 010 : [########################################] 100.0% of 1000
Cluster 011 : [########################################] 100.0% of 1000
Cluster 012 : [########################################] 100.0% of 1000
Cluster 013 : [########################################] 100.0% of 1000
Cluster 014 : [########################################] 100.0% of 1000
Cluster 015 : [########################################] 100.0% of 1000
Cluster 016 : [########################################] 100.0% of 1000
Cluster 017 : [########################################] 100.0% of 1000
Cluster 018 : [########################################] 100.0% of 1000
Cluster 019 : [########################################] 100.0% of 1000
Cluster 020 : [##########------------------------------] 25.0% of 1000

<br>**Conclusion :**

Duration : 0:01:59
Size : 7.4 Go
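%% Cell type:markdown id: tags:

Downstream notebooks (VAE7) can now consume these clusters batch by batch. A minimal sketch of how a cluster pair can be read back - the `load_cluster` helper is purely illustrative, not part of `fidle.pwk` :

%% Cell type:code id: tags:

``` python
# ---- Illustrative reader for the (npy, csv) cluster pairs written above
#
def load_cluster(cluster_dir, cluster_name, id):
    """Load one (images, description) pair saved by read_and_save()."""
    imgs = np.load(f'{cluster_dir}/{cluster_name}-{id:03d}.npy')
    desc = pd.read_csv(f'{cluster_dir}/{cluster_name}-{id:03d}.csv', header=0)
    return imgs, desc

# Example : walk through every cluster saved above
for id in range(cluster_nb):
    x, d = load_cluster(cluster_dir, 'images', id)
    print(f'Cluster {id:03d} : x={x.shape}  desc={len(d)} rows')
```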
%% Cell type:code id: tags:

``` python
pwk.end()
```

%% Output

End time is : Saturday 2 January 2021, 17:06:59
Duration is : 00:02:00 412ms
This notebook ends here
%% Cell type:markdown id: tags:

---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>
@@ -25,6 +25,11 @@ DEFAULT_NOTEBOOK_NAME = "Unknown"
 FIDLE_MPLSTYLE = '../fidle/mplstyles/custom.mplstyle'
 FIDLE_CSSFILE  = '../fidle/css/custom.css'
 
+# ---- Save figs or not (yes|no)
+#      Overridden by env : FIDLE_SAVE_FIGS
+#
+DEFAULT_SAVE_FIGS = 'yes'
+
 # ---- Catalog file, a json description of all notebooks
 #
 CATALOG_FILE = '../fidle/log/catalog.json'

@@ -43,11 +48,6 @@ CI_REPORT = '../fidle/log/ci_report.html'
 #
 DEFAULT_RUNNING_MODE = 'none'
 
-# ---- Save figs or not (yes|no)
-#      Overridden by env : FIDLE_SAVE_FIGS
-#
-DEFAULT_SAVE_FIGS = 'yes'
-
 # ---- CI Override parameters
 #
 GTSRB1_smart_scale = 0.1

@@ -60,4 +60,9 @@ VAE6_smart_image_size = (128,128)
 VAE6_smart_output_dir = './data'
 VAE6_full_scale = 1
 VAE6_full_image_size = (192,160)
-VAE6_full_output_dir = '{datasets_dir}/GTSRB/enhanced'
+VAE6_full_output_dir = '{datasets_dir}/celeba/enhanced'
+
+VAE7_smart_image_size = (128,128)
+VAE7_smart_enhanced_dir = './data'
+VAE7_full_image_size = (192,160)
+VAE7_full_enhanced_dir = '{datasets_dir}/celeba/enhanced'
...@@ -127,9 +127,9 @@ ...@@ -127,9 +127,9 @@
}, },
"VAE6": { "VAE6": {
"path": "/home/pjluc/dev/fidle/VAE", "path": "/home/pjluc/dev/fidle/VAE",
"start": "Thursday 31 December 2020, 12:49:25", "start": "Saturday 2 January 2021, 17:04:58",
"end": "Thursday 31 December 2020, 12:50:38", "end": "Saturday 2 January 2021, 17:06:59",
"duration": "00:01:13 121ms" "duration": "00:02:00 412ms"
}, },
"GTS1": { "GTS1": {
"path": "/home/pjluc/dev/fidle/GTSRB", "path": "/home/pjluc/dev/fidle/GTSRB",
...@@ -142,5 +142,11 @@ ...@@ -142,5 +142,11 @@
"start": "Thursday 31 December 2020, 12:34:29", "start": "Thursday 31 December 2020, 12:34:29",
"end": "Thursday 31 December 2020, 12:36:22", "end": "Thursday 31 December 2020, 12:36:22",
"duration": "00:01:53 128ms" "duration": "00:01:53 128ms"
},
"VAE7": {
"path": "/home/pjluc/dev/fidle/VAE",
"start": "Saturday 2 January 2021, 17:28:39",
"end": "Saturday 2 January 2021, 17:28:47",
"duration": "00:00:08 736ms"
} }
} }
\ No newline at end of file