Commit c322a913 authored by Jean-Luc Parouty
VAE Test
parent e968b335
%% Cell type:markdown id: tags:
<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>
# <!-- TITLE --> [VAE6] - Preparation of the CelebA dataset
<!-- DESC --> Preparation of a clustered dataset, batchable
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->
## Objectives :
- Formatting our dataset in **cluster files**, using batch mode
- Adapting a notebook for batch use
The [CelebFaces Attributes Dataset (CelebA)](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) contains 202,599 images of shape (218, 178, 3).
## What we're going to do :
- Read the images
- Resize and normalize them
- Build clusters of images in npy format
%% Cell type:markdown id: tags:
## Step 1 - Import and init
### 1.1 - Import
%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from skimage import io, transform
import os,pathlib,time,sys,json,glob
import csv
import math, random
from importlib import reload
sys.path.append('..')
import fidle.pwk as pwk
datasets_dir = pwk.init('VAE6')
```
%% Output
**FIDLE 2020 - Practical Work Module**
Version : 0.6.1 DEV
Notebook id : VAE6
Run time : Monday 4 January 2021, 23:45:46
TensorFlow version : 2.2.0
Keras version : 2.3.0-tf
Datasets dir : /home/pjluc/datasets/fidle
Run dir : ./run
CI running mode : none
Update keras cache : False
Save figs : True
Path figs : ./run/figs
%% Cell type:markdown id: tags:
### 1.2 - Directories and files :
%% Cell type:code id: tags:
``` python
dataset_csv = f'{datasets_dir}/celeba/origine/list_attr_celeba.csv'
dataset_img = f'{datasets_dir}/celeba/origine/img_align_celeba'
```
%% Cell type:markdown id: tags:
## Step 2 - Read and shuffle filenames catalog
%% Cell type:code id: tags:
``` python
dataset_desc = pd.read_csv(dataset_csv, header=0)
dataset_desc = dataset_desc.reindex(np.random.permutation(dataset_desc.index))
```
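The shuffle above relies on a common pandas idiom: reindexing the frame with a random permutation of its index, which reorders rows without dropping any. A toy illustration on a hypothetical 5-row catalog:

``` python
import numpy as np
import pandas as pd

# Hypothetical mini-catalog standing in for list_attr_celeba.csv
df = pd.DataFrame({'image_id': [f'{i:06d}.jpg' for i in range(1, 6)]})

# Same idiom as above: reindex with a random permutation of the index
np.random.seed(0)   # fixed seed, for reproducibility of this illustration only
shuffled = df.reindex(np.random.permutation(df.index))

# Every row survives the shuffle, only the order changes
assert sorted(shuffled.image_id) == sorted(df.image_id)
```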
%% Cell type:markdown id: tags:
## Step 3 - Save as clusters of n images
%% Cell type:markdown id: tags:
### 3.1 - Cooking function
%% Cell type:code id: tags:
``` python
def read_and_save( dataset_img, dataset_desc, scale=1,
                   cluster_size=1000, cluster_dir='./dataset_cluster', cluster_name='images',
                   image_size=(128,128)):
    global pwk

    def save_cluster(imgs, desc, cols, id):
        file_img  = f'{cluster_dir}/{cluster_name}-{id:03d}.npy'
        file_desc = f'{cluster_dir}/{cluster_name}-{id:03d}.csv'
        np.save(file_img, np.array(imgs))
        df = pd.DataFrame(data=desc, columns=cols)
        df.to_csv(file_desc, index=False)
        return [], [], id+1

    pwk.chrono_start()
    cols = list(dataset_desc.columns)

    # ---- Check if cluster files already exist
    #
    if os.path.isfile(f'{cluster_dir}/images-000.npy'):
        print('\n*** Oops. There are already clusters in the target folder!\n')
        return 0, 0
    pwk.mkdir(cluster_dir)

    # ---- Scale
    #
    n            = int(len(dataset_desc)*scale)
    dataset      = dataset_desc[:n]
    cluster_size = int(cluster_size*scale)
    pwk.subtitle('Parameters :')
    print(f'Scale is : {scale}')
    print(f'Image size is : {image_size}')
    print(f'dataset length is : {n}')
    print(f'cluster size is : {cluster_size}')
    print(f'clusters nb is : {math.ceil(n/cluster_size)}')
    print(f'cluster dir is : {cluster_dir}')

    # ---- Read and save clusters
    #
    pwk.subtitle('Running...')
    imgs, desc, cluster_id = [], [], 0

    for i, row in dataset.iterrows():

        filename = f'{dataset_img}/{row.image_id}'

        # ---- Read image, resize (and normalize to [0,1])
        #
        img = io.imread(filename)
        img = transform.resize(img, image_size)

        # ---- Add image and description
        #
        imgs.append(img)
        desc.append(row.values)

        # ---- Progress bar
        #
        pwk.update_progress(f'Cluster {cluster_id:03d} :', len(imgs), cluster_size)

        # ---- Save cluster if full
        #
        if len(imgs) == cluster_size:
            imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    # ---- Save incomplete last cluster
    #
    if len(imgs) > 0:
        imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    duration = pwk.chrono_stop()
    return cluster_id, duration
```
%% Cell type:markdown id: tags:
### 3.2 - Cluster building
The whole dataset will be used for training.
Reading the 200,000 images can take a long time **(>20 minutes)** and a lot of disk space **(>170 GB)**.
Example :
Image size 128x128 : 74 GB
Image size 192x160 : 138 GB
You can set these parameters :
`scale` : 1 means 100% of the dataset - set 0.05 for tests
`image_size` : size of the images in the clusters, e.g. (128,128) or (192,160) (original is (218,178))
`output_dir` : where to write the clusters, can be :
- `./data`, for test purposes
- `<datasets_dir>/celeba/enhanced` to add the clusters to your datasets dir.
`cluster_size` : number of images per cluster; 10000 is fine (it will be adjusted by `scale`)
**Note :** If the target folder is not empty, the construction is blocked.
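The sizes quoted above follow from simple arithmetic: `skimage.transform.resize` returns float64 pixels, so each image costs h × w × 3 × 8 bytes. A quick check against the 202,599 images of the dataset:

``` python
N_IMAGES = 202599                        # images in CelebA

def clusters_size_gb(h, w, n=N_IMAGES):
    """Estimated total size of n images of h x w x 3 float64 pixels, in GiB."""
    return n * h * w * 3 * 8 / 1024**3   # bytes -> GiB

print(f'128x128 : {clusters_size_gb(128, 128):.1f} GB')   # ~74 GB
print(f'192x160 : {clusters_size_gb(192, 160):.1f} GB')   # ~139 GB
```

This matches the ~74 GB and ~138 GB figures above; storing the images as uint8 instead of float64 would divide these sizes by 8, at the cost of a conversion step when loading.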
%% Cell type:code id: tags:
``` python
# ---- Parameters you can change -----------------------------------
# ---- Tests
scale = 0.2
image_size = (128,128)
output_dir = './data'
# ---- Full clusters generation, medium size
# scale = 1.
# image_size = (128,128)
# output_dir = f'{datasets_dir}/celeba/enhanced'
# ---- Full clusters generation, large size
# scale = 1.
# image_size = (192,160)
# output_dir = f'{datasets_dir}/celeba/enhanced'
```
%% Cell type:code id: tags:
``` python
# ---- Used for continuous integration - just ignore these 3 lines
#
scale = pwk.override('scale', scale)
image_size = pwk.override('image_size', image_size)
output_dir = pwk.override('output_dir', output_dir)
# ---- Build clusters
#
cluster_size = 10000
lx,ly = image_size
cluster_dir = f'{output_dir}/clusters-{lx}x{ly}'
cluster_nb,duration = read_and_save( dataset_img, dataset_desc,
scale = scale,
cluster_size = cluster_size,
cluster_dir = cluster_dir,
image_size = image_size )
# ---- Conclusion...
directory = pathlib.Path(cluster_dir)
s=sum(f.stat().st_size for f in directory.glob('**/*') if f.is_file())
pwk.subtitle('Conclusion :')
print('Duration : ',pwk.hdelay(duration))
print('Size : ',pwk.hsize(s))
```
%% Output
<br>**Parameters :**
Scale is : 0.2
Image size is : (128, 128)
dataset length is : 40519
cluster size is : 2000
clusters nb is : 21
cluster dir is : ./data/clusters-128x128
<br>**Running...**
Cluster 000 : [########################################] 100.0% of 2000
Cluster 001 : [########################################] 100.0% of 2000
Cluster 002 : [########################################] 100.0% of 2000
Cluster 003 : [########################################] 100.0% of 2000
Cluster 004 : [########################################] 100.0% of 2000
Cluster 005 : [########################################] 100.0% of 2000
Cluster 006 : [########################################] 100.0% of 2000
Cluster 007 : [########################################] 100.0% of 2000
Cluster 008 : [########################################] 100.0% of 2000
Cluster 009 : [########################################] 100.0% of 2000
Cluster 010 : [########################################] 100.0% of 2000
Cluster 011 : [########################################] 100.0% of 2000
Cluster 012 : [########################################] 100.0% of 2000
Cluster 013 : [########################################] 100.0% of 2000
Cluster 014 : [########################################] 100.0% of 2000
Cluster 015 : [########################################] 100.0% of 2000
Cluster 016 : [########################################] 100.0% of 2000
Cluster 017 : [########################################] 100.0% of 2000
Cluster 018 : [########################################] 100.0% of 2000
Cluster 019 : [########################################] 100.0% of 2000
Cluster 020 : [##########------------------------------] 25.0% of 2000
<br>**Conclusion :**
Duration : 0:05:06
Size : 14.8 Go
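Each cluster written above can later be read back as an (n, h, w, 3) array plus its attribute table. A minimal loader sketch (the `load_cluster` helper name is ours, not part of `fidle.pwk`):

``` python
import numpy as np
import pandas as pd

def load_cluster(cluster_dir, cluster_id, cluster_name='images'):
    """Load one cluster saved by read_and_save(): .npy images + .csv attributes."""
    imgs = np.load(f'{cluster_dir}/{cluster_name}-{cluster_id:03d}.npy')
    desc = pd.read_csv(f'{cluster_dir}/{cluster_name}-{cluster_id:03d}.csv')
    return imgs, desc

# Usage (assuming clusters were written to ./data/clusters-128x128):
# imgs, desc = load_cluster('./data/clusters-128x128', 0)
```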
%% Cell type:code id: tags:
``` python
pwk.end()
```
%% Output
End time is : Monday 4 January 2021, 23:50:54
Duration is : 00:05:08 166ms
This notebook ends here
%% Cell type:markdown id: tags:
---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>