Commit f28f4f37 authored by Jean-Luc Parouty

Update VAE07 for ci

parent a434f008
%% Cell type:markdown id: tags:

<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>

# <!-- TITLE --> [VAE6] - Preparation of the CelebA dataset
<!-- DESC --> Preparation of a clustered dataset, batchable
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->

## Objectives :
- Formatting our dataset in **cluster files**, using batch mode
- Adapting a notebook for batch use

The [CelebFaces Attributes Dataset (CelebA)](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) contains about 200,000 images (202599, 218, 178, 3).

## What we're going to do :
- Read the images
- Resize and normalize them
- Build clusters of images in npy format
%% Cell type:markdown id: tags:

## Step 1 - Import and init
### 1.1 - Import

%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from skimage import io, transform

import os, pathlib, time, sys, json, glob
import csv
import math, random

from importlib import reload

sys.path.append('..')
import fidle.pwk as pwk

datasets_dir = pwk.init('VAE6')
```
%% Output

**FIDLE 2020 - Practical Work Module**

Version : 0.6.1 DEV
Notebook id : VAE6
Run time : Saturday 2 January 2021, 17:04:58
TensorFlow version : 2.2.0
Keras version : 2.3.0-tf
Datasets dir : /home/pjluc/datasets/fidle
Run dir : ./run
CI running mode : none
Update keras cache : False
Save figs : True
Path figs : ./run/figs
%% Cell type:markdown id: tags:

### 1.2 - Directories and files :

%% Cell type:code id: tags:

``` python
dataset_csv = f'{datasets_dir}/celeba/origine/list_attr_celeba.csv'
dataset_img = f'{datasets_dir}/celeba/origine/img_align_celeba'
```
%% Cell type:markdown id: tags:

## Step 2 - Read and shuffle filenames catalog

%% Cell type:code id: tags:

``` python
dataset_desc = pd.read_csv(dataset_csv, header=0)
dataset_desc = dataset_desc.reindex(np.random.permutation(dataset_desc.index))
```
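%% Cell type:markdown id: tags:

A quick, optional look at the shuffled catalog; the only assumption here is that `list_attr_celeba.csv` contains an `image_id` column followed by the 40 binary attribute columns :

%% Cell type:code id: tags:

``` python
# ---- Optional sanity check on the catalog (illustration only)
#
print(f'Catalog length : {len(dataset_desc)}')
print(f'First columns  : {list(dataset_desc.columns[:4])}')
dataset_desc.head(3)
```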
%% Cell type:markdown id: tags:

## Step 3 - Save as clusters of n images

%% Cell type:markdown id: tags:

### 3.1 - Cooking function

%% Cell type:code id: tags:

``` python
def read_and_save( dataset_img, dataset_desc, scale=1,
                   cluster_size=1000, cluster_dir='./dataset_cluster', cluster_name='images',
                   image_size=(128,128)):
    global pwk

    def save_cluster(imgs, desc, cols, id):
        file_img  = f'{cluster_dir}/{cluster_name}-{id:03d}.npy'
        file_desc = f'{cluster_dir}/{cluster_name}-{id:03d}.csv'
        np.save(file_img, np.array(imgs))
        df = pd.DataFrame(data=desc, columns=cols)
        df.to_csv(file_desc, index=False)
        return [], [], id+1

    pwk.chrono_start()
    cols = list(dataset_desc.columns)

    # ---- Check if cluster files already exist
    #
    if os.path.isfile(f'{cluster_dir}/{cluster_name}-000.npy'):
        print('\n*** Oops. There are already clusters in the target folder!\n')
        return 0, 0
    pwk.mkdir(cluster_dir)

    # ---- Scale
    #
    n            = int(len(dataset_desc)*scale)
    dataset      = dataset_desc[:n]
    cluster_size = int(cluster_size*scale)
    pwk.subtitle('Parameters :')
    print(f'Scale is : {scale}')
    print(f'Image size is : {image_size}')
    print(f'dataset length is : {n}')
    print(f'cluster size is : {cluster_size}')
    print(f'clusters nb is : {int(n/cluster_size + 1)}')
    print(f'cluster dir is : {cluster_dir}')

    # ---- Read and save clusters
    #
    pwk.subtitle('Running...')
    imgs, desc, cluster_id = [], [], 0
    #
    for i, row in dataset.iterrows():
        #
        filename = f'{dataset_img}/{row.image_id}'
        #
        # ---- Read image, resize (and normalize)
        #
        img = io.imread(filename)
        img = transform.resize(img, image_size)
        #
        # ---- Add image and description
        #
        imgs.append( img )
        desc.append( row.values )
        #
        # ---- Progress bar
        #
        pwk.update_progress(f'Cluster {cluster_id:03d} :', len(imgs), cluster_size)
        #
        # ---- Save cluster if full
        #
        if len(imgs) == cluster_size:
            imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    # ---- Save incomplete last cluster
    #
    if len(imgs) > 0:
        imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    duration = pwk.chrono_stop()
    return cluster_id, duration
```
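%% Cell type:markdown id: tags:

Note that the normalization is implicit : for a `uint8` input, `skimage.transform.resize` returns a `float64` array with values in [0,1]. A quick, optional check on a single image (an illustrative addition) :

%% Cell type:code id: tags:

``` python
# ---- Check the implicit normalization done by transform.resize
#
sample = dataset_desc.iloc[0]
img    = io.imread(f'{dataset_img}/{sample.image_id}')
print('raw     :', img.dtype, img.shape, img.min(), img.max())   # uint8, (218, 178, 3), values in 0..255
img    = transform.resize(img, (128,128))
print('resized :', img.dtype, img.shape, img.min(), img.max())   # float64, values in [0,1]
```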
%% Cell type:markdown id: tags:

### 3.2 - Cluster building

The whole dataset will be used for training.
Reading the 200,000 images can take a long time **(>20 minutes)** and a lot of space **(>170 GB)**.

Example :
- image size 128x128 : 74 GB
- image size 192x160 : 138 GB

(These figures follow from the storage format; see the size estimate sketch below.)

You can define these parameters :
`scale` : 1 means 100% of the dataset - set 0.05 for tests
`image_size` : image size in the clusters, should be 128x128 or 192x160 (original is 218x178)
`output_dir` : where to write the clusters, could be :
- `./data`, for test purposes
- `<datasets_dir>/celeba/enhanced` to add clusters to your datasets dir.
`cluster_size` : number of images in a cluster, 10000 is fine (will be adjusted by scale)

**Note :** If the target folder is not empty, the construction is blocked.
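%% Cell type:markdown id: tags:

The size estimates above can be checked with simple arithmetic : each image is stored as height x width x 3 channels x 8 bytes, since `transform.resize` outputs `float64`. A minimal sketch (an addition, assuming float64 storage) :

%% Cell type:code id: tags:

``` python
# ---- Back-of-the-envelope size estimate, assuming float64 storage
#
n_images = 202599
for h, w in [(128,128), (192,160)]:
    size_bytes = n_images * h * w * 3 * 8        # 3 channels, 8 bytes per float64
    print(f'{h}x{w} : {size_bytes/1024**3:.0f} GB')
# -> about 74 GB and 139 GB, consistent with the figures above
```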
%% Cell type:code id: tags:

``` python
# ---- Parameters you can change -----------------------------------

# ---- Tests
#
scale      = 0.1
image_size = (128,128)
output_dir = './data'

# ---- Full clusters generation, medium size
#
# scale      = 1.
# image_size = (128,128)
# output_dir = f'{datasets_dir}/celeba/enhanced'

# ---- Full clusters generation, large size
#
# scale      = 1.
# image_size = (192,160)
# output_dir = f'{datasets_dir}/celeba/enhanced'
```
%% Cell type:code id: tags:

``` python
# ---- Used for continuous integration - just ignore these 3 lines
#
scale      = pwk.override('scale',      scale)
image_size = pwk.override('image_size', image_size)
output_dir = pwk.override('output_dir', output_dir)

# ---- Build clusters
#
cluster_size = 10000

lx, ly      = image_size
cluster_dir = f'{output_dir}/clusters-{lx}x{ly}'

cluster_nb, duration = read_and_save( dataset_img, dataset_desc,
                                      scale        = scale,
                                      cluster_size = cluster_size,
                                      cluster_dir  = cluster_dir,
                                      image_size   = image_size )

# ---- Conclusion...
#
directory = pathlib.Path(cluster_dir)
s = sum(f.stat().st_size for f in directory.glob('**/*') if f.is_file())

pwk.subtitle('Conclusion :')
print('Duration : ', pwk.hdelay(duration))
print('Size : ', pwk.hsize(s))
```
%% Output

<br>**Parameters :**

Scale is : 0.1
Image size is : (128, 128)
dataset length is : 20259
cluster size is : 1000
clusters nb is : 21
cluster dir is : ./data/clusters-128x128

<br>**Running...**

Cluster 000 : [########################################] 100.0% of 1000
Cluster 001 : [########################################] 100.0% of 1000
Cluster 002 : [########################################] 100.0% of 1000
Cluster 003 : [########################################] 100.0% of 1000
Cluster 004 : [########################################] 100.0% of 1000
Cluster 005 : [########################################] 100.0% of 1000
Cluster 006 : [########################################] 100.0% of 1000
Cluster 007 : [########################################] 100.0% of 1000
Cluster 008 : [########################################] 100.0% of 1000
Cluster 009 : [########################################] 100.0% of 1000
Cluster 010 : [########################################] 100.0% of 1000
Cluster 011 : [########################################] 100.0% of 1000
Cluster 012 : [########################################] 100.0% of 1000
Cluster 013 : [########################################] 100.0% of 1000
Cluster 014 : [########################################] 100.0% of 1000
Cluster 015 : [########################################] 100.0% of 1000
Cluster 016 : [########################################] 100.0% of 1000
Cluster 017 : [########################################] 100.0% of 1000
Cluster 018 : [########################################] 100.0% of 1000
Cluster 019 : [########################################] 100.0% of 1000
Cluster 020 : [##########------------------------------] 25.0% of 1000

<br>**Conclusion :**

Duration : 0:01:59
Size : 7.4 Go
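%% Cell type:markdown id: tags:

Downstream notebooks (VAE7) can now consume these clusters batch by batch. A minimal sketch of how a cluster pair can be read back - the `load_cluster` helper is purely illustrative, not part of `fidle.pwk` :

%% Cell type:code id: tags:

``` python
# ---- Illustrative reader for the (npy, csv) cluster pairs written above
#
def load_cluster(cluster_dir, cluster_name, id):
    """Load one (images, description) pair saved by read_and_save()."""
    imgs = np.load(f'{cluster_dir}/{cluster_name}-{id:03d}.npy')
    desc = pd.read_csv(f'{cluster_dir}/{cluster_name}-{id:03d}.csv', header=0)
    return imgs, desc

# Example : walk through every cluster saved above
for id in range(cluster_nb):
    x, d = load_cluster(cluster_dir, 'images', id)
    print(f'Cluster {id:03d} : x={x.shape}  desc={len(d)} rows')
```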
%% Cell type:code id: tags:

``` python
pwk.end()
```

%% Output

End time is : Saturday 2 January 2021, 17:06:59
Duration is : 00:02:00 412ms
This notebook ends here
%% Cell type:markdown id: tags:

---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>
@@ -25,6 +25,11 @@ DEFAULT_NOTEBOOK_NAME = "Unknown"
 FIDLE_MPLSTYLE = '../fidle/mplstyles/custom.mplstyle'
 FIDLE_CSSFILE  = '../fidle/css/custom.css'
 
+# ---- Save figs or not (yes|no)
+#      Overridden by env : FIDLE_SAVE_FIGS
+#
+DEFAULT_SAVE_FIGS = 'yes'
+
 # ---- Catalog file, a json description of all notebooks
 #
 CATALOG_FILE = '../fidle/log/catalog.json'

@@ -43,11 +48,6 @@ CI_REPORT = '../fidle/log/ci_report.html'
 #
 DEFAULT_RUNNING_MODE = 'none'
 
-# ---- Save figs or not (yes|no)
-#      Overridden by env : FIDLE_SAVE_FIGS
-#
-DEFAULT_SAVE_FIGS = 'yes'
-
 # ---- CI Override parameters
 #
 GTSRB1_smart_scale = 0.1

@@ -60,4 +60,9 @@ VAE6_smart_image_size = (128,128)
 VAE6_smart_output_dir = './data'
 VAE6_full_scale = 1
 VAE6_full_image_size = (192,160)
-VAE6_full_output_dir = '{datasets_dir}/GTSRB/enhanced'
+VAE6_full_output_dir = '{datasets_dir}/celeba/enhanced'
+
+VAE7_smart_image_size = (128,128)
+VAE7_smart_enhanced_dir = './data'
+VAE7_full_image_size = (192,160)
+VAE7_full_enhanced_dir = '{datasets_dir}/celeba/enhanced'
...@@ -127,9 +127,9 @@ ...@@ -127,9 +127,9 @@
}, },
"VAE6": { "VAE6": {
"path": "/home/pjluc/dev/fidle/VAE", "path": "/home/pjluc/dev/fidle/VAE",
"start": "Thursday 31 December 2020, 12:49:25", "start": "Saturday 2 January 2021, 17:04:58",
"end": "Thursday 31 December 2020, 12:50:38", "end": "Saturday 2 January 2021, 17:06:59",
"duration": "00:01:13 121ms" "duration": "00:02:00 412ms"
}, },
"GTS1": { "GTS1": {
"path": "/home/pjluc/dev/fidle/GTSRB", "path": "/home/pjluc/dev/fidle/GTSRB",
...@@ -142,5 +142,11 @@ ...@@ -142,5 +142,11 @@
"start": "Thursday 31 December 2020, 12:34:29", "start": "Thursday 31 December 2020, 12:34:29",
"end": "Thursday 31 December 2020, 12:36:22", "end": "Thursday 31 December 2020, 12:36:22",
"duration": "00:01:53 128ms" "duration": "00:01:53 128ms"
},
"VAE7": {
"path": "/home/pjluc/dev/fidle/VAE",
"start": "Saturday 2 January 2021, 17:28:39",
"end": "Saturday 2 January 2021, 17:28:47",
"duration": "00:00:08 736ms"
} }
} }
\ No newline at end of file