<img width="800px" src="../fidle/img/00-Fidle-header-01.svg"></img>

# <!-- TITLE --> [VAE6] - Generation of a clustered dataset
<!-- DESC --> Episode 2 : Analysis of the CelebA dataset and creation of a clustered and usable dataset
<!-- AUTHOR : Jean-Luc Parouty (CNRS/SIMaP) -->

## Objectives :
 - Formatting our dataset in **cluster files**, using batch mode
 - Adapting a notebook for batch use


The [CelebFaces Attributes Dataset (CelebA)](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) contains about **200,000 images** (202599,218,178,3).  
The size and the number of files of this dataset make it impossible to use it as it is.  
A formatting in the form of clusters of n images is essential.


## What we're going to do :
 - Read the images
 - Resize and normalize them
 - Build clusters of images in npy format


## Step 1 - Import and init

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from skimage import io, transform

import os,pathlib,time,sys,json,glob
import csv
import math, random

from importlib import reload

sys.path.append('..')
import fidle.pwk as pwk

# Run directory of this notebook, handed to pwk.init
run_dir='./run/VAE6'
# pwk.init returns the datasets directory, used below to locate the CelebA files
datasets_dir = pwk.init('VAE6', run_dir)

## Step 2 - Parameters
The whole dataset will be used for training  
Reading the 200,000 images can take a long time **(>20 minutes)** and a lot of space **(>170 GB)**  
Example :  
Image Sizes: 128x128 : 74 GB  
Image Sizes: 192x160 : 138 GB  

You can define these parameters :  
`scale` : 1 means 100% of the dataset - set 0.05 for tests  
`image_size` : size of the images in the clusters, should be (128,128) or (192,160) - original size is (218,178)  
`output_dir` : where to write clusters, could be :
 - `./data`, for testing purposes
 - `<datasets_dir>/celeba/enhanced` to add clusters in your datasets dir.  
 
`cluster_size` : number of images in a cluster, 10000 is fine. (will be adjusted by scale)  
`progress_verbosity`: Verbosity of progress bar: 0=no progress, 1: progress bar, 2: One line

**Note :** If the target folder is not empty and exit_if_exist is True, the construction is blocked.  

In [None]:
# ---- Parameters you can change -----------------------------------
#      Exactly one of the four presets below should be uncommented.
#
# Verbosity of the progress bar : 0 = no progress, 1 = progress bar, 2 = one line
progress_verbosity = 1

# ---- Just for tests, medium size images (128,128)
#      Save clustered dataset in ./data
#
scale         = 0.05
seed          = 123
cluster_size  = 10000
image_size    = (128,128)
output_dir    = './data'
exit_if_exist = False

# ---- Full clusters generation, medium size : 74 GB
#      Save clustered dataset in <datasets_dir>
#
# scale         = 1.
# seed          = 123
# cluster_size  = 10000
# image_size    = (128,128)
# output_dir    = f'{datasets_dir}/celeba/enhanced'
# exit_if_exist = True

# ---- Just for tests, large size images (192,160)
#      Save clustered dataset in ./data
#
# scale         = 0.05
# seed          = 123
# cluster_size  = 10000
# image_size    = (192,160)
# output_dir    = './data'
# exit_if_exist = False

# ---- Full clusters generation, large size : 138 GB
#      Save clustered dataset in <datasets_dir>
#
# scale         = 1.
# seed          = 123
# cluster_size  = 10000
# image_size    = (192,160)
# output_dir    = f'{datasets_dir}/celeba/enhanced'
# exit_if_exist = True

In [None]:
# ---- Used for continuous integration - Just forget these lines
#      NOTE(review): pwk.override presumably lets an external run replace the
#      parameters named here - confirm against fidle.pwk
#
pwk.override('progress_verbosity', 'scale', 'seed', )
pwk.override('cluster_size', 'image_size', 'output_dir', 'exit_if_exist')

## Step 3 - Cluster construction

### 3.1 - Directories and files :

In [None]:
# List and description of the original images (csv), then the directory holding
# the original image files - both are inputs of read_and_save()
dataset_csv = f'{datasets_dir}/celeba/origine/list_attr_celeba.csv'
dataset_img = f'{datasets_dir}/celeba/origine/img_align_celeba'

### 3.2 - Cooking function

In [None]:
def read_and_save( dataset_csv, dataset_img, shuffle=True, seed=None, scale=1,
                   cluster_size=1000, cluster_dir='./dataset_cluster', cluster_name='images',
                   image_size=(128,128), exit_if_exist=True, verbosity=1):
    '''
    Read the original images, resize them and save them as a clustered dataset.

    Each cluster is written in cluster_dir as a pair of files :
        <cluster_name>-NNN.npy : array of the resized images
        <cluster_name>-NNN.csv : the matching description rows

    Args:
        dataset_csv   : list and description of original images (csv file)
        dataset_img   : original images directory
        shuffle       : shuffle data if True  (True)
        seed          : random seed value given to np.random.seed.
                        False means no seeding, None means a fresh, unpredictable seed (None)
        scale         : scale of dataset to use. 1. means 100% (1)
        cluster_size  : Number of images per cluster at scale=1. The effective size
                        is int(cluster_size*scale), with a minimum of 1 (1000)
        cluster_dir   : Directory of generated clusters ('./dataset_cluster')
        cluster_name  : Name of generated clusters ('images')
        image_size    : Size of generated images (128,128)
        exit_if_exist : Do nothing if the first cluster file already exists (True)
        verbosity     : Verbosity given to pwk.update_progress (1)

    Returns:
        nb_clusters : Number of clusters written (0 if the construction was blocked)
        duration    : total duration, as given by pwk.chrono_stop() (0 if blocked)
    '''

    def save_cluster(imgs, desc, cols, cluster_id):
        # Write one cluster (images + description) and give back empty
        # accumulators together with the id of the next cluster.
        file_img  = f'{cluster_dir}/{cluster_name}-{cluster_id:03d}.npy'
        file_desc = f'{cluster_dir}/{cluster_name}-{cluster_id:03d}.csv'
        np.save(file_img,  np.array(imgs))
        df = pd.DataFrame(data=desc, columns=cols)
        df.to_csv(file_desc, index=False)
        return [], [], cluster_id+1

    pwk.chrono_start()

    # ---- Seed
    #
    if seed is not False:
        np.random.seed(seed)
        print(f'Seeded ({seed})')

    # ---- Read dataset description
    #
    dataset_desc = pd.read_csv(dataset_csv, header=0)
    n = len(dataset_desc)
    print(f'Description loaded ({n} images).')

    # ---- Shuffle
    #
    if shuffle:
        dataset_desc = dataset_desc.reindex(np.random.permutation(dataset_desc.index))
        print('Shuffled.')
    cols = list(dataset_desc.columns)

    # ---- Check if cluster files exist
    #      The first cluster file is looked up with the requested cluster_name,
    #      so that clusters saved under a custom name are detected too.
    #
    if exit_if_exist and os.path.isfile(f'{cluster_dir}/{cluster_name}-000.npy'):
        print('\n*** Oups. There are already clusters in the target folder!\n')
        return 0,0
    pwk.mkdir(cluster_dir)

    # ---- Rescale
    #      A tiny scale must not bring the cluster size down to 0 : that would
    #      break the cluster count below and the "cluster is full" test.
    #
    n = int(len(dataset_desc)*scale)
    dataset = dataset_desc[:n]
    cluster_size = max(1, int(cluster_size*scale))
    # Ceiling division : a last, partially filled cluster counts as one,
    # and no extra cluster is announced when n is a multiple of cluster_size.
    nb_expected = (n + cluster_size - 1) // cluster_size
    print('Rescaled.')
    pwk.subtitle('Parameters :')
    print(f'Scale is : {scale}')
    print(f'Image size is     : {image_size}')
    print(f'dataset length is : {n}')
    print(f'cluster size is   : {cluster_size}')
    print(f'clusters nb  is   : {nb_expected}')
    print(f'cluster dir  is   : {cluster_dir}')

    # ---- Read and save clusters
    #
    pwk.subtitle('Running...')
    imgs, desc, cluster_id = [],[],0
    #
    for _, row in dataset.iterrows():
        #
        filename = f'{dataset_img}/{row.image_id}'
        #
        # ---- Read image and resize it
        #      NOTE(review): skimage's resize returns float images, which is
        #      what "normalize" relies on here - confirm against skimage doc.
        #
        img = io.imread(filename)
        img = transform.resize(img, image_size)
        #
        # ---- Add image and description
        #
        imgs.append( img )
        desc.append( row.values )
        #
        # ---- Progress bar
        #
        pwk.update_progress(f'Cluster {cluster_id:03d} :',len(imgs),
                            cluster_size, verbosity=verbosity)
        #
        # ---- Save cluster if full
        #
        if len(imgs)==cluster_size:
            imgs,desc,cluster_id = save_cluster(imgs,desc,cols,cluster_id)

    # ---- Save incomplete cluster
    if imgs:
        imgs,desc,cluster_id = save_cluster(imgs,desc,cols,cluster_id)

    duration = pwk.chrono_stop()
    return cluster_id,duration


### 3.3 - Clusters building

In [None]:
# ---- Target directory : one sub-directory per image size
#
lx,ly       = image_size
cluster_dir = f'{output_dir}/clusters-{lx}x{ly}'

# ---- Build clusters
#
build_args = dict( shuffle       = True,
                   seed          = seed,
                   scale         = scale,
                   cluster_size  = cluster_size,
                   cluster_dir   = cluster_dir,
                   image_size    = image_size,
                   exit_if_exist = exit_if_exist,
                   verbosity     = progress_verbosity )

cluster_nb,duration = read_and_save(dataset_csv, dataset_img, **build_args)

# ---- Conclusion : total size of every file found under the cluster directory
#
directory = pathlib.Path(cluster_dir)
s = 0
for entry in directory.glob('**/*'):
    if entry.is_file():
        s += entry.stat().st_size

pwk.subtitle('Ressources :')
print('Duration     : ', pwk.hdelay(duration))
print('Size         : ', pwk.hsize(s))

# ---- Extrapolate the measured duration and size to the full dataset
#
full_scale_factor = 1/scale

pwk.subtitle('Estimation with scale=1 :')
print('Duration     : ', pwk.hdelay(duration*full_scale_factor))
print('Size         : ', pwk.hsize(s*full_scale_factor))


In [None]:
# ---- End of notebook
#      NOTE(review): pwk.end() presumably closes what pwk.init() opened - confirm against fidle.pwk
pwk.end()

---
<img width="80px" src="../fidle/img/00-Fidle-logo-01.svg"></img>