Celeb Faces Dataset (CelebA)
Introduction au Deep Learning (IDLE) - S. Arias, E. Maldonado, JL. Parouty - CNRS/SARI/DEVLOG - 2020
We'll do the same thing again, but with a more interesting dataset: CelebFaces.
About this dataset: http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html
Episode 1: Preparation of data - Batch mode
- Save the enhanced dataset as clusters of .npy / .csv files
Step 1 - Import and init
1.1 - Import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from skimage import io, transform
import os, time, sys, json, glob
import csv
import math, random
from importlib import reload
sys.path.append('..')
import fidle.pwk as ooo
ooo.init()
1.2 - Directories and files
place, dataset_dir = ooo.good_place( { 'GRICAD' : f'{os.getenv("SCRATCH_DIR","")}/PROJECTS/pr-fidle/datasets/celeba',
                                       'IDRIS'  : f'{os.getenv("WORK","")}/datasets/celeba' } )
dataset_csv = f'{dataset_dir}/list_attr_celeba.csv'
dataset_img = f'{dataset_dir}/img_align_celeba'
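ooo.good_place() is a helper from the course's fidle.pwk module; judging from its use here, it picks a usable location among the named candidates and returns its name and path. If you run outside GRICAD/IDRIS, one possible extension is sketched below; the 'LOCAL' key and its path are assumptions for illustration, not part of the course setup:
# Hypothetical third candidate for a local run -- adapt the path to your machine
place, dataset_dir = ooo.good_place( { 'GRICAD' : f'{os.getenv("SCRATCH_DIR","")}/PROJECTS/pr-fidle/datasets/celeba',
                                       'IDRIS'  : f'{os.getenv("WORK","")}/datasets/celeba',
                                       'LOCAL'  : os.path.expanduser('~/datasets/celeba') } )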
Step 2 - Read the filenames catalog
dataset_desc = pd.read_csv(dataset_csv, header=0)
# ---- Shuffle the catalog, so clusters mix the whole dataset
dataset_desc = dataset_desc.reindex(np.random.permutation(dataset_desc.index))
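A quick sanity check on the shuffled catalog can save a long batch run from a bad start; a minimal sketch (display() assumes a Jupyter/IPython session):
print('Catalog size :', len(dataset_desc))
print('Columns      :', list(dataset_desc.columns[:5]), '...')
display(dataset_desc.head(3))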
Step 3 - Save as clusters of n images
3.1 - Cooking function
def read_and_save( dataset_img, dataset_desc,
                   cluster_size=1000, cluster_dir='./dataset_cluster', cluster_name='images',
                   image_size=(128,128)):

    def save_cluster(imgs, desc, cols, id):
        file_img  = f'{cluster_dir}/{cluster_name}-{id:03d}.npy'
        file_desc = f'{cluster_dir}/{cluster_name}-{id:03d}.csv'
        np.save(file_img, np.array(imgs))
        df = pd.DataFrame(data=desc, columns=cols)
        df.to_csv(file_desc, index=False)
        return [], [], id+1

    start_time = time.time()
    cols       = list(dataset_desc.columns)

    # ---- Check if cluster files already exist
    #
    if os.path.isfile(f'{cluster_dir}/{cluster_name}-000.npy'):
        print('\n*** Oops. There are already clusters in the target folder!\n')
        return 0, 0

    # ---- Create cluster_dir
    #
    os.makedirs(cluster_dir, mode=0o750, exist_ok=True)

    # ---- Read and save clusters
    #
    imgs, desc, cluster_id = [], [], 0
    #
    for i, row in dataset_desc.iterrows():
        #
        filename = f'{dataset_img}/{row.image_id}'
        #
        # ---- Read image, resize (and normalize)
        #
        img = io.imread(filename)
        img = transform.resize(img, image_size)
        #
        # ---- Add image and description
        #
        imgs.append( img )
        desc.append( row.values )
        #
        # ---- Progress bar
        #
        ooo.update_progress(f'Cluster {cluster_id:03d} :', len(imgs), cluster_size)
        #
        # ---- Save cluster if full
        #
        if len(imgs) == cluster_size:
            imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    # ---- Save the last, incomplete cluster (if any)
    if len(imgs) > 0:
        imgs, desc, cluster_id = save_cluster(imgs, desc, cols, cluster_id)

    duration = time.time() - start_time
    return cluster_id, duration
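Each call to save_cluster() therefore writes a pair of files: a .npy array of images and a .csv of their attributes, sharing the same index. A minimal sketch of how to read one cluster back, assuming the default cluster_dir and cluster_name used above:
# Reload the first cluster written with the defaults (./dataset_cluster/images-000.*)
x = np.load('./dataset_cluster/images-000.npy')
d = pd.read_csv('./dataset_cluster/images-000.csv')
print('Images :', x.shape, x.dtype)   # (cluster_size, *image_size, 3), float64 from transform.resize
print('Attrs  :', d.shape)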
3.2 - Cluster building
# ---- Cluster size and image size
cluster_size_train = 10000
cluster_size_test  = 10000
image_size         = (192,160)
# ---- Clusters location
train_dir = f'{dataset_dir}/clusters-M.train'
test_dir  = f'{dataset_dir}/clusters-M.test'
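Note the footprint before launching: transform.resize() returns float64 pixels, so a full train cluster weighs several GiB on disk. A quick back-of-the-envelope check (casting to float32 before saving would roughly halve this, but that is a suggestion, not what this notebook does):
bytes_per_image = 192 * 160 * 3 * 8                    # float64 = 8 bytes per value
cluster_bytes   = cluster_size_train * bytes_per_image
print(f'~{cluster_bytes/1024**3:.1f} GiB per full train cluster')   # ~6.9 GiB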
# ---- x_train, x_test
#
n1,d1 = read_and_save(dataset_img, dataset_desc[:200000],
                      cluster_size = cluster_size_train,
                      cluster_dir  = train_dir,
                      image_size   = image_size )
n2,d2 = read_and_save(dataset_img, dataset_desc[200000:],
                      cluster_size = cluster_size_test,
                      cluster_dir  = test_dir,
                      image_size   = image_size )
print(f'\n\nDuration : {d1+d2:.2f} s or {ooo.hdelay(d1+d2)}')
print(f'Train clusters : {train_dir}')
print(f'Test clusters : {test_dir}')
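To verify the run, a small sketch that lists the cluster files actually produced, with their sizes (glob and os are already imported above):
for f in sorted(glob.glob(f'{train_dir}/images-*.npy')):
    print(f'{f:60s} {os.path.getsize(f)/1024**2:8.1f} MiB')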
That's all folks!