    Celeb Faces Dataset (CelebA)


    Introduction au Deep Learning (IDLE) - S. Arias, E. Maldonado, JL. Parouty - CNRS/SARI/DEVLOG - 2020

    We'll do the same thing again but with a more interesting dataset: CelebFaces
    About this dataset: http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html

    Episode 1: Preparation of data - Batch mode

    • Save the enhanced dataset as clusters of .npy (images) and .csv (attributes) files

    Step 1 - Import and init

    1.1 - Import

    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from skimage import io, transform
    
    import os,time,sys,json,glob
    import csv
    import math, random
    
    from importlib import reload
    
    sys.path.append('..')
    import fidle.pwk as ooo
    
    ooo.init()

    1.2 - Directories and files

    place, dataset_dir = ooo.good_place( { 'GRICAD' : f'{os.getenv("SCRATCH_DIR","")}/PROJECTS/pr-fidle/datasets/celeba',
                                           'IDRIS'  : f'{os.getenv("WORK","")}/datasets/celeba'    } )
    
    dataset_csv  = f'{dataset_dir}/list_attr_celeba.csv'
    dataset_img  = f'{dataset_dir}/img_align_celeba'
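
    ooo.good_place() selects the dataset root that matches the current compute center (GRICAD or IDRIS). As a rough idea of its behavior, here is a minimal, hypothetical sketch; the real helper lives in fidle/pwk.py and may differ:

    # Hypothetical sketch of a good_place-style helper (not the real fidle code):
    # return the first candidate whose directory actually exists.
    import os
    
    def good_place_sketch(candidates):
        for place, path in candidates.items():
            if path and os.path.isdir(path):
                print(f'Well, we should be at {place}!')
                return place, path
        raise FileNotFoundError('None of the candidate dataset directories exist')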

    Step 2 - Read filenames catalog

    # ---- Read the attributes catalog (one row per image), then shuffle it
    dataset_desc = pd.read_csv(dataset_csv, header=0)
    dataset_desc = dataset_desc.reindex(np.random.permutation(dataset_desc.index))
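
    A quick sanity check on what we just loaded; for CelebA, list_attr_celeba.csv should contain an image_id column plus 40 binary attribute columns:

    # Quick look at the (shuffled) catalog
    print('Entries :', len(dataset_desc))
    print('Columns :', len(dataset_desc.columns))
    dataset_desc.head()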

    Step 3 - Save as clusters of n images

    3.1 - Cooking function

    def read_and_save( dataset_img, dataset_desc, 
                       cluster_size=1000, cluster_dir='./dataset_cluster', cluster_name='images',
                       image_size=(128,128)):
        
        def save_cluster(imgs, desc, cols, cid):
            # ---- Save images as <name>-<cid>.npy and attributes as <name>-<cid>.csv
            file_img  = f'{cluster_dir}/{cluster_name}-{cid:03d}.npy'
            file_desc = f'{cluster_dir}/{cluster_name}-{cid:03d}.csv'
            np.save(file_img,  np.array(imgs))
            df = pd.DataFrame(data=desc, columns=cols)
            df.to_csv(file_desc, index=False)
            # ---- Return empty lists and the next cluster id
            return [], [], cid+1
        
        start_time = time.time()
        cols = list(dataset_desc.columns)
    
        # ---- Check if cluster files already exist
        #
        if os.path.isfile(f'{cluster_dir}/{cluster_name}-000.npy'):
            print('\n*** Oops. There are already clusters in the target folder!\n')
            return 0,0
        
        # ---- Create cluster_dir
        #
        os.makedirs(cluster_dir, mode=0o750, exist_ok=True)
        
        # ---- Read and save clusters
        #
        imgs, desc, cluster_id = [],[],0
        #
        for i,row in dataset_desc.iterrows():
            #
            filename = f'{dataset_img}/{row.image_id}'
            #
            # ---- Read image, resize (and normalize)
            #
            img = io.imread(filename)
            img = transform.resize(img, image_size)
            #
            # ---- Add image and description
            #
            imgs.append( img )
            desc.append( row.values )
            #
            # ---- Progress bar
            #
            ooo.update_progress(f'Cluster {cluster_id:03d} :',len(imgs),cluster_size)
            #
            # ---- Save cluster if full
            #
            if len(imgs)==cluster_size:
                imgs,desc,cluster_id=save_cluster(imgs,desc,cols, cluster_id)
    
        # ---- Save the last, incomplete cluster (if any)
        if len(imgs)>0 : imgs,desc,cluster_id=save_cluster(imgs,desc,cols,cluster_id)
    
        duration=time.time()-start_time
        return cluster_id,duration
    

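    Before building anything, it is worth estimating the disk cost: transform.resize returns float64 pixels in [0,1], so one saved cluster weighs roughly cluster_size × height × width × 3 × 8 bytes. A quick check with the sizes chosen in 3.2 below:

    # Back-of-the-envelope size of one full cluster (float64 pixels)
    h, w         = 192, 160
    cluster_size = 10000
    size_bytes   = cluster_size * h * w * 3 * 8
    print(f'One full cluster : {size_bytes/2**30:.1f} GiB')   # about 6.9 GiB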
    3.2 - Cluster building

    # ---- Cluster size
    
    cluster_size_train = 10000
    cluster_size_test  = 10000
    image_size         = (192,160)
    
    # ---- Clusters location
    
    train_dir  = f'{dataset_dir}/clusters-M.train'
    test_dir   = f'{dataset_dir}/clusters-M.test'
    
    # ---- x_train, x_test
    #
    n1,d1 = read_and_save(dataset_img, dataset_desc[:200000],
                          cluster_size = cluster_size_train, 
                          cluster_dir  = train_dir,
                          image_size   = image_size )
    
    n2,d2 = read_and_save(dataset_img, dataset_desc[200000:],
                          cluster_size = cluster_size_test, 
                          cluster_dir  = test_dir,
                          image_size   = image_size )
            
    print(f'\n\nDuration : {d1+d2:.2f} s or {ooo.hdelay(d1+d2)}')
    print(f'Train clusters : {train_dir}')
    print(f'Test  clusters : {test_dir}')
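
    Loading a cluster back (e.g. in the training episode) is symmetrical to save_cluster; a minimal sketch, assuming the default cluster_name='images':

    # Reload the first train cluster: images as a numpy array, attributes as a DataFrame
    x_data = np.load(f'{train_dir}/images-000.npy')
    desc   = pd.read_csv(f'{train_dir}/images-000.csv', header=0)
    print('Images shape :', x_data.shape)   # (cluster_size, 192, 160, 3)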

    That's all folks!