
[SYNOP1] - Preparation of data

Episode 1 : Data analysis and preparation of a usable meteorological dataset (SYNOP)

Objectives :

  • Understand the data
  • Clean up and build a usable dataset

SYNOP meteorological data can be found at :
https://public.opendatasoft.com

About SYNOP datasets :
https://public.opendatasoft.com/explore/dataset/donnees-synop-essentielles-omm/information/?sort=date

This dataset contains a set of measurements (temperature, pressure, ...) taken every 3 hours at LYS airport.
The objective will be to predict the evolution of the weather !

What we're going to do :

  • Read the data
  • Clean up and build a usable dataset

Step 1 - Import and init

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import TensorBoard

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import h5py, json
import os,time,sys
import math, random

from importlib import reload

sys.path.append('..')
import fidle.pwk as pwk

run_dir = './run/SYNOP'
datasets_dir = pwk.init('SYNOP1', run_dir)

pd.set_option('display.max_rows',200)

Step 2 - Parameters

output_dir : where to save our enhanced dataset.
./data is a good choice because our dataset will be very small.

# ---- Our future enhanced dataset (no need to change)
#
dataset_filename     = 'synop-LYS.csv'
description_filename = 'synop.json'
output_dir           = './data'

Override parameters (batch mode) - Just forget this cell

pwk.override('output_dir')

Step 3 - Retrieve the dataset

There are two parts to retrieve :

  • The data itself (csv)
  • The description of the data (json)

data_filename   = 'origine/donnees-synop-essentielles-omm-LYS.csv'
schema_filename = 'origine/schema.json'

3.1 - Read dataset description

Get the column names of the dataset from the schema description

with open(f'{datasets_dir}/SYNOP/{schema_filename}','r') as json_file:
    schema = json.load(json_file)

synop_codes=list( schema['definitions']['donnees-synop-essentielles-omm_records']['properties']['fields']['properties'].keys() )
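The chain of keys above walks the JSON schema down to the per-field property names, which are the short column codes used below. A quick way to inspect what was extracted (a small sketch, not part of the original notebook) :

print(len(synop_codes), 'column codes found')
print('First codes :', synop_codes[:5])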

3.2 - Read data

df = pd.read_csv(f'{datasets_dir}/SYNOP/{data_filename}', header=0, sep=';')
pwk.subtitle('Raw data :')
display(df.tail(10))

# ---- Get the columns name as descriptions
#
synop_desc = list(df.columns)

# ---- Set Codes as columns name
#
df.columns   = synop_codes
code2desc    = dict(zip(synop_codes, synop_desc))

# ---- Count the na values by columns
#
columns_na = df.isna().sum().tolist()

# ---- Show all of that
#
df_desc=pd.DataFrame({'Code':synop_codes, 'Description':synop_desc, 'Na':columns_na})

pwk.subtitle('List of columns :')
display(df_desc.style.set_properties(**{'text-align': 'left'}))

print('Shape is : ', df.shape)
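
With code2desc in hand, a short code can be turned back into its original long header at any time. A minimal usage sketch (assuming 'tc', which is kept in Step 4, is one of the codes) :

print("Code 'tc' means :", code2desc['tc'])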

Step 4 - Prepare dataset

4.1 - Keep only certain columns

columns_used=['date','pmer','tend','cod_tend','dd','ff','td','u','ww','pres','rafper','per','rr1','rr3','tc']

# ---- Drop unused columns

to_drop = df.columns.difference(columns_used)
df.drop( to_drop, axis=1, inplace=True)

# ---- Show all of that

pwk.subtitle('Our selected columns :')
display(df.head(20))
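
Note that dropping the difference keeps the remaining columns in their original order, whereas selecting directly would also reorder them to match the list. The direct variant, as a one-line sketch (not the notebook's method) :

df = df[columns_used]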

4.2 - A few stats

Note : the 'per' column is constant, so we can drop it

pwk.subtitle('Few statistics :')
display(df.describe().style.format('{:.2f}'))
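
# ---- Optional sanity check (an addition, not in the original) : confirm
#      that 'per' really is constant before dropping it
# assert df['per'].nunique(dropna=True) <= 1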

# ---- 'per' column is constant, we can drop it

df.drop(['per'],axis=1,inplace=True)

# ---- Count the na values by columns
#
dataset_na    = df.isna().sum().tolist()
dataset_cols  = df.columns.tolist()
dataset_desc  = [ code2desc[c] for c in dataset_cols ]

# ---- Show all of that
#
pwk.subtitle('Do we have na values ?')
df_desc=pd.DataFrame({'Columns':dataset_cols, 'Description':dataset_desc, 'Na':dataset_na})
display(df_desc.style.set_properties(**{'text-align': 'left'}))

4.3 - Clean up the dataset

Let's sort it by date and fill in the NaN values by interpolation

# ---- First of all, we have to sort on the date

df.sort_values(['date'],  inplace=True)
df.reset_index(drop=True, inplace=True)
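
# ---- Note (assumption) : 'date' holds ISO-8601 strings, so the lexical sort
#      above is also chronological. An explicit conversion would be more robust :
# df['date'] = pd.to_datetime(df['date'])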

# ---- Before : Lines with NaN

na_rows=df.isna().any(axis=1)
pwk.subtitle('Before :')
display( df[na_rows].head(10) )

# ---- Nice interpolation for plugging holes

df.interpolate(inplace=True)
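
# ---- Note : interpolate() only fills from the first valid value onwards, so
#      NaN values at the very top of a column would survive. An optional
#      backfill pass covers that edge case :
# df.bfill(inplace=True)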

# ---- After

pwk.subtitle('After :')
display(df[na_rows].head(10))

Step 5 - About our enhanced dataset

5.1 - Summarize it

# ---- Count the na values by columns
#
dataset_na    = df.isna().sum().tolist()
dataset_cols  = df.columns.tolist()
dataset_desc  = [ code2desc[c] for c in dataset_cols ]

# ---- Show all of that
#
df_desc=pd.DataFrame({'Columns':dataset_cols, 'Description':dataset_desc, 'Na':dataset_na})
pwk.subtitle('Dataset columns :')
display(df_desc.style.set_properties(**{'text-align': 'left'}))

pwk.subtitle('Have a look :')
display(df.tail(20))
print('Shape is : ', df.shape)

5.2 - Have a look (1 month)

One month is 240 rows : with one measurement every 3 hours, that is 8 per day × 30 days.

i=random.randint(0,len(df)-240)
df.iloc[i:i+240].plot(subplots=True, fontsize=12, figsize=(16,20))
pwk.save_fig('01-one-month')
plt.show()
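
For a quick look at a single variable over the whole period, resampling to daily means smooths out the 3-hourly cadence. A sketch, assuming 'date' parses with pd.to_datetime and 'tc' is the temperature column :

df_daily = df.set_index(pd.to_datetime(df['date']))
df_daily['tc'].resample('D').mean().plot(figsize=(16,4))
pwk.save_fig('02-daily-mean')   # hypothetical figure name, following the pattern above
plt.show()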

Step 6 - Save it

# ---- Save it
#
pwk.mkdir(output_dir)

filedata = f'{output_dir}/{dataset_filename}'
filedesc = f'{output_dir}/{description_filename}'

df.to_csv(filedata, sep=';', index=False)
size=os.path.getsize(filedata)/(1024*1024)
print(f'Dataset saved. ({size:0.1f} MB)')

with open(filedesc, 'w', encoding='utf-8') as f:
    json.dump(code2desc, f, indent=4)
print('Synop description saved.')
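
As a final sanity check, here is how a later episode could read both files back (a sketch; filedata and filedesc are the paths built above) :

df_check = pd.read_csv(filedata, sep=';')
with open(filedesc, 'r', encoding='utf-8') as f:
    desc_check = json.load(f)
print('Reloaded :', df_check.shape, 'and', len(desc_check), 'column descriptions')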
    
pwk.end()