[SYNOP1] - Preparation of data
Episode 1 : Data analysis and preparation of a usable meteorological dataset (SYNOP)
Objectives :
- Understand the data
- Clean it up and build a usable dataset
SYNOP meteorological data can be found on :
https://public.opendatasoft.com
About the SYNOP dataset :
https://public.opendatasoft.com/explore/dataset/donnees-synop-essentielles-omm/information/?sort=date
This dataset contains a set of measurements (temperature, pressure, ...) made every 3 hours at the LYS airport.
The objective will be to predict the evolution of the weather!
What we're going to do :
- Read the data
- Cleanup and build a usable dataset
Step 1 - Import and init
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import TensorBoard
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py, json
import os, time, sys
import math, random
from importlib import reload
sys.path.append('..')
import fidle.pwk as pwk
run_dir = './run/SYNOP'
datasets_dir = pwk.init('SYNOP1', run_dir)
pd.set_option('display.max_rows', 200)
Step 2 - Parameters
output_dir : where to save our enhanced dataset. ./data is a good choice because our dataset will be very small.
# ---- Our future enhanced dataset (no need to change)
#
dataset_filename = 'synop-LYS.csv'
description_filename = 'synop.json'
output_dir = './data'
Override parameters (batch mode) - Just forget this cell
pwk.override('output_dir')
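pwk.override simply lets an outer batch runner replace the value of output_dir set above, typically through an environment variable. A minimal sketch of that pattern; the variable name FIDLE_OVERRIDE_output_dir is illustrative, the real fidle convention may differ:
# ---- Minimal sketch of an environment-based override (illustrative variable name)
#
#   import os
#   output_dir = os.environ.get('FIDLE_OVERRIDE_output_dir', output_dir)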
Step 3 - Retrieve the dataset
There are two parts to retrieve:
- The data itself (csv)
- Description of the data (json)
data_filename = 'origine/donnees-synop-essentielles-omm-LYS.csv'
schema_filename = 'origine/schema.json'
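Before reading anything, a quick sanity check that both files are where we expect them (datasets_dir comes from pwk.init in Step 1):
# ---- Check that both source files exist
#
for f in [data_filename, schema_filename]:
    print(f, ':', os.path.isfile(f'{datasets_dir}/SYNOP/{f}'))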
3.1 - Read dataset description
Get the column names of the dataset from the schema description
with open(f'{datasets_dir}/SYNOP/{schema_filename}','r') as json_file:
schema = json.load(json_file)
synop_codes=list( schema['definitions']['donnees-synop-essentielles-omm_records']['properties']['fields']['properties'].keys() )
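A quick look at what we got: the nested keys used above follow the opendatasoft schema layout, and synop_codes should now hold one short code per column of the CSV.
# ---- Quick sanity check of the schema content
#
print(len(synop_codes), 'column codes, e.g. :', synop_codes[:5])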
3.2 - Read data
df = pd.read_csv(f'{datasets_dir}/SYNOP/{data_filename}', header=0, sep=';')
pwk.subtitle('Raw data :')
display(df.tail(10))
# ---- Get the columns name as descriptions
#
synop_desc = list(df.columns)
# ---- Set Codes as columns name
#
df.columns = synop_codes
code2desc = dict(zip(synop_codes, synop_desc))
# ---- Count the na values by columns
#
columns_na = df.isna().sum().tolist()
# ---- Show all of that
#
df_desc=pd.DataFrame({'Code':synop_codes, 'Description':synop_desc, 'Na':columns_na})
pwk.subtitle('List of columns :')
display(df_desc.style.set_properties(**{'text-align': 'left'}))
print('Shape is : ', df.shape)
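The code2desc dictionary built above can be used at any time to recover the long description of a short code; for example (the exact wording comes from the CSV header, so the output is dataset-dependent):
# ---- Look up the full description of a column code
#
print(code2desc['date'])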
Step 4 - Prepare dataset
4.1 - Keep only certain columns
columns_used=['date','pmer','tend','cod_tend','dd','ff','td','u','ww','pres','rafper','per','rr1','rr3','tc']
# ---- Drop unused columns
to_drop = df.columns.difference(columns_used)
df.drop( to_drop, axis=1, inplace=True)
# ---- Show all of that
pwk.subtitle('Our selected columns :')
display(df.head(20))
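As an aside, the same selection can be written without computing the difference: indexing the DataFrame with the list keeps exactly those columns and fixes their order at the same time (standard pandas, equivalent here up to column order):
# ---- Equivalent one-liner : keep (and reorder) the selected columns
#
#   df = df[columns_used]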
4.2 - Few stats
Note : The per column is constant, so we can drop it
pwk.subtitle('Few statistics :')
display(df.describe().style.format('{:.2f}'))
# ---- 'per' column is constant, we can drop it
df.drop(['per'],axis=1,inplace=True)
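# ---- (Optional) Constant columns can also be detected programmatically
#      instead of by eye; after the drop above, this should come back empty
#      (plain pandas : any column with a single distinct value)
constant_cols = [c for c in df.columns if df[c].nunique(dropna=True) <= 1]
print('Constant columns :', constant_cols)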
# ---- Count the na values by columns
#
dataset_na = df.isna().sum().tolist()
dataset_cols = df.columns.tolist()
dataset_desc = [ code2desc[c] for c in dataset_cols ]
# ---- Show all of that
#
pwk.subtitle('Do we have na values ?')
df_desc=pd.DataFrame({'Columns':dataset_cols, 'Description':dataset_desc, 'Na':dataset_na})
display(df_desc.style.set_properties(**{'text-align': 'left'}))
4.3 - Cleanup dataset
Let's sort it by date and fill the NaN values with an interpolation
# ---- First of all, we have to sort on the date
df.sort_values(['date'], inplace=True)
df.reset_index(drop=True, inplace=True)
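# ---- Note : 'date' is still a plain string here; a lexicographic sort is
#      correct because the export stores ISO-style dates (check with
#      df['date'].head()). To work with real timestamps instead, pandas
#      can convert the column first, e.g. :
#
#         df['date'] = pd.to_datetime(df['date'])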
# ---- Before : Lines with NaN
na_rows=df.isna().any(axis=1)
pwk.subtitle('Before :')
display( df[na_rows].head(10) )
# ---- Nice interpolation for plugging holes
df.interpolate(inplace=True)
# ---- After
pwk.subtitle('After :')
display(df[na_rows].head(10))
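By default, df.interpolate() fills each hole linearly from its neighbours: a series like 1.0, NaN, NaN, 4.0 becomes 1.0, 2.0, 3.0, 4.0. A NaN at the very start of a column has no left neighbour and would survive, so a quick check is worth it:
# ---- Check that no NaN value survived the interpolation
#
print('Remaining na values :', df.isna().sum().sum())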
Step 5 - About our enhanced dataset
5.1 - Summarize it
# ---- Count the na values by columns
#
dataset_na = df.isna().sum().tolist()
dataset_cols = df.columns.tolist()
dataset_desc = [ code2desc[c] for c in dataset_cols ]
# ---- Show all of that
#
df_desc=pd.DataFrame({'Columns':dataset_cols, 'Description':dataset_desc, 'Na':dataset_na})
pwk.subtitle('Dataset columns :')
display(df_desc.style.set_properties(**{'text-align': 'left'}))
pwk.subtitle('Have a look :')
display(df.tail(20))
print('Shape is : ', df.shape)
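One last check worth doing: after interpolation, every measurement column should be numeric (float), while date remains a plain object/string column:
# ---- Check the dtypes of our columns
#
print(df.dtypes)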
5.2 - Have a look (1 month)
# ---- Pick a random slice of 240 records : 30 days x 8 measurements/day (one every 3 hours)
i = random.randint(0, len(df)-240)
df.iloc[i:i+240].plot(subplots=True, fontsize=12, figsize=(16,20))
pwk.save_fig('01-one-month')
plt.show()
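To zoom in on a single variable, the same random slice can be plotted alone; here with tc, the temperature column we kept in Step 4 (the figure name 02-one-month-tc is just an illustrative choice):
# ---- Same month, temperature only
#
df['tc'].iloc[i:i+240].plot(figsize=(16,4))
pwk.save_fig('02-one-month-tc')
plt.show()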
Step 6 - Save it
# ---- Save it
#
pwk.mkdir(output_dir)
filedata = f'{output_dir}/{dataset_filename}'
filedesc = f'{output_dir}/{description_filename}'
df.to_csv(filedata, sep=';', index=False)
size = os.path.getsize(filedata)/(1024*1024)   # file size in MB
print(f'Dataset saved. ({size:0.1f} MB)')
with open(filedesc, 'w', encoding='utf-8') as f:
json.dump(code2desc, f, indent=4)
print('Synop description saved.')
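To make sure everything round-trips, both files can be read back the same way they will be used in the next episode (a minimal sketch using the paths defined above):
# ---- Reload the enhanced dataset and its description
#
df_check = pd.read_csv(filedata, sep=';')
with open(filedesc) as f:
    desc_check = json.load(f)
print(df_check.shape, '-', len(desc_check), 'described columns')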
pwk.end()