Using a time series dataset with 5 years of weather data to explore the use of H2O's AutoML.
# Imports
import h2o
import scipy
import pandas as pd
from datetime import datetime
import matplotlib as m
from h2o.automl import H2OAutoML
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
# Plot formatting: global matplotlib style plus label, tick and text defaults
plt.style.use('fivethirtyeight')
m.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': 'k',
})
# Default figure size as (width, height)
from pylab import rcParams
rcParams['figure.figsize'] = (18, 8)
# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -v --iv
# Layout of the timestamps in the CSV: month/day/year hour:minute
DATE_FORMAT = '%m/%d/%Y %H:%M'


def dateparse(x):
    """Parse a single timestamp string laid out as DATE_FORMAT.

    Retained so existing references to `dateparse` keep working; loading the
    CSV below no longer goes through it.
    """
    return datetime.strptime(x, DATE_FORMAT)


# Read the CSV with its first column as the index, then convert that index to
# datetimes in a single vectorised call. The `date_parser` argument of
# read_csv is deprecated since pandas 2.0 and invoked a Python function for
# every row; an explicit format string is both faster and unambiguous.
# NOTE(review): assumes the first column is the 'date' column (the original
# call combined index_col=0 with parse_dates=['date']) - confirm.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Matheus-Schmitz/H2O_TimeSeries_AutoML/master/data/weather_5_years.csv',
    index_col=0,
)
df.index = pd.to_datetime(df.index, format=DATE_FORMAT)
df.head()
df.shape
# Index (the date is the dataset's index, which characterizes a time series)
df.index
# Plot each variable in its own row of a single stacked figure.
# The number of rows and the loop both follow df.columns, so the figure stays
# correct if the dataset gains or loses a variable (the loop used to be
# hard-coded to 4 columns while the grid already used len(df.columns)).
n_series = len(df.columns)
plt.figure()
for i, name in enumerate(df.columns, start=1):
    # subplot positions are 1-based, hence start=1
    plt.subplot(n_series, 1, i)
    # Plot against the row position rather than the datetime index
    plt.plot(df.iloc[:, i - 1].to_numpy())
    plt.title(name, y=0.5, loc='right')
plt.show()
# Work on a copy so the raw dataframe stays untouched
df2 = df.copy()
# Control parameters
# Number of lags
num_lags = 3
# Delay (aka how many hours ahead to predict)
delay = 1
# Rolling window size
ws = 3
# Build lagged and rolling-average features for every original variable.
# The column names are snapshotted first: the loop adds columns to df2, and
# iterating over a frame while it grows only works by accident of how pandas
# stores its column index.
base_columns = list(df2.columns)
for column in base_columns:
    for lag in range(delay, num_lags + delay):
        # Shift once and reuse it for both the lag and its rolling mean
        shifted = df2[column].shift(lag)
        df2[f'{column}_lag{lag}'] = shifted
        df2[f'{column}_rolling_avg{lag}'] = shifted.rolling(window=ws).mean()
df2.head()
# Remove rows with any missing value; this includes the leading rows whose
# shifted / rolling windows are not yet complete
df2.dropna(inplace=True)
df2.shape
# Column mask for the modelling dataset: keep every column whose name
# mentions 'temperature' (the target and its derived features) plus all
# engineered 'lag' / 'rolling' features. The current-hour 'wind', 'snow' and
# 'rain' columns match none of these and are therefore dropped.
mask = df2.columns.str.contains('temperature|lag|rolling')
mask
# Final dataset: the masked columns only
df_final = df2.loc[:, mask]
# Check columns
df_final.columns
# Chronological split: the last 24 * 365 rows form the test set and everything
# before them the training set (first 4 years vs. last year, 80/20)
test_rows = 24 * 365
df_train = df_final.iloc[:-test_rows]
df_test = df_final.iloc[-test_rows:]
# Plot the temperature of both splits on the same time axis
for part in (df_train, df_test):
    plt.plot(part.index, part['temperature'])
plt.ylabel('Temperature', fontsize=24)
plt.legend(['Train', 'Test'], fontsize=24)
plt.show()
!nvidia-smi
# Initialize the H2O cluster
h2o.init()
# Convert the pandas train and test dataframes (in that order) to H2OFrames
hf_train, hf_test = (h2o.H2OFrame(split) for split in (df_train, df_test))
# Summary of the training H2OFrame
hf_train.describe()
# Target column
y = 'temperature'
# Predictors: every column of the training frame except the target
X = list(hf_train.columns)
X.remove(y)
X
Instead of picking a number of models, I'll simply define max_runtime_secs and let H2O go through as many models as possible in that time.
# AutoML run bounded by wall-clock time (in seconds) rather than model count
runtime_budget_secs = 300
model_aml = H2OAutoML(max_runtime_secs=runtime_budget_secs)
%%time
# Fit the AutoML run on the training frame; the leaderboard is scored on the
# test frame.
# NOTE(review): the same test frame is later used to evaluate the leader, so
# the leader is selected on the data it is judged on - confirm this is intended.
train_settings = dict(
    x=X,
    y=y,
    training_frame=hf_train,
    leaderboard_frame=hf_test,
)
model_aml.train(**train_settings)
# Leaderboard of the models trained by the AutoML run
models = model_aml.leaderboard
# Display the leaderboard
models
# Best model of the run
model_leader = model_aml.leader
# Persist the best model under models/ (force=True is passed so an existing
# file may be overwritten)
h2o.save_model(model=model_leader, path='models/', force=True)
# Predict the test frame with the best model
hf_test_predict = model_leader.predict(hf_test)
# Peek at the predictions and their shape
hf_test_predict.head(5)
hf_test_predict.shape
# Bring the predictions back into pandas
predictions = h2o.as_list(hf_test_predict, use_pandas=True)
# Create dataframe with true and predicted values.
# y_true drops its datetime index so both columns share a 0..n-1 index.
df_results = pd.DataFrame()
df_results['y_true'] = df_test['temperature'].reset_index(drop=True)
# Select the prediction column by name instead of assigning the whole
# DataFrame to a single column, which only works while the prediction frame
# happens to have exactly one column.
# NOTE(review): 'predict' is the column name H2O uses for regression output -
# confirm against the H2O version in use.
df_results['y_pred'] = predictions['predict']
df_results.head()
# Fit a simple linear regression of the observed values on the predictions
# with SciPy and report the squared correlation coefficient.
# NOTE(review): this is the squared Pearson correlation between y_pred and
# y_true, which does not penalise a systematic offset or scale error in the
# predictions - confirm it is the metric you want to report.
fit = stats.linregress(x=df_results['y_pred'], y=df_results['y_true'])
slope, intercept, r_value, p_value, std_err = fit
print('R² Coefficient = ', r_value * r_value)
# Overlay the observed and predicted temperature series
for series in ('y_true', 'y_pred'):
    plt.plot(df_results[series])
plt.ylabel('Temperature', fontsize=24)
plt.legend(['y_true', 'y_pred'], fontsize=24)
plt.show()
Now predicting 5 hours ahead (delay = 5) using 5 lags!
%%time
# Copy original df so the raw data stays untouched
df3 = df.copy()
# Number of lags
num_lags = 5
# Delay (aka how many hours ahead to predict)
delay = 5
# Rolling window size
ws = 3
# Build lagged and rolling-average features for every original variable.
# The column names are snapshotted first: the loop adds columns to df3, and
# iterating over a frame while it grows only works by accident of how pandas
# stores its column index.
base_columns = list(df3.columns)
for column in base_columns:
    for lag in range(delay, num_lags + delay):
        # Shift once and reuse it for both the lag and its rolling mean
        shifted = df3[column].shift(lag)
        df3[f'{column}_lag{lag}'] = shifted
        df3[f'{column}_rolling_avg{lag}'] = shifted.rolling(window=ws).mean()
# Remove rows with any missing value; this includes the leading rows whose
# shifted / rolling windows are not yet complete
df3.dropna(inplace=True)
# Keep the target, its derived columns and every lag / rolling feature
mask = df3.columns.str.contains('temperature|lag|rolling')
# Final dataset: the masked columns only
df_processed = df3.loc[:, mask]
# Chronological split: the last 24 * 365 rows are the test set, the rest is
# the training set (first 4 years vs. last year, 80/20)
holdout_rows = 24 * 365
DF_TRAIN = df_processed.iloc[:-holdout_rows]
DF_TEST = df_processed.iloc[-holdout_rows:]
# Convert both pandas splits (train first, then test) to H2OFrames
HF_TRAIN, HF_TEST = (h2o.H2OFrame(part) for part in (DF_TRAIN, DF_TEST))
# Target column and predictors (every other column of the training frame)
TARGET = 'temperature'
PREDICTORS = list(HF_TRAIN.columns)
PREDICTORS.remove(TARGET)
# AutoML run bounded by wall-clock time (in seconds)
aml = H2OAutoML(max_runtime_secs=300)
# Fit on the training frame; the leaderboard is scored on the test frame.
# NOTE(review): the leader is then evaluated on that same test frame - confirm
# this is intended.
aml.train(
    x=PREDICTORS,
    y=TARGET,
    training_frame=HF_TRAIN,
    leaderboard_frame=HF_TEST,
)
# Best model of the run
aml_leader = aml.leader
# Predict the test frame with the best model
HF_TEST_PREDICT = aml_leader.predict(HF_TEST)
# Results: observed vs. predicted temperature on the test period.
# y_true drops its datetime index so both columns share a 0..n-1 index.
df_compare = pd.DataFrame()
df_compare['y_true'] = DF_TEST['temperature'].reset_index(drop=True)
# Select the prediction column by name instead of assigning the whole
# DataFrame to a single column, which only works while the prediction frame
# happens to have exactly one column.
# NOTE(review): 'predict' is the column name H2O uses for regression output -
# confirm against the H2O version in use.
df_compare['y_pred'] = h2o.as_list(HF_TEST_PREDICT, use_pandas=True)['predict']
# Metrics: linear regression of the observed values on the predictions; the
# squared correlation coefficient is printed.
# NOTE(review): this is the squared Pearson correlation between y_pred and
# y_true, which does not penalise a systematic offset or scale error in the
# predictions - confirm it is the metric you want to report.
fit2 = stats.linregress(x=df_compare['y_pred'], y=df_compare['y_true'])
slope2, intercept2, r_value2, p_value2, std_err2 = fit2
print('R² Coefficient = ', r_value2 * r_value2)
# Overlay the observed and predicted temperature series
for series in ('y_true', 'y_pred'):
    plt.plot(df_compare[series])
plt.ylabel('Temperature', fontsize=24)
plt.legend(['y_true', 'y_pred'], fontsize=24)
plt.show()
# Shut down the H2O cluster now that both experiments are finished
cluster = h2o.cluster()
cluster.shutdown()