Introduction¶

This notebook uses an hourly time series covering five years of weather data to explore H2O's AutoML for temperature forecasting.

Imports¶

# Imports
import h2o
import scipy
import pandas as pd
from datetime import datetime
import matplotlib as m
from h2o.automl import H2OAutoML
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

# Global plot styling: start from the 'fivethirtyeight' style sheet, then
# override font sizes, text colour and the default figure size
plt.style.use('fivethirtyeight')
m.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': 'k',
})
from pylab import rcParams
rcParams['figure.figsize'] = (18, 8)

# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -v --iv

h2o        3.30.0.6
pandas     1.0.5
matplotlib 3.2.2
scipy      1.5.0
CPython 3.7.7
IPython 7.16.1

Load Data¶

# Location of the raw CSV and the timestamp layout used in its 'date' column
DATA_URL = 'https://raw.githubusercontent.com/Matheus-Schmitz/H2O_TimeSeries_AutoML/master/data/weather_5_years.csv'
DATE_FORMAT = '%m/%d/%Y %H:%M'


def dateparse(x):
    """Parse a single timestamp string written in DATE_FORMAT."""
    return datetime.strptime(x, DATE_FORMAT)


# Read the first column (the dates) as the index, then parse the whole index
# in one vectorised call with an explicit format. This avoids read_csv's
# `date_parser` argument (deprecated in newer pandas) and the per-row Python
# strptime call it triggers, while producing the same DatetimeIndex.
df = pd.read_csv(DATA_URL, index_col = 0)
df.index = pd.to_datetime(df.index, format = DATE_FORMAT)

df.head()

# Dataset dimensions as (rows, columns)
df.shape

(43800, 4)

# The index is built from the 'date' column: a DatetimeIndex of hourly
# timestamps, which is what characterizes this dataset as a time series
df.index

DatetimeIndex(['2010-01-02 00:00:00', '2010-01-02 01:00:00',
               '2010-01-02 02:00:00', '2010-01-02 03:00:00',
               '2010-01-02 04:00:00', '2010-01-02 05:00:00',
               '2010-01-02 06:00:00', '2010-01-02 07:00:00',
               '2010-01-02 08:00:00', '2010-01-02 09:00:00',
               ...
               '2014-12-31 14:00:00', '2014-12-31 15:00:00',
               '2014-12-31 16:00:00', '2014-12-31 17:00:00',
               '2014-12-31 18:00:00', '2014-12-31 19:00:00',
               '2014-12-31 20:00:00', '2014-12-31 21:00:00',
               '2014-12-31 22:00:00', '2014-12-31 23:00:00'],
              dtype='datetime64[ns]', name='date', length=43800, freq=None)

Exploratory Analysis¶

# Plot each variable in its own subplot, stacked vertically (one row per column)

# The number of subplot rows follows the number of columns in the dataset,
# so the loop and the grid can never disagree
n_series = len(df.columns)

# Plot
plt.figure()
for row, name in enumerate(df.columns, start = 1):
    plt.subplot(n_series, 1, row)
    # Values are plotted against their position in the series, not the dates
    plt.plot(df[name].values)
    plt.title(name, y = 0.5, loc = 'right')

plt.show()

Feature Engineering¶

# Work on a copy so the raw data in df stays untouched
df2 = df.copy()

# Control parameters

# Number of lags
num_lags = 3

# Delay (aka how many hours ahead to predict)
delay = 1

# Rolling window size
ws = 3

# Snapshot the original column names first: the loop below adds new columns
# to df2, and it must only ever walk over the original variables
source_columns = list(df2.columns)

# For every original variable and every lag in [delay, delay + num_lags):
#   <col>_lag<k>         -> the value k rows (hours) earlier
#   <col>_rolling_avg<k> -> mean of the ws values ending k rows earlier
for column in source_columns:
    for lag in range(delay, num_lags + delay):
        lagged = df2[column].shift(lag)
        df2[column + '_lag' + str(lag)] = lagged
        df2[column + '_rolling_avg' + str(lag)] = lagged.rolling(window = ws).mean()

df2.head()

# Drop every row that contains a missing value; the first rows have no
# lag / rolling-window history yet and are therefore NaN in the new columns
df2 = df2.dropna()
df2.shape

(43795, 28)

# Boolean mask over the columns that defines the final dataset:
# keep the target ('temperature') and every engineered feature ('lag' /
# 'rolling' in the name). The only columns left out are the current-hour
# 'wind', 'snow' and 'rain' readings.
mask = df2.columns.str.contains('temperature|lag|rolling')
mask

array([ True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

# Final dataset: only the columns selected by the mask
df_final = df2.loc[:, mask]

# Inspect which columns were kept
df_final.columns

Index(['temperature', 'temperature_lag1', 'temperature_rolling_avg1',
       'temperature_lag2', 'temperature_rolling_avg2', 'temperature_lag3',
       'temperature_rolling_avg3', 'wind_lag1', 'wind_rolling_avg1',
       'wind_lag2', 'wind_rolling_avg2', 'wind_lag3', 'wind_rolling_avg3',
       'snow_lag1', 'snow_rolling_avg1', 'snow_lag2', 'snow_rolling_avg2',
       'snow_lag3', 'snow_rolling_avg3', 'rain_lag1', 'rain_rolling_avg1',
       'rain_lag2', 'rain_rolling_avg2', 'rain_lag3', 'rain_rolling_avg3'],
      dtype='object')

Train Test Split¶

# Chronological split: the last 365 days of hourly rows form the test set,
# everything before them is used for training (roughly 80/20)
test_rows = 24 * 365
df_train = df_final.iloc[:-test_rows]
df_test = df_final.iloc[-test_rows:]

# Show the temperature series of both partitions on a shared time axis
for part, label in ((df_train, 'Train'), (df_test, 'Test')):
    plt.plot(part.index, part['temperature'], label = label)
plt.ylabel('Temperature', fontsize = 24)
plt.legend(fontsize = 24)
plt.show()

GPU¶

# Shell out to nvidia-smi to report the GPU and driver present on this machine
!nvidia-smi

Mon Jul 27 09:48:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 446.14       Driver Version: 446.14       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|

H2O Cluster & Frames¶

# Initialize the cluster: connects to an H2O instance on localhost, and
# starts a local H2O server first if none is running
h2o.init()

|   0  GeForce GTX 960M   WDDM  | 00000000:01:00.0 Off |                  N/A |Checking whether there is an H2O instance running at http://localhost:54321 
.| N/A   48C    P8    N/A /  N/A |     37MiB /  2048MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU                  PID   Type   Process name                  GPU Memory |
|                                                                  Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
.... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.251-b08, mixed mode)

C:\Users\Matheus\Anaconda3\lib\site-packages\h2o\backend\server.py:385: UserWarning:   You have a 32-bit version of Java. H2O works best with 64-bit Java.
  Please download the latest 64-bit Java SE JDK from Oracle.

  warn("  You have a 32-bit version of Java. H2O works best with 64-bit Java.\n"

  Starting server from C:\Users\Matheus\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Matheus\AppData\Local\Temp\tmplv4c8zu1
  JVM stdout: C:\Users\Matheus\AppData\Local\Temp\tmplv4c8zu1\h2o_Matheus_started_from_python.out
  JVM stderr: C:\Users\Matheus\AppData\Local\Temp\tmplv4c8zu1\h2o_Matheus_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.

# Convert the train dataframe from pandas to an H2OFrame, the structure
# passed to AutoML as training_frame
hf_train = h2o.H2OFrame(df_train)

Parse progress: |█████████████████████████████████████████████████████████| 100%

# Convert the test dataframe from pandas to an H2OFrame, used for the
# leaderboard and for the final predictions
hf_test = h2o.H2OFrame(df_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%

# Summary of the new H2O frame: row/column counts, per-column type and
# basic statistics, followed by the first rows
hf_train.describe()

Rows:35035
Cols:25

# Target column to forecast
y = 'temperature'

# Predictors: every column of the training frame except the target
# (X.index raises ValueError if the target column is missing)
X = list(hf_train.columns)
del X[X.index(y)]

X

['temperature_lag1',
 'temperature_rolling_avg1',
 'temperature_lag2',
 'temperature_rolling_avg2',
 'temperature_lag3',
 'temperature_rolling_avg3',
 'wind_lag1',
 'wind_rolling_avg1',
 'wind_lag2',
 'wind_rolling_avg2',
 'wind_lag3',
 'wind_rolling_avg3',
 'snow_lag1',
 'snow_rolling_avg1',
 'snow_lag2',
 'snow_rolling_avg2',
 'snow_lag3',
 'snow_rolling_avg3',
 'rain_lag1',
 'rain_rolling_avg1',
 'rain_lag2',
 'rain_rolling_avg2',
 'rain_lag3',
 'rain_rolling_avg3']

Training With H2O AutoML¶

Instead of picking a number of models, I'll simply define max_runtime_secs and let H2O go through as many models as possible in that time.

# Create the AutoML run, budgeted by wall-clock time (300 seconds) rather
# than by a fixed number of models
model_aml = H2OAutoML(max_runtime_secs = 300)

%%time
# Run the AutoML search: predict y from the lag / rolling-average features in X.
# NOTE(review): leaderboard_frame is hf_test, so models are ranked (and the
# leader is picked) on the same frame that is later used to report the test
# metrics, which can make those metrics optimistic. Consider ranking on a
# separate validation period instead; TODO confirm intent.
model_aml.train(x = X,
                y = y,
                training_frame = hf_train,
                leaderboard_frame = hf_test)

AutoML progress: |
09:48:31.958: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%
Wall time: 5min 6s

Model Performance¶

# Get the leaderboard: one row per trained model with its metrics on the
# leaderboard_frame passed to train()
models = model_aml.leaderboard

# View trained models
models

# Get the best model (the entry at the top of the leaderboard)
model_leader = model_aml.leader

# Save the leader model under the models/ directory; the call returns the
# path of the saved model. force = True presumably allows overwriting an
# existing save with the same name (verify against the H2O docs).
h2o.save_model(model = model_leader,
               path = 'models/',
               force = True)

'C:\\Portfolio\\H2O_TimeSeries_AutoML\\models\\StackedEnsemble_AllModels_AutoML_20200727_094831'

Predictions¶

# Predicting with best model: score every row of the test frame with the leader
hf_test_predict = model_leader.predict(hf_test)

stackedensemble prediction progress: |████████████████████████████████████| 100%

# Preview the first 5 predictions
hf_test_predict.head(5)

# Shape of the prediction frame: one row per test sample, a single column
hf_test_predict.shape

(8760, 1)

# Bring the predictions back into pandas and pair them, row by row, with the
# true test temperatures (both re-indexed from 0 so they line up by position)
predicted = h2o.as_list(hf_test_predict, use_pandas = True)
df_results = pd.DataFrame({
    'y_true': df_test['temperature'].reset_index(drop = True),
    'y_pred': predicted['predict'],
})
df_results.head()

# Fit a straight line of y_true against y_pred with Scipy's linregress() and
# use the squared correlation coefficient as the goodness-of-fit score.
# NOTE(review): this is the squared Pearson correlation between predictions
# and truth, which is not the same quantity as 1 - SS_res/SS_tot when the
# predictions are biased or mis-scaled.
slope, intercept, r_value, p_value, std_err = stats.linregress(df_results['y_pred'],
                                                               df_results['y_true'])
r_squared = r_value * r_value

print('R² Coefficient = ', r_squared)

R² Coefficient =  0.9863831131845733

# Overlay true and predicted temperatures for the whole test period
for series_name in ('y_true', 'y_pred'):
    plt.plot(df_results[series_name], label = series_name)
plt.ylabel('Temperature', fontsize = 24)
plt.legend(fontsize = 24)
plt.show()

Putting Everything Together¶

This section reruns the whole pipeline in a single cell, this time predicting 5 hours ahead (delay = 5) and using 5 lags.

%%time

# Copy original df
df3 = df.copy()

# Number of lags
num_lags = 5

# Delay (aka how many hours ahead to predict)
delay = 5

# Rolling window size
ws = 3

# Loop to apply the lag to each variable in the dataset
for column in df3:
    for lag in range(delay, num_lags+delay):
        df3[column + '_lag' + str(lag)] = df3[column].shift(lag)
        df3[column + '_rolling_avg' + str(lag)] = df3[column].shift(lag).rolling(window=ws).mean()
        
# Remove NA values
df3.dropna(inplace = True)

# Mask
mask = (df3.columns.str.contains('temperature') | df3.columns.str.contains('lag') | df3.columns.str.contains('rolling'))

# Final dataset
df_processed = df3[df3.columns[mask]]

# Using the first 4 years to predict the last (80/20 split)
DF_TRAIN = df_processed[:-24*365]
DF_TEST = df_processed[-24*365:]

# H2O Frame
HF_TRAIN = h2o.H2OFrame(DF_TRAIN)
HF_TEST = h2o.H2OFrame(DF_TEST)

# Split X and Y
TARGET = 'temperature'
PREDICTORS = HF_TRAIN.columns
PREDICTORS.remove(TARGET)

# Create model
aml = H2OAutoML(max_runtime_secs = 300)

# Train model
aml.train(x = PREDICTORS,  y = TARGET, training_frame = HF_TRAIN, leaderboard_frame = HF_TEST)

# Best model
aml_leader = aml.leader

# Predictions
HF_TEST_PREDICT = aml_leader.predict(HF_TEST)

# Results
df_compare = pd.DataFrame()
df_compare['y_true'] = DF_TEST['temperature'].reset_index(drop = True)
df_compare['y_pred'] = h2o.as_list(HF_TEST_PREDICT, use_pandas = True)

# Metrics
slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(x = df_compare['y_pred'],
                                                               y = df_compare['y_true'])

print('R² Coefficient = ', r_value2 * r_value2)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |
09:53:44.911: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
R² Coefficient =  0.8910766347739651
Wall time: 7min 49s

# Overlay true and predicted temperatures for the 5-hours-ahead model
for series_name in ('y_true', 'y_pred'):
    plt.plot(df_compare[series_name], label = series_name)
plt.ylabel('Temperature', fontsize = 24)
plt.legend(fontsize = 24)
plt.show()

# Shutdown H2O cluster so the local server started by h2o.init() does not
# keep running after the notebook is done
h2o.cluster().shutdown()

H2O session _sid_a0fd closed.

	temperature	wind	snow	rain	temperature_lag1	temperature_rolling_avg1	temperature_lag2	temperature_rolling_avg2	temperature_lag3	temperature_rolling_avg3	...	snow_lag2	snow_rolling_avg2	snow_lag3	snow_rolling_avg3	rain_lag1	rain_rolling_avg1	rain_lag2	rain_rolling_avg2	rain_lag3	rain_rolling_avg3
date
2010-01-02 00:00:00	-4.0	1.79	0	0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2010-01-02 01:00:00	-4.0	2.68	0	0	-4.0	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN
2010-01-02 02:00:00	-5.0	3.57	0	0	-4.0	NaN	-4.0	NaN	NaN	NaN	...	0.0	NaN	NaN	NaN	0.0	NaN	0.0	NaN	NaN	NaN
2010-01-02 03:00:00	-5.0	5.36	1	0	-5.0	-4.333333	-4.0	NaN	-4.0	NaN	...	0.0	NaN	0.0	NaN	0.0	0.0	0.0	NaN	0.0	NaN
2010-01-02 04:00:00	-5.0	6.25	2	0	-5.0	-4.666667	-5.0	-4.333333	-4.0	NaN	...	0.0	0.0	0.0	NaN	0.0	0.0	0.0	0.0	0.0	NaN

H2O_cluster_uptime:	04 secs
H2O_cluster_timezone:	America/Sao_Paulo
H2O_data_parsing_timezone:	UTC
H2O_cluster_version:	3.30.0.6
H2O_cluster_version_age:	26 days
H2O_cluster_name:	H2O_from_python_Matheus_w9gx68
H2O_cluster_total_nodes:	1
H2O_cluster_free_memory:	247.5 Mb
H2O_cluster_total_cores:	8
H2O_cluster_allowed_cores:	8
H2O_cluster_status:	accepting new members, healthy
H2O_connection_url:	http://127.0.0.1:54321
H2O_connection_proxy:	{"http": null, "https": null}
H2O_internal_security:	False
H2O_API_Extensions:	Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
Python_version:	3.7.7 final

	temperature	temperature_lag1	temperature_rolling_avg1	temperature_lag2	temperature_rolling_avg2	temperature_lag3	temperature_rolling_avg3	wind_lag1	wind_rolling_avg1	wind_lag2	wind_rolling_avg2	wind_lag3	wind_rolling_avg3	snow_lag1	snow_rolling_avg1	snow_lag2	snow_rolling_avg2	snow_lag3	snow_rolling_avg3	rain_lag1	rain_rolling_avg1	rain_lag2	rain_rolling_avg2	rain_lag3	rain_rolling_avg3
type	int	int	real	int	real	int	real	real	real	real	real	real	real	int	real	int	real	int	real	int	real	int	real	int	real
mins	-19.0	-19.0	-18.666666666666668	-19.0	-18.666666666666668	-19.0	-18.666666666666668	0.45	0.5966666666663781	0.45	0.5966666666663781	0.45	0.5966666666663781	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
mean	12.156300842015108	12.155958327386918	12.155634841349153	12.155644355644359	12.155311355311328	12.155301841016101	12.154987869273578	24.96247381190243	24.958931164073867	24.95890138432998	24.95546510632219	24.955418295989727	24.952113981256854	0.057456828885400324	0.05747585747585747	0.057485371771086057	0.05748537177108601	0.057485371771086057	0.05748537177108601	0.20990438133295256	0.20990438133295292	0.2099043813329526	0.20990438133295286	0.20990438133295267	0.20990438133295292
maxs	41.0	41.0	41.0	41.0	41.0	41.0	41.0	585.6	581.58	585.6	581.58	585.6	581.58	27.0	26.0	27.0	26.0	27.0	26.0	36.0	35.0	36.0	35.0	36.0	35.0
sigma	12.306282925177888	12.306593433108048	12.260370727523656	12.30689080642424	12.260665066052116	12.307201280746867	12.260950594252325	51.31534939819063	50.14924326332216	51.31235712897835	50.14650412530533	51.309667039615405	50.14406953075699	0.7738766478157415	0.7392920678293055	0.7738929698990649	0.7392934730344249	0.7738929698990649	0.7392934730344249	1.4900323726075717	1.4253660939434738	1.4900323726075717	1.4253660939434731	1.490032372607572	1.4253660939434736
zeros	882	882	376	882	376	882	376	0	0	0	0	0	0	34705	34594	34704	34593	34704	34593	33519	32710	33519	32710	33519	32710
missing	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
0	-6.0	-5.0	-5.0	-5.0	-4.666666666666667	-5.0	-4.333333333333333	6.25	5.060000000000001	5.36	3.870000000000001	3.57	2.68	2.0	1.0	1.0	0.3333333333333333	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	-6.0	-6.0	-5.333333333333333	-5.0	-5.0	-5.0	-4.666666666666667	7.14	6.250000000000001	6.25	5.060000000000001	5.36	3.870000000000001	3.0	2.0	2.0	1.0	1.0	0.3333333333333333	0.0	0.0	0.0	0.0	0.0	0.0
2	-5.0	-6.0	-5.666666666666667	-6.0	-5.333333333333333	-5.0	-5.0	8.93	7.440000000000001	7.14	6.250000000000001	6.25	5.060000000000001	4.0	3.0	3.0	2.0	2.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
3	-6.0	-5.0	-5.666666666666667	-6.0	-5.666666666666667	-6.0	-5.333333333333333	10.72	8.930000000000001	8.93	7.440000000000001	7.14	6.250000000000001	0.0	2.3333333333333335	4.0	3.0	3.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0
4	-5.0	-6.0	-5.666666666666667	-5.0	-5.666666666666667	-6.0	-5.666666666666667	12.51	10.72	10.72	8.930000000000001	8.93	7.440000000000001	0.0	1.3333333333333333	0.0	2.3333333333333335	4.0	3.0	0.0	0.0	0.0	0.0	0.0	0.0
5	-5.0	-5.0	-5.333333333333333	-6.0	-5.666666666666667	-5.0	-5.666666666666667	14.3	12.510000000000003	12.51	10.72	10.72	8.930000000000001	0.0	0.0	0.0	1.3333333333333333	0.0	2.3333333333333335	0.0	0.0	0.0	0.0	0.0	0.0
6	-5.0	-5.0	-5.333333333333333	-5.0	-5.333333333333333	-6.0	-5.666666666666667	17.43	14.74666666666667	14.3	12.510000000000003	12.51	10.72	1.0	0.3333333333333333	0.0	0.0	0.0	1.3333333333333333	0.0	0.0	0.0	0.0	0.0	0.0
7	-5.0	-5.0	-5.0	-5.0	-5.333333333333333	-5.0	-5.333333333333333	20.56	17.430000000000003	17.43	14.74666666666667	14.3	12.510000000000003	0.0	0.3333333333333333	1.0	0.3333333333333333	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
8	-5.0	-5.0	-5.0	-5.0	-5.0	-5.0	-5.333333333333333	23.69	20.56000000000001	20.56	17.430000000000003	17.43	14.74666666666667	0.0	0.3333333333333333	0.0	0.3333333333333333	1.0	0.3333333333333333	0.0	0.0	0.0	0.0	0.0	0.0
9	-5.0	-5.0	-5.0	-5.0	-5.0	-5.0	-5.0	27.71	23.986666666666668	23.69	20.56000000000001	20.56	17.430000000000003	0.0	0.0	0.0	0.3333333333333333	0.0	0.3333333333333333	0.0	0.0	0.0	0.0	0.0	0.0

model_id	mean_residual_deviance	rmse	mse	mae	rmsle
StackedEnsemble_AllModels_AutoML_20200727_094831	1.84971	1.36004	1.84971	0.951169	nan
StackedEnsemble_BestOfFamily_AutoML_20200727_094831	1.85401	1.36162	1.85401	0.952874	nan
GBM_grid__1_AutoML_20200727_094831_model_1	1.9091	1.3817	1.9091	0.978494	nan
GLM_1_AutoML_20200727_094831	1.97297	1.40462	1.97297	0.991196	nan
GBM_grid__1_AutoML_20200727_094831_model_2	1.99201	1.41139	1.99201	1.03775	nan
XRT_1_AutoML_20200727_094831	2.69395	1.64133	2.69395	1.17479	nan
DRF_1_AutoML_20200727_094831	2.77234	1.66503	2.77234	1.2098	nan
GBM_1_AutoML_20200727_094831	3.21305	1.7925	3.21305	1.42645	nan
DeepLearning_1_AutoML_20200727_094831	3.26027	1.80562	3.26027	1.32046	nan
GBM_2_AutoML_20200727_094831	3.77031	1.94173	3.77031	1.57439	nan

	y_true	y_pred
0	7.0	6.683505
1	7.0	6.945151
2	6.0	6.591172
3	6.0	5.845779
4	3.0	5.691440

predict
6.6835
6.94515
6.59117
5.84578
5.69144