Introduction

Using a time series dataset with 5 years of weather data to explore usage of H2O's AutoML

Imports

In [1]:
# Imports
import h2o
import scipy
import pandas as pd
from datetime import datetime
import matplotlib as m
from h2o.automl import H2OAutoML
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

# Plot formatting: apply the style sheet first, then override individual defaults.
plt.style.use('fivethirtyeight')
m.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': 'k',
    'figure.figsize': (18, 8),
})
# The settings live on matplotlib's own rcParams, so the `pylab` import is not
# needed; the bare name stays bound for any cell that refers to `rcParams`.
rcParams = m.rcParams
In [2]:
# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -v --iv
h2o        3.30.0.6
pandas     1.0.5
matplotlib 3.2.2
scipy      1.5.0
CPython 3.7.7
IPython 7.16.1

Load Data

In [3]:
def dateparse(x):
    """Parse one timestamp string written as month/day/year hour:minute."""
    return datetime.strptime(x, '%m/%d/%Y %H:%M')

# Read the hourly weather CSV; the first column becomes the index and the
# 'date' column is parsed with the parser above.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Matheus-Schmitz/H2O_TimeSeries_AutoML/master/data/weather_5_years.csv',
    index_col = 0,
    parse_dates = ['date'],
    date_parser = dateparse,
)
In [4]:
# Preview the first rows of the loaded frame
df.head()
Out[4]:
temperature wind snow rain
date
2010-01-02 00:00:00 -4.0 1.79 0 0
2010-01-02 01:00:00 -4.0 2.68 0 0
2010-01-02 02:00:00 -5.0 3.57 0 0
2010-01-02 03:00:00 -5.0 5.36 1 0
2010-01-02 04:00:00 -5.0 6.25 2 0
In [5]:
# (rows, columns) of the loaded frame
df.shape
Out[5]:
(43800, 4)
In [6]:
# Inspect the index: the timestamps are the frame's index,
# which is what makes this dataset a time series
df.index
Out[6]:
DatetimeIndex(['2010-01-02 00:00:00', '2010-01-02 01:00:00',
               '2010-01-02 02:00:00', '2010-01-02 03:00:00',
               '2010-01-02 04:00:00', '2010-01-02 05:00:00',
               '2010-01-02 06:00:00', '2010-01-02 07:00:00',
               '2010-01-02 08:00:00', '2010-01-02 09:00:00',
               ...
               '2014-12-31 14:00:00', '2014-12-31 15:00:00',
               '2014-12-31 16:00:00', '2014-12-31 17:00:00',
               '2014-12-31 18:00:00', '2014-12-31 19:00:00',
               '2014-12-31 20:00:00', '2014-12-31 21:00:00',
               '2014-12-31 22:00:00', '2014-12-31 23:00:00'],
              dtype='datetime64[ns]', name='date', length=43800, freq=None)

Exploratory Analysis

In [7]:
# Plot each variable in its own row of a single stacked figure.
# The number of rows follows the frame's columns, so the loop stays correct
# if columns are added or removed.
n_columns = len(df.columns)

plt.figure()
# Subplot positions are 1-based, hence start = 1
for position, column in enumerate(df.columns, start = 1):
    plt.subplot(n_columns, 1, position)
    plt.plot(df[column].values)
    # Put the variable name inside the panel, on the right-hand side
    plt.title(column, y = 0.5, loc = 'right')

plt.show()

Feature Engineering

In [8]:
# Work on a copy so the loaded frame `df` stays untouched by feature engineering
df2 = df.copy()
In [9]:
# Control parameters for the lag features built in the next cell

# Number of lags: lags delay, delay+1, ..., delay+num_lags-1 are created
num_lags = 3

# Delay (aka how many hours ahead to predict); also the smallest lag used
delay = 1

# Rolling window size (rows) for the rolling mean taken over each shifted series
ws = 3
In [10]:
# Add, for every original variable and every lag, the lagged value and the
# rolling mean of the lagged series.
# The loop walks the columns of the untouched source frame `df` rather than
# `df2` itself: df2 gains columns inside the loop, and re-running this cell
# would otherwise build lags of already-lagged columns.
for column in df.columns:
    for lag in range(delay, num_lags+delay):
        # Shift once and reuse the result for both features
        shifted = df2[column].shift(lag)
        df2[column + '_lag' + str(lag)] = shifted
        df2[column + '_rolling_avg' + str(lag)] = shifted.rolling(window=ws).mean()
In [11]:
# Preview the engineered columns; the first rows hold NaN where no earlier data exists
df2.head()
Out[11]:
temperature wind snow rain temperature_lag1 temperature_rolling_avg1 temperature_lag2 temperature_rolling_avg2 temperature_lag3 temperature_rolling_avg3 ... snow_lag2 snow_rolling_avg2 snow_lag3 snow_rolling_avg3 rain_lag1 rain_rolling_avg1 rain_lag2 rain_rolling_avg2 rain_lag3 rain_rolling_avg3
date
2010-01-02 00:00:00 -4.0 1.79 0 0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2010-01-02 01:00:00 -4.0 2.68 0 0 -4.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0.0 NaN NaN NaN NaN NaN
2010-01-02 02:00:00 -5.0 3.57 0 0 -4.0 NaN -4.0 NaN NaN NaN ... 0.0 NaN NaN NaN 0.0 NaN 0.0 NaN NaN NaN
2010-01-02 03:00:00 -5.0 5.36 1 0 -5.0 -4.333333 -4.0 NaN -4.0 NaN ... 0.0 NaN 0.0 NaN 0.0 0.0 0.0 NaN 0.0 NaN
2010-01-02 04:00:00 -5.0 6.25 2 0 -5.0 -4.666667 -5.0 -4.333333 -4.0 NaN ... 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 NaN

5 rows × 28 columns

In [12]:
# Discard every row that still has a missing value (the first rows, whose
# lag / rolling-window features could not be computed), then check the size
df2 = df2.dropna()
df2.shape
Out[12]:
(43795, 28)
In [13]:
# Boolean column mask for the modelling table.
# Keep the target ('temperature') and every engineered lag / rolling column;
# this leaves out the current-hour 'wind', 'snow' and 'rain' readings.
mask = df2.columns.str.contains('temperature|lag|rolling')
mask
Out[13]:
array([ True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])
In [14]:
# Final dataset: only the columns flagged True in the mask
df_final = df2.loc[:, mask]
In [15]:
# Check which columns survived the mask (target first, then the lag / rolling features)
df_final.columns
Out[15]:
Index(['temperature', 'temperature_lag1', 'temperature_rolling_avg1',
       'temperature_lag2', 'temperature_rolling_avg2', 'temperature_lag3',
       'temperature_rolling_avg3', 'wind_lag1', 'wind_rolling_avg1',
       'wind_lag2', 'wind_rolling_avg2', 'wind_lag3', 'wind_rolling_avg3',
       'snow_lag1', 'snow_rolling_avg1', 'snow_lag2', 'snow_rolling_avg2',
       'snow_lag3', 'snow_rolling_avg3', 'rain_lag1', 'rain_rolling_avg1',
       'rain_lag2', 'rain_rolling_avg2', 'rain_lag3', 'rain_rolling_avg3'],
      dtype='object')

Train Test Split

In [16]:
# Chronological split: the last 365 days of hourly rows form the test set and
# everything before them is used for training (roughly 80/20)
test_rows = 24 * 365
df_train = df_final.iloc[:-test_rows]
df_test = df_final.iloc[-test_rows:]
In [17]:
# Show the temperature series, coloured by which side of the split it falls on
plt.plot(df_train.index, df_train['temperature'], label = 'Train')
plt.plot(df_test.index, df_test['temperature'], label = 'Test')
plt.ylabel('Temperature', fontsize = 24)
plt.legend(fontsize = 24)
plt.show()

GPU

In [18]:
# Print the GPU / driver status reported by the nvidia-smi command-line tool
!nvidia-smi
Mon Jul 27 09:48:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 446.14       Driver Version: 446.14       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|

H2O Cluster & Frames

In [19]:
# Initialize the cluster: connect to an H2O instance on localhost if one is
# running, otherwise start a local H2O server
h2o.init()
|   0  GeForce GTX 960M   WDDM  | 00000000:01:00.0 Off |                  N/A |Checking whether there is an H2O instance running at http://localhost:54321 
.| N/A   48C    P8    N/A /  N/A |     37MiB /  2048MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU                  PID   Type   Process name                  GPU Memory |
|                                                                  Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
.... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.251-b08, mixed mode)
C:\Users\Matheus\Anaconda3\lib\site-packages\h2o\backend\server.py:385: UserWarning:   You have a 32-bit version of Java. H2O works best with 64-bit Java.
  Please download the latest 64-bit Java SE JDK from Oracle.

  warn("  You have a 32-bit version of Java. H2O works best with 64-bit Java.\n"
  Starting server from C:\Users\Matheus\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Matheus\AppData\Local\Temp\tmplv4c8zu1
  JVM stdout: C:\Users\Matheus\AppData\Local\Temp\tmplv4c8zu1\h2o_Matheus_started_from_python.out
  JVM stderr: C:\Users\Matheus\AppData\Local\Temp\tmplv4c8zu1\h2o_Matheus_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
H2O_cluster_uptime: 04 secs
H2O_cluster_timezone: America/Sao_Paulo
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.30.0.6
H2O_cluster_version_age: 26 days
H2O_cluster_name: H2O_from_python_Matheus_w9gx68
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 247.5 Mb
H2O_cluster_total_cores: 8
H2O_cluster_allowed_cores: 8
H2O_cluster_status: accepting new members, healthy
H2O_connection_url: http://127.0.0.1:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
H2O_API_Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
Python_version: 3.7.7 final
In [20]:
# Convert the train dataframe from pandas to an H2OFrame so H2O can train on it
hf_train = h2o.H2OFrame(df_train)
Parse progress: |█████████████████████████████████████████████████████████| 100%
In [21]:
# Convert the test dataframe from pandas to an H2OFrame (used for scoring and prediction)
hf_test = h2o.H2OFrame(df_test)
Parse progress: |█████████████████████████████████████████████████████████| 100%
In [22]:
# Summary of the training H2OFrame: column types, min/mean/max, zero and
# missing counts, followed by the first rows
hf_train.describe()
Rows:35035
Cols:25


temperature temperature_lag1 temperature_rolling_avg1 temperature_lag2 temperature_rolling_avg2 temperature_lag3 temperature_rolling_avg3 wind_lag1 wind_rolling_avg1 wind_lag2 wind_rolling_avg2 wind_lag3 wind_rolling_avg3 snow_lag1 snow_rolling_avg1 snow_lag2 snow_rolling_avg2 snow_lag3 snow_rolling_avg3 rain_lag1 rain_rolling_avg1 rain_lag2 rain_rolling_avg2 rain_lag3 rain_rolling_avg3
type int int real int real int real real real real real real real int real int real int real int real int real int real
mins -19.0 -19.0 -18.666666666666668 -19.0 -18.666666666666668 -19.0 -18.666666666666668 0.45 0.5966666666663781 0.45 0.5966666666663781 0.45 0.5966666666663781 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 12.15630084201510812.15595832738691812.155634841349153 12.15564435564435912.155311355311328 12.15530184101610112.154987869273578 24.9624738119024324.958931164073867 24.9589013843299824.95546510632219 24.95541829598972724.952113981256854 0.0574568288854003240.057475857475857470.0574853717710860570.057485371771086010.0574853717710860570.057485371771086010.209904381332952560.209904381332952920.20990438133295260.209904381332952860.209904381332952670.20990438133295292
maxs 41.0 41.0 41.0 41.0 41.0 41.0 41.0 585.6 581.58 585.6 581.58 585.6 581.58 27.0 26.0 27.0 26.0 27.0 26.0 36.0 35.0 36.0 35.0 36.0 35.0
sigma 12.30628292517788812.30659343310804812.260370727523656 12.30689080642424 12.260665066052116 12.30720128074686712.260950594252325 51.3153493981906350.14924326332216 51.3123571289783550.14650412530533 51.30966703961540550.14406953075699 0.7738766478157415 0.7392920678293055 0.7738929698990649 0.7392934730344249 0.7738929698990649 0.7392934730344249 1.4900323726075717 1.4253660939434738 1.49003237260757171.4253660939434731 1.490032372607572 1.4253660939434736
zeros 882 882 376 882 376 882 376 0 0 0 0 0 0 34705 34594 34704 34593 34704 34593 33519 32710 33519 32710 33519 32710
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 -6.0 -5.0 -5.0 -5.0 -4.666666666666667 -5.0 -4.333333333333333 6.25 5.060000000000001 5.36 3.870000000000001 3.57 2.68 2.0 1.0 1.0 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 -6.0 -6.0 -5.333333333333333 -5.0 -5.0 -5.0 -4.666666666666667 7.14 6.250000000000001 6.25 5.060000000000001 5.36 3.870000000000001 3.0 2.0 2.0 1.0 1.0 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0
2 -5.0 -6.0 -5.666666666666667 -6.0 -5.333333333333333 -5.0 -5.0 8.93 7.440000000000001 7.14 6.250000000000001 6.25 5.060000000000001 4.0 3.0 3.0 2.0 2.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
3 -6.0 -5.0 -5.666666666666667 -6.0 -5.666666666666667 -6.0 -5.333333333333333 10.72 8.930000000000001 8.93 7.440000000000001 7.14 6.250000000000001 0.0 2.3333333333333335 4.0 3.0 3.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
4 -5.0 -6.0 -5.666666666666667 -5.0 -5.666666666666667 -6.0 -5.666666666666667 12.51 10.72 10.72 8.930000000000001 8.93 7.440000000000001 0.0 1.3333333333333333 0.0 2.3333333333333335 4.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
5 -5.0 -5.0 -5.333333333333333 -6.0 -5.666666666666667 -5.0 -5.666666666666667 14.3 12.510000000000003 12.51 10.72 10.72 8.930000000000001 0.0 0.0 0.0 1.3333333333333333 0.0 2.3333333333333335 0.0 0.0 0.0 0.0 0.0 0.0
6 -5.0 -5.0 -5.333333333333333 -5.0 -5.333333333333333 -6.0 -5.666666666666667 17.43 14.74666666666667 14.3 12.510000000000003 12.51 10.72 1.0 0.3333333333333333 0.0 0.0 0.0 1.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0
7 -5.0 -5.0 -5.0 -5.0 -5.333333333333333 -5.0 -5.333333333333333 20.56 17.430000000000003 17.43 14.74666666666667 14.3 12.510000000000003 0.0 0.3333333333333333 1.0 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 -5.0 -5.0 -5.0 -5.0 -5.0 -5.0 -5.333333333333333 23.69 20.56000000000001 20.56 17.430000000000003 17.43 14.74666666666667 0.0 0.3333333333333333 0.0 0.3333333333333333 1.0 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0
9 -5.0 -5.0 -5.0 -5.0 -5.0 -5.0 -5.0 27.71 23.986666666666668 23.69 20.56000000000001 20.56 17.430000000000003 0.0 0.0 0.0 0.3333333333333333 0.0 0.3333333333333333 0.0 0.0 0.0 0.0 0.0 0.0
In [23]:
# Define y: the name of the target (response) column
y = 'temperature'
In [24]:
# Define X: every column name of the training frame except the target
X = hf_train.columns
X.remove(y)
In [25]:
# Show the predictor names that will be passed to AutoML
X
Out[25]:
['temperature_lag1',
 'temperature_rolling_avg1',
 'temperature_lag2',
 'temperature_rolling_avg2',
 'temperature_lag3',
 'temperature_rolling_avg3',
 'wind_lag1',
 'wind_rolling_avg1',
 'wind_lag2',
 'wind_rolling_avg2',
 'wind_lag3',
 'wind_rolling_avg3',
 'snow_lag1',
 'snow_rolling_avg1',
 'snow_lag2',
 'snow_rolling_avg2',
 'snow_lag3',
 'snow_rolling_avg3',
 'rain_lag1',
 'rain_rolling_avg1',
 'rain_lag2',
 'rain_rolling_avg2',
 'rain_lag3',
 'rain_rolling_avg3']

Training With H2O AutoML

Instead of picking a number of models, I'll simply define max_runtime_secs and let H2O go through as many models as possible in that time

In [26]:
# Create the AutoML run, bounded by wall-clock time rather than by a model count.
# A fixed seed pins the random state handed to the individual algorithms.
# Note: H2O only guarantees fully reproducible AutoML runs when `max_models`
# is used instead of a time budget (and Deep Learning is excluded), so this
# reduces run-to-run variation but does not eliminate it.
model_aml = H2OAutoML(max_runtime_secs = 300, seed = 42)
In [27]:
%%time
# Train the candidate models on the training frame within the time budget;
# the leaderboard is scored on hf_test.
# NOTE(review): hf_test is also the frame the leader is evaluated on further
# down, so the model is selected and assessed on the same data — presumably
# this makes the reported test metrics optimistic. TODO confirm / consider a
# separate validation frame.
model_aml.train(x = X,
                y = y,
                training_frame = hf_train,
                leaderboard_frame = hf_test)
AutoML progress: |
09:48:31.958: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%
Wall time: 5min 6s

Model Performance

In [28]:
# Get the leaderboard: one row per trained model with its metrics
models = model_aml.leaderboard
In [29]:
# View the trained models and their leaderboard metrics
models
model_id mean_residual_deviance rmse mse mae rmsle
StackedEnsemble_AllModels_AutoML_20200727_094831 1.849711.360041.849710.951169 nan
StackedEnsemble_BestOfFamily_AutoML_20200727_094831 1.854011.361621.854010.952874 nan
GBM_grid__1_AutoML_20200727_094831_model_1 1.9091 1.3817 1.9091 0.978494 nan
GLM_1_AutoML_20200727_094831 1.972971.404621.972970.991196 nan
GBM_grid__1_AutoML_20200727_094831_model_2 1.992011.411391.992011.03775 nan
XRT_1_AutoML_20200727_094831 2.693951.641332.693951.17479 nan
DRF_1_AutoML_20200727_094831 2.772341.665032.772341.2098 nan
GBM_1_AutoML_20200727_094831 3.213051.7925 3.213051.42645 nan
DeepLearning_1_AutoML_20200727_094831 3.260271.805623.260271.32046 nan
GBM_2_AutoML_20200727_094831 3.770311.941733.770311.57439 nan
Out[29]:

In [30]:
# Get the best model, i.e. the one ranked first on the leaderboard
model_leader = model_aml.leader
In [31]:
# Save the leader under the relative 'models/' directory; the call returns the
# path of the saved model.
# force = True presumably lets an existing file at that path be overwritten —
# confirm against the h2o.save_model documentation.
h2o.save_model(model = model_leader,
               path = 'models/',
               force = True)
Out[31]:
'C:\\Portfolio\\H2O_TimeSeries_AutoML\\models\\StackedEnsemble_AllModels_AutoML_20200727_094831'

Predictions

In [32]:
# Predicting with best model: score every row of the test frame
hf_test_predict = model_leader.predict(hf_test)
stackedensemble prediction progress: |████████████████████████████████████| 100%
In [33]:
# Peek at the first five predictions (single 'predict' column)
hf_test_predict.head(5)
predict
6.6835
6.94515
6.59117
5.84578
5.69144
Out[33]:

In [34]:
# Sanity check: one prediction per test row
hf_test_predict.shape
Out[34]:
(8760, 1)
In [35]:
# Put the observed and predicted temperatures side by side.
# The timestamps are dropped from the observed values so both columns share
# the same 0..n-1 positional index.
df_results = pd.DataFrame({
    'y_true': df_test['temperature'].reset_index(drop = True),
    'y_pred': h2o.as_list(hf_test_predict, use_pandas = True)['predict'],
})
df_results.head()
Out[35]:
y_true y_pred
0 7.0 6.683505
1 7.0 6.945151
2 6.0 6.591172
3 6.0 5.845779
4 3.0 5.691440
In [36]:
# Regress the true values on the predictions with SciPy's linregress(); the
# returned r_value is the correlation coefficient between the two columns.
# NOTE(review): r_value squared is the squared Pearson correlation, which can
# be higher than the coefficient of determination of the predictions
# themselves when they are biased or mis-scaled — confirm this is the
# intended metric.
slope, intercept, r_value, p_value, std_err = stats.linregress(x = df_results['y_pred'],
                                                               y = df_results['y_true'])
In [37]:
# Report the squared correlation coefficient from the regression above
print('R² Coefficient = ', r_value * r_value)
R² Coefficient =  0.9863831131845733
In [38]:
# Overlay the observed and the predicted temperature for the test period
plt.plot(df_results['y_true'], label = 'y_true')
plt.plot(df_results['y_pred'], label = 'y_pred')
plt.ylabel('Temperature', fontsize = 24)
plt.legend(fontsize = 24)
plt.show()

Putting Everything Together

And predicting 5 hours (delay = 5) ahead and with 5 lags!

In [39]:
%%time

# Copy original df
df3 = df.copy()

# Number of lags
num_lags = 5

# Delay (aka how many hours ahead to predict)
delay = 5

# Rolling window size
ws = 3

# Loop to apply the lag to each variable in the dataset
for column in df3:
    for lag in range(delay, num_lags+delay):
        df3[column + '_lag' + str(lag)] = df3[column].shift(lag)
        df3[column + '_rolling_avg' + str(lag)] = df3[column].shift(lag).rolling(window=ws).mean()
        
# Remove NA values
df3.dropna(inplace = True)

# Mask
mask = (df3.columns.str.contains('temperature') | df3.columns.str.contains('lag') | df3.columns.str.contains('rolling'))

# Final dataset
df_processed = df3[df3.columns[mask]]

# Using the first 4 years to predict the last (80/20 split)
DF_TRAIN = df_processed[:-24*365]
DF_TEST = df_processed[-24*365:]

# H2O Frame
HF_TRAIN = h2o.H2OFrame(DF_TRAIN)
HF_TEST = h2o.H2OFrame(DF_TEST)

# Split X and Y
TARGET = 'temperature'
PREDICTORS = HF_TRAIN.columns
PREDICTORS.remove(TARGET)

# Create model
aml = H2OAutoML(max_runtime_secs = 300)

# Train model
aml.train(x = PREDICTORS,  y = TARGET, training_frame = HF_TRAIN, leaderboard_frame = HF_TEST)

# Best model
aml_leader = aml.leader

# Predictions
HF_TEST_PREDICT = aml_leader.predict(HF_TEST)

# Results
df_compare = pd.DataFrame()
df_compare['y_true'] = DF_TEST['temperature'].reset_index(drop = True)
df_compare['y_pred'] = h2o.as_list(HF_TEST_PREDICT, use_pandas = True)

# Metrics
slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(x = df_compare['y_pred'],
                                                               y = df_compare['y_true'])

print('R² Coefficient = ', r_value2 * r_value2)
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |
09:53:44.911: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
R² Coefficient =  0.8910766347739651
Wall time: 7min 49s
In [40]:
# Overlay observed and predicted temperature for the 5-hours-ahead model
plt.plot(df_compare['y_true'], label = 'y_true')
plt.plot(df_compare['y_pred'], label = 'y_pred')
plt.ylabel('Temperature', fontsize = 24)
plt.legend(fontsize = 24)
plt.show()
In [41]:
# Shutdown H2O cluster now that training and prediction are finished
h2o.cluster().shutdown()
H2O session _sid_a0fd closed.

End