MILES-GUESS Regression Example Notebook#
John Schreck, David John Gagne, Charlie Becker, Gabrielle Gantos, Dhamma Kimpara, Thomas Martin - distinguish between authors and contributors - add orchids for people?
Objective#
This notebook is meant to demonstrate a functional example of how to use the miles-guess repository for training an evidential regression model.
Steps to get this notebook running: 1) Follow package installation steps in the miles-guess ReadMe. 2) Run the cells in this notebook in order.
Key Points#
Evidential deep learning is capable of training a single model to simultaneously learn the problem and estimate the problem’s aleatoric and epistemic uncertainties. See this paper here().
This notebook represents an example of evidential deep learning for a regression problem. Specifically, that regression problem
Data? (Make sure to mention the subsetting of the dataset for ptype)
Parameters?
What will user be prepared to do with miles-guess after running this notebook?
Notes I can add#
Here is where data comes in
Here’s where you can vary a parameter or not
Defaults to parameters
Limits to the parameters (only good for this timestep, only good for this seed, etc)
Physical meaning to these parameters?
[ ]:
import os, tqdm, yaml
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from mlguess.keras.models import GaussianRegressorDNN, EvidentialRegressorDNN
from mlguess.keras.models import BaseRegressor as RegressorDNN
from mlguess.keras.callbacks import get_callbacks
from mlguess.regression_uq import compute_results
Config File#
Load the config file#
[2]:
config = "../config/surface_layer/mlp.yml"
with open(config) as cf:
conf = yaml.load(cf, Loader=yaml.FullLoader)
Data#
Load Surface Layer data from the repo#
[ ]:
data = pd.read_csv("../data/sample_cabauw_surface_layer.csv")
data["day"] = data["Time"].apply(lambda x: str(x).split(" ")[0])
Train-Valid-Test Splits#
This is a two-step process: 1. Split all of the data on the day column between train (90%) and test (10%). The test data will be consisten accross all trained models and all data and model ensembles. 2. Split the 90% training data from Step 1 into training and validation.
[ ]:
# TODO: Why do we need two seeds?
data_seed = 0
flat_seed = 1000
[ ]:
# Need the same test_data for all trained models (data and model ensembles)
gsp = GroupShuffleSplit(n_splits=1, random_state=flat_seed, train_size=0.9)
splits = list(gsp.split(data, groups=data["day"]))
train_index, test_index = splits[0]
train_data, test_data = data.iloc[train_index].copy(), data.iloc[test_index].copy()
# Make N train-valid splits using day as grouping variable
gsp = GroupShuffleSplit(n_splits=1, random_state=flat_seed, train_size=0.885)
splits = list(gsp.split(train_data, groups=train_data["day"]))
train_index, valid_index = splits[data_seed]
train_data, valid_data = train_data.iloc[train_index].copy(), train_data.iloc[valid_index].copy()
Data Scaling#
[ ]:
input_cols = conf["data"]["input_cols"]
#TODO: Should we include the other two output variables as potential options?
output_cols = ["friction_velocity:surface:m_s-1"]
[ ]:
x_scaler, y_scaler = RobustScaler(), MinMaxScaler((0, 1))
x_train = x_scaler.fit_transform(train_data[input_cols])
x_valid = x_scaler.transform(valid_data[input_cols])
x_test = x_scaler.transform(test_data[input_cols])
y_train = y_scaler.fit_transform(train_data[output_cols])
y_valid = y_scaler.transform(valid_data[output_cols])
y_test = y_scaler.transform(test_data[output_cols])
Train the model#
[ ]:
# TODO: Why overwrite these variables in the config?
conf["model"]["epochs"] = 1
conf["model"]["verbose"] = 1
[ ]:
model = RegressorDNN(**conf["model"])
model.build_neural_network(x_train.shape[-1], y_train.shape[-1])
[ ]:
model.fit(x_train,
y_train,
validation_data=(x_valid, y_valid),
callbacks=get_callbacks(conf))
[ ]:
y_pred = model.predict(x_test, y_scaler)
[ ]:
mae = np.mean(np.abs(y_pred[:, 0]-test_data[output_cols[0]]))
mae
[ ]:
monte_carlo_steps = 10
[ ]:
results = model.predict_monte_carlo(x_test, monte_carlo_steps, y_scaler)
[ ]:
mu_ensemble = np.mean(results, axis=0)
var_ensemble = np.var(results, axis=0)
[ ]:
config = "../config/surface_layer/gaussian.yml"
with open(config) as cf:
conf = yaml.load(cf, Loader=yaml.FullLoader)
conf["model"]["epochs"] = 1
conf["model"]["verbose"] = 1
[ ]:
gauss_model = GaussianRegressorDNN(**conf["model"])
gauss_model.build_neural_network(x_train.shape[-1], y_train.shape[-1])
[ ]:
gauss_model.fit(
x_train,
y_train,
validation_data=(x_valid, y_valid),
callbacks=get_callbacks(conf)
)
[ ]:
mu, var = gauss_model.predict_uncertainty(x_test, y_scaler)
[ ]:
# compute variance and std from learned parameters
#mu, var = gauss_model.calc_uncertainties(y_pred, y_scaler)
[ ]:
mae = np.mean(np.abs(mu[:, 0]-test_data[output_cols[0]]))
print(mae, np.mean(var) ** (1/2))
[ ]:
sns.jointplot(x=test_data[output_cols[0]], y=mu[:, 0], kind='hex')
plt.xlabel('Target')
plt.ylabel('Predicted Target')
plt.show()
[ ]:
sns.jointplot(x=mu[:, 0], y=np.sqrt(var)[:, 0], kind='hex')
plt.xlabel('Predicted mu')
plt.ylabel('Predicted sigma')
plt.show()
[ ]:
config = "../config/surface_layer/evidential.yml"
with open(config) as cf:
conf = yaml.load(cf, Loader=yaml.FullLoader)
conf["model"]["epochs"] = 5
conf["model"]["verbose"] = 1
[ ]:
ev_model = EvidentialRegressorDNN(**conf["model"])
ev_model.build_neural_network(x_train.shape[-1], y_train.shape[-1])
[ ]:
ev_model.fit(
x_train,
y_train,
validation_data=(x_valid, y_valid),
callbacks=get_callbacks(conf))
[ ]:
result = ev_model.predict_uncertainty(x_test, scaler=y_scaler)
mu, aleatoric, epistemic = result
[ ]:
mae = np.mean(np.abs(mu[:, 0] - test_data[output_cols[0]]))
print(mae, np.mean(aleatoric)**(1/2), np.mean(epistemic)**(1/2))
[ ]:
compute_results(test_data,
output_cols,
mu,
np.sqrt(aleatoric),
np.sqrt(epistemic))
[ ]:
config = "../config/surface_layer/gaussian.yml"
with open(config) as cf:
conf = yaml.load(cf, Loader=yaml.FullLoader)
conf["save_loc"] = "./"
conf["model"]["epochs"] = 1
conf["model"]["verbose"] = 0
n_splits = conf["ensemble"]["n_splits"]
[ ]:
# make save directory for model weights
os.makedirs(os.path.join(conf["save_loc"], "cv_ensemble", "models"), exist_ok=True)
[ ]:
data_seed = 0
gsp = GroupShuffleSplit(n_splits=1, random_state=flat_seed, train_size=0.9)
splits = list(gsp.split(data, groups=data["day"]))
train_index, test_index = splits[0]
[ ]:
ensemble_mu = np.zeros((n_splits, test_data.shape[0], 1))
ensemble_var = np.zeros((n_splits, test_data.shape[0], 1))
for data_seed in tqdm.tqdm(range(n_splits)):
data = pd.read_csv(fn)
data["day"] = data["Time"].apply(lambda x: str(x).split(" ")[0])
# Need the same test_data for all trained models (data and model ensembles)
flat_seed = 1000
gsp = GroupShuffleSplit(n_splits=1,
random_state=flat_seed,
train_size=0.9)
splits = list(gsp.split(data, groups=data["day"]))
train_index, test_index = splits[0]
train_data, test_data = data.iloc[train_index].copy(), data.iloc[test_index].copy()
# Make N train-valid splits using day as grouping variable
gsp = GroupShuffleSplit(n_splits=n_splits, random_state=flat_seed, train_size=0.885)
splits = list(gsp.split(train_data, groups=train_data["day"]))
train_index, valid_index = splits[data_seed]
train_data, valid_data = train_data.iloc[train_index].copy(), train_data.iloc[valid_index].copy()
x_scaler, y_scaler = RobustScaler(), MinMaxScaler((0, 1))
x_train = x_scaler.fit_transform(train_data[input_cols])
x_valid = x_scaler.transform(valid_data[input_cols])
x_test = x_scaler.transform(test_data[input_cols])
y_train = y_scaler.fit_transform(train_data[output_cols])
y_valid = y_scaler.transform(valid_data[output_cols])
y_test = y_scaler.transform(test_data[output_cols])
model = GaussianRegressorDNN(**conf["model"])
model.build_neural_network(x_train.shape[-1], y_train.shape[-1])
model.fit(
x_train,
y_train,
validation_data=(x_valid, y_valid),
callbacks=get_callbacks(conf))
model.model_name = f"cv_ensemble/models/model_seed0_split{data_seed}.h5"
model.save_model()
# Save the best model
model.model_name = "cv_ensemble/models/best.h5"
model.save_model()
mu, var = model.predict_uncertainty(x_test, y_scaler)
mae = np.mean(np.abs(mu[:, 0]-test_data[output_cols[0]]))
ensemble_mu[data_seed] = mu
ensemble_var[data_seed] = var
[ ]:
model = GaussianRegressorDNN().load_model(conf)
[ ]:
ensemble_mu, ensemble_var = model.predict_ensemble(x_test, scaler=y_scaler)
[ ]:
epistemic = np.var(ensemble_mu, axis=0)
aleatoric = np.mean(ensemble_var, axis=0)
[ ]:
print(epistemic.mean()**(1/2), aleatoric.mean()**(1/2))
[ ]:
compute_results(test_data,
output_cols,
np.mean(ensemble_mu, axis=0),
np.sqrt(aleatoric),
np.sqrt(epistemic))
[ ]:
monte_carlo_steps = 10
ensemble_mu, ensemble_var = model.predict_monte_carlo(x_test,
monte_carlo_steps,
scaler=y_scaler)
[ ]:
ensemble_epistemic = np.var(ensemble_mu, axis=0)
ensemble_aleatoric = np.mean(ensemble_var, axis=0)
ensemble_mean = np.mean(ensemble_mu, axis=0)