Train a Neural Network - Alternative with Keras#

The following cells show how the same model for our regression task can be set up with Keras and TensorFlow as an alternative.

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import load_model

import optuna
# Optional TensorFlow settings (determinism, seeding, CPU-only); uncomment as needed
#tf.config.optimizer.set_jit(False)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
tf.config.list_physical_devices()
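If reproducible runs are needed, the commented-out lines above can be enabled together with explicit seeds. A minimal sketch, assuming full determinism is desired (the exact effect depends on the TensorFlow version and hardware):

import random

# Fix all relevant seeds; enable_op_determinism may slow training down
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)
tf.config.experimental.enable_op_determinism()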

Load and prepare data#

df = pd.read_csv('data/HB_data.csv', sep=',')

df_val = df.sample(frac=0.05, random_state=42)
df_tt = df.drop(df_val.index)

target_col = "energy"
X = df_tt.drop(columns=[target_col])
y = df_tt[target_col].to_numpy().reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preprocess and transform features
numerical_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

# Define scaling and encoding for X
x_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    ],
    remainder="drop",
)
# Define scaling for y
y_scaler = StandardScaler()

# Fit scaling for X on train data only, apply (transform) scaling to both train and test
X_train_t = x_preprocessor.fit_transform(X_train)
X_test_t = x_preprocessor.transform(X_test)

# Fit scaling for y on train data only, apply (transform) scaling to both train and test
y_train_t = y_scaler.fit_transform(y_train)
y_test_t = y_scaler.transform(y_test)

# Get transformed feature names (handy for inspection)
feat_names = x_preprocessor.get_feature_names_out()
    
# Inspect transformed data
print("X_train: ", X_train_t.shape)
print("X_test:  ", X_test_t.shape)
print("y_train: ", y_train_t.shape)
print("y_test:  ", y_test_t.shape)
display(pd.DataFrame(X_train_t, columns=feat_names).head(5))
X_train:  (1244, 16)
X_test:   (312, 16)
y_train:  (1244, 1)
y_test:   (312, 1)
num__bo-acc num__bo-donor num__q-acc num__q-donor num__q-hatom num__dist-dh num__dist-ah cat__atomtype-acc_Cl cat__atomtype-acc_F cat__atomtype-acc_N cat__atomtype-acc_O cat__atomtype-acc_S cat__atomtype-don_F cat__atomtype-don_N cat__atomtype-don_O cat__atomtype-don_S
0 -1.285813 0.917745 -0.115091 -0.282173 0.864921 -0.662341 1.298361 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
1 -0.647649 0.186561 0.970476 -0.771634 -0.619231 -0.258013 1.832981 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2 -1.162635 -0.394051 0.083728 -0.078369 0.426753 -0.294188 0.813984 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
3 0.306321 -0.886830 0.274388 0.630579 0.169388 -0.271833 -1.260058 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
4 2.782975 0.231048 -2.398073 -1.403848 -1.551664 -0.384992 -0.519736 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
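The same scaling has to be applied to any new data at inference time, so it is useful to persist the fitted preprocessor and target scaler alongside the model. A minimal sketch using joblib (file names are arbitrary):

import joblib

# Save the fitted transformers so new samples can be preprocessed identically later
joblib.dump(x_preprocessor, "x_preprocessor.joblib")
joblib.dump(y_scaler, "y_scaler.joblib")

# Reload, e.g. in a separate inference script
x_preprocessor = joblib.load("x_preprocessor.joblib")
y_scaler = joblib.load("y_scaler.joblib")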
# Define a model with a parameterizable architecture using Keras
def create_model(in_dim, hidden=(64, 32), dropout=0.1, lr=1e-3, weight_decay=1e-5):
    inputs = keras.Input(shape=(in_dim,))
    x = inputs
    for h in hidden:
        x = layers.Dense(
            h,
            activation="relu",
            kernel_regularizer=regularizers.l2(weight_decay)
        )(x)
        x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, activation="linear")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mse",
        metrics=[keras.metrics.RootMeanSquaredError(name="rmse")]
    )
    return model
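Before starting the search, the function can be called once with its defaults as a quick sanity check that the architecture builds and the input shape matches; a minimal sketch:

# Sanity check: build a model with default hyperparameters and inspect it
model = create_model(in_dim=X_train_t.shape[1])
model.summary()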
# Define the Optuna objective: one full training run per sampled hyperparameter configuration
# Also define a directory to store model checkpoints during the search
CKPT_DIR = "optuna_ckpts"
os.makedirs(CKPT_DIR, exist_ok=True)

def objective(trial):
    hidden_layers_list = [(256,128,64), (128,64), (64,32), (64,)]
    idx = trial.suggest_int("hidden_layers_idx", 0, len(hidden_layers_list)-1)
    hidden_layers = hidden_layers_list[idx]
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    lr = trial.suggest_float("lr", 5e-4, 3e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16,32,64,128,256])
    epochs = 50

    # One checkpoint file per trial, so a later (worse) trial cannot overwrite the best model
    model_path = os.path.join(CKPT_DIR, f"trial_{trial.number}.keras")

    model = create_model(
        in_dim=X_train_t.shape[1],
        hidden=hidden_layers,
        dropout=dropout,
        lr=lr,
        weight_decay=weight_decay,
    )

    history = model.fit(
        X_train_t, y_train_t,
        validation_data=(X_test_t, y_test_t),
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
    )

    # Evaluate in original units (unscaled y)
    y_pred = y_scaler.inverse_transform(model.predict(X_test_t))
    y_true = y_scaler.inverse_transform(y_test_t)
    val_rmse = math.sqrt(np.mean((y_pred - y_true) ** 2))

    # Store this trial's trained model
    model.save(model_path)
    
    # Report to Optuna
    trial.set_user_attr("keras_history", history.history)
    trial.set_user_attr("best_model", model_path)

    return val_rmse
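Each trial currently trains for the full 50 epochs. A common refinement, sketched below, is to pass a Keras EarlyStopping callback to model.fit inside the objective so that stalled trials stop early (the patience value is an assumption to tune):

# Stop a trial once val_loss stops improving and restore the best weights seen
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,  # assumed value
    restore_best_weights=True,
)

history = model.fit(
    X_train_t, y_train_t,
    validation_data=(X_test_t, y_test_t),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stop],
    verbose=0,
)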
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=600)
best_trial = study.best_trial
best_params = best_trial.params
display("Params:", best_params)
print("Best RMSE:", best_trial.value)
'Params:'
{'hidden_layers_idx': 1,
 'dropout': 0.03851177664057214,
 'lr': 0.0009183114007974771,
 'weight_decay': 5.794506140272295e-05,
 'batch_size': 64}
Best RMSE: 2.553923289711433
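Beyond the best trial, Optuna can summarize the whole search. A small sketch (the plot requires plotly to be installed):

# Tabular overview of all trials, sorted by validation RMSE
df_trials = study.trials_dataframe().sort_values("value")
display(df_trials.head())

# Optional: which hyperparameters influenced the objective most
# optuna.visualization.plot_param_importances(study).show()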
# Analyze the best trial's history
history_dict = best_trial.user_attrs["keras_history"]

df_history = pd.DataFrame(history_dict)
df_history[['loss', 'val_loss']].plot(legend=True)
plt.show()
[Figure: training (loss) and validation (val_loss) MSE per epoch for the best trial]
best_model_path = best_trial.user_attrs["best_model"]
best_model = load_model(best_model_path)
best_model.summary()
Model: "functional_19"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_19 (InputLayer)     │ (None, 16)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_60 (Dense)                │ (None, 64)             │         1,088 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_41 (Dropout)            │ (None, 64)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_61 (Dense)                │ (None, 32)             │         2,080 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_42 (Dropout)            │ (None, 32)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_62 (Dense)                │ (None, 1)              │            33 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 9,605 (37.52 KB)
 Trainable params: 3,201 (12.50 KB)
 Non-trainable params: 0 (0.00 B)
 Optimizer params: 6,404 (25.02 KB)
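The 5% holdout set df_val from the beginning was never touched during tuning, so it can serve as a final check of the selected model; a minimal sketch reusing the fitted x_preprocessor and y_scaler:

# Final evaluation on the untouched holdout set, in original units
X_val_t = x_preprocessor.transform(df_val.drop(columns=[target_col]))
y_val = df_val[target_col].to_numpy().reshape(-1, 1)

y_val_pred = y_scaler.inverse_transform(best_model.predict(X_val_t))
print("Holdout RMSE:", math.sqrt(np.mean((y_val_pred - y_val) ** 2)))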