Train a Neural Network - Alternative with Keras#

The following cells show how the same model for our regression task can be set up with Keras and TensorFlow as an alternative.

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import load_model

import optuna
# Optional TensorFlow settings (determinism, seeding, CPU-only); uncomment as needed
#tf.config.optimizer.set_jit(False)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
tf.config.list_physical_devices()
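If reproducible runs are needed, the commented-out lines above can be enabled together with explicit seeds. A minimal sketch, assuming full determinism is desired (the exact effect depends on the TensorFlow version and hardware):

import random

# Fix all relevant seeds; enable_op_determinism may slow training down
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)
tf.config.experimental.enable_op_determinism()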

Load and prepare data#

df = pd.read_csv('data/HB_data.csv', sep=',')

df_val = df.sample(frac=0.05, random_state=42)
df_tt = df.drop(df_val.index)

target_col = "energy"
X = df_tt.drop(columns=[target_col])
y = df_tt[target_col].to_numpy().reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preprocess and transform features
numerical_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

# Define scaling and encoding for X
x_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    ],
    remainder="drop",
)
# Define scaling for y
y_scaler = StandardScaler()

# Fit scaling for X on train data only, apply (transform) scaling to both train and test
X_train_t = x_preprocessor.fit_transform(X_train)
X_test_t = x_preprocessor.transform(X_test)

# Fit scaling for y on train data only, apply (transform) scaling to both train and test
y_train_t = y_scaler.fit_transform(y_train)
y_test_t = y_scaler.transform(y_test)

# Get transformed feature names (handy for inspection)
feat_names = x_preprocessor.get_feature_names_out()
    
# Inspect transformed data
print("X_train: ", X_train_t.shape)
print("X_test:  ", X_test_t.shape)
print("y_train: ", y_train_t.shape)
print("y_test:  ", y_test_t.shape)
display(pd.DataFrame(X_train_t, columns=feat_names).head(5))
X_train:  (1244, 16)
X_test:   (312, 16)
y_train:  (1244, 1)
y_test:   (312, 1)
num__bo-acc num__bo-donor num__q-acc num__q-donor num__q-hatom num__dist-dh num__dist-ah cat__atomtype-acc_Cl cat__atomtype-acc_F cat__atomtype-acc_N cat__atomtype-acc_O cat__atomtype-acc_S cat__atomtype-don_F cat__atomtype-don_N cat__atomtype-don_O cat__atomtype-don_S
0 -1.285813 0.917745 -0.115091 -0.282173 0.864921 -0.662341 1.298361 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
1 -0.647649 0.186561 0.970476 -0.771634 -0.619231 -0.258013 1.832981 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2 -1.162635 -0.394051 0.083728 -0.078369 0.426753 -0.294188 0.813984 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
3 0.306321 -0.886830 0.274388 0.630579 0.169388 -0.271833 -1.260058 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
4 2.782975 0.231048 -2.398073 -1.403848 -1.551664 -0.384992 -0.519736 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
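The same scaling has to be applied to any new data at inference time, so it is useful to persist the fitted preprocessor and target scaler alongside the model. A minimal sketch using joblib (file names are arbitrary):

import joblib

# Save the fitted transformers so new samples can be preprocessed identically later
joblib.dump(x_preprocessor, "x_preprocessor.joblib")
joblib.dump(y_scaler, "y_scaler.joblib")

# Reload, e.g. in a separate inference script
x_preprocessor = joblib.load("x_preprocessor.joblib")
y_scaler = joblib.load("y_scaler.joblib")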
# Define a model with a parameterizable architecture using Keras
def create_model(in_dim, hidden=(64, 32), dropout=0.1, lr=1e-3, weight_decay=1e-5):
    inputs = keras.Input(shape=(in_dim,))
    x = inputs
    for h in hidden:
        x = layers.Dense(
            h,
            activation="relu",
            kernel_regularizer=regularizers.l2(weight_decay)
        )(x)
        x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, activation="linear")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mse",
        metrics=[keras.metrics.RootMeanSquaredError(name="rmse")]
    )
    return model
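Before starting the search, the function can be called once with its defaults as a quick sanity check that the architecture builds and the input shape matches; a minimal sketch:

# Sanity check: build a model with default hyperparameters and inspect it
model = create_model(in_dim=X_train_t.shape[1])
model.summary()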
# Define the Optuna objective: one full training run per sampled hyperparameter configuration
# Also define a directory to store model checkpoints during the search
CKPT_DIR = "optuna_ckpts"
os.makedirs(CKPT_DIR, exist_ok=True)

def objective(trial):
    hidden_layers_list = [(256,128,64), (128,64), (64,32), (64,)]
    idx = trial.suggest_int("hidden_layers_idx", 0, len(hidden_layers_list)-1)
    hidden_layers = hidden_layers_list[idx]
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    lr = trial.suggest_float("lr", 5e-4, 3e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16,32,64,128,256])
    epochs = 50

    # One checkpoint file per trial, so a later (worse) trial cannot overwrite the best model
    model_path = os.path.join(CKPT_DIR, f"trial_{trial.number}.keras")

    model = create_model(
        in_dim=X_train_t.shape[1],
        hidden=hidden_layers,
        dropout=dropout,
        lr=lr,
        weight_decay=weight_decay,
    )

    history = model.fit(
        X_train_t, y_train_t,
        validation_data=(X_test_t, y_test_t),
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
    )

    # Evaluate in original units (unscaled y)
    y_pred = y_scaler.inverse_transform(model.predict(X_test_t))
    y_true = y_scaler.inverse_transform(y_test_t)
    val_rmse = math.sqrt(np.mean((y_pred - y_true) ** 2))

    # Store this trial's trained model
    model.save(model_path)
    
    # Report to Optuna
    trial.set_user_attr("keras_history", history.history)
    trial.set_user_attr("best_model", model_path)

    return val_rmse
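Each trial currently trains for the full 50 epochs. A common refinement, sketched below, is to pass a Keras EarlyStopping callback to model.fit inside the objective so that stalled trials stop early (the patience value is an assumption to tune):

# Stop a trial once val_loss stops improving and restore the best weights seen
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,  # assumed value
    restore_best_weights=True,
)

history = model.fit(
    X_train_t, y_train_t,
    validation_data=(X_test_t, y_test_t),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stop],
    verbose=0,
)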
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=600)
best_trial = study.best_trial
best_params = best_trial.params
display("Params:", best_params)
print("Best RMSE:", best_trial.value)
'Params:'
{'hidden_layers_idx': 1,
 'dropout': 0.03851177664057214,
 'lr': 0.0009183114007974771,
 'weight_decay': 5.794506140272295e-05,
 'batch_size': 64}
Best RMSE: 2.553923289711433
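Beyond the best trial, Optuna can summarize the whole search. A small sketch (the plot requires plotly to be installed):

# Tabular overview of all trials, sorted by validation RMSE
df_trials = study.trials_dataframe().sort_values("value")
display(df_trials.head())

# Optional: which hyperparameters influenced the objective most
# optuna.visualization.plot_param_importances(study).show()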
# Analyze the best trial's history
history_dict = best_trial.user_attrs["keras_history"]

df_history = pd.DataFrame(history_dict)
df_history[['loss', 'val_loss']].plot(legend=True)
plt.show()
[Figure: training (loss) and validation (val_loss) MSE per epoch for the best trial]
best_model_path = best_trial.user_attrs["best_model"]
best_model = load_model(best_model_path)
best_model.summary()
Model: "functional_19"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_19 (InputLayer)     │ (None, 16)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_60 (Dense)                │ (None, 64)             │         1,088 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_41 (Dropout)            │ (None, 64)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_61 (Dense)                │ (None, 32)             │         2,080 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_42 (Dropout)            │ (None, 32)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_62 (Dense)                │ (None, 1)              │            33 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 9,605 (37.52 KB)
 Trainable params: 3,201 (12.50 KB)
 Non-trainable params: 0 (0.00 B)
 Optimizer params: 6,404 (25.02 KB)
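The 5% holdout set df_val from the beginning was never touched during tuning, so it can serve as a final check of the selected model; a minimal sketch reusing the fitted x_preprocessor and y_scaler:

# Final evaluation on the untouched holdout set, in original units
X_val_t = x_preprocessor.transform(df_val.drop(columns=[target_col]))
y_val = df_val[target_col].to_numpy().reshape(-1, 1)

y_val_pred = y_scaler.inverse_transform(best_model.predict(X_val_t))
print("Holdout RMSE:", math.sqrt(np.mean((y_val_pred - y_val) ** 2)))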