Supervised and Unsupervised Machine Learning Methods for the UrbanSound8K Dataset#

In this exercise, we will apply supervised and unsupervised machine learning techniques to classify urban sounds using the UrbanSound8K dataset. After extracting features from the audio files, we will train a K-Nearest Neighbors (KNN) classifier and visualize the data using UMAP (Uniform Manifold Approximation and Projection). Next, we will train a Convolutional Neural Network (CNN) on the same audio clips and compare its performance to KNN. UMAP will also be used to visualize one of the CNN’s last layers.

Urban Sound Dataset
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, classification_report
import warnings

warnings.filterwarnings("ignore")

Paths and devices#

# Dataset folder (relative to this notebook) and the metadata CSV inside it.
data_path = Path("..", "data")
metadata_path = data_path.joinpath("UrbanSound8K.csv")

# Pick the compute device by priority: NVIDIA GPU, then Apple Silicon, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

# Load the metadata table and preview its first rows.
metadata = pd.read_csv(metadata_path)
metadata.head()

Have a first look at the labels and the distribution of the dataset:

# Horizontal bar chart of the number of clips per class, most frequent first.
class_order = metadata["class"].value_counts().index

plt.figure(figsize=(6, 3))
sns.countplot(y=metadata["class"], order=class_order, palette="viridis")
plt.title("Classes with their counts")
plt.tight_layout()
plt.show()

Audio processing parameters#

Librosa is a Python package for music and audio analysis.

# Project-local helper from audio_processing.py. Presumably plots mel
# spectrograms (with augmentations) for clips listed in `metadata` — confirm
# against the helper's implementation.
from audio_processing import show_mel_augmentations
show_mel_augmentations(metadata, data_path)

Extract features#

from audio_processing import extract_audio_features

# One list per column. "path", "labels", "class" and "train_test" are filled
# from the metadata table below; every other list is filled from whatever
# extract_audio_features returns under the same key.
audio_dict = {
    column: []
    for column in (
        "path",
        "labels",
        "class",
        "train_test",
        "raw",
        # Feature toggles: add or remove names here.
        # Currently disabled: "rms", "spec_bw", "poly_features",
        # "spec_flatness", "spec_rolloff".
        "spec_centroid",
        "mean_mfccs",
    )
}

# The full key set is passed to the extractor so it knows what was requested.
feature_keys = set(audio_dict)

clip_rows = tqdm(
    metadata.iterrows(), total=len(metadata), desc="Extracting audio features"
)
for _idx, clip in clip_rows:
    fold = clip["fold"]

    # Clips are stored in one sub-folder per fold: <data_path>/fold<k>/<file>.
    audio_path = os.path.join(data_path, f"fold{fold}", clip["slice_file_name"])
    features = extract_audio_features(audio_path, feature_keys)

    # Metadata columns; folds 1-8 form the training split, the rest the test split.
    audio_dict["path"].append(audio_path)
    audio_dict["labels"].append(clip["classID"])
    audio_dict["class"].append(clip["class"])
    audio_dict["train_test"].append("train" if fold <= 8 else "test")

    # Store each returned feature under its own key.
    for name, value in features.items():
        audio_dict[name].append(value)

If needed, we can create new feature vectors by concatenating the extracted features.

# Choose which extracted feature serves as the model input vector.
# Alternatives:
#   * audio_dict["spec_centroid"] to use the spectral centroid instead, or
#   * a combined vector per clip, e.g. for every index i:
#       np.concatenate([audio_dict["mean_mfccs"][i], audio_dict["spec_centroid"][i]])
audio_dict["feature"] = audio_dict["mean_mfccs"]

# Table with one row per processed clip; only the listed columns are kept.
selected_columns = ["path", "labels", "class", "train_test", "feature"]
audio_df = pd.DataFrame(audio_dict, columns=selected_columns)
audio_df.head()

Let’s listen#

import IPython.display
import random
from config import SR

# Numeric class label to audition (3 is "dog_bark" in class_dict).
label_to_listen = 3

# Positions of every clip that carries the requested label.
indices = [
    i for i, num in enumerate(audio_dict["labels"]) if num == label_to_listen
]

# Build the player only when a matching clip exists. Without this guard
# `random_index` would be undefined (NameError) or, worse, left over from an
# earlier run of the cell, silently playing a clip of the wrong class.
audio_player = None
if indices:
    random_index = random.choice(indices)
    print("{} (index={})".format(audio_dict["class"][random_index], random_index))
    audio_player = IPython.display.Audio(audio_dict["raw"][random_index], rate=SR)
else:
    print(f"No clip found with label {label_to_listen}")

# Last expression of the cell: renders the player (nothing is shown for None).
audio_player

Supervised Learning Methods using the extracted features#

Here, we will use the extracted features to train three classifiers: K-Nearest Neighbors (KNN), a multi-layer perceptron (MLP), and a random forest.

Split the dataset into train and test#

def split_features_and_labels(df, feature_col="feature", label_col="labels"):
    """
    Separate a DataFrame into train/test feature and label arrays.

    Rows are assigned to a split by the value of their 'train_test' column
    ("train" or "test"); rows with any other value are ignored.

    Args:
        df: DataFrame holding the data, including a 'train_test' column
        feature_col: Name of the column that stores the feature vectors
        label_col: Name of the column that stores the labels

    Returns:
        X_train, y_train, X_test, y_test: NumPy arrays of features and labels
    """

    def _arrays_for(split_name):
        # Select the rows of one split and turn both columns into arrays.
        subset = df.loc[df["train_test"] == split_name]
        features = np.array(subset[feature_col].tolist())
        targets = np.array(subset[label_col].tolist())
        return features, targets

    X_train, y_train = _arrays_for("train")
    X_test, y_test = _arrays_for("test")
    return X_train, y_train, X_test, y_test

from sklearn.preprocessing import StandardScaler

# Train/test arrays from the feature table (the training split is X, y).
X, y, X_test, y_test = split_features_and_labels(audio_df)

# Sanity-check the array shapes.
print(f"Training features: {X.shape}")
print(f"Training labels: {y.shape}")
print(f"Test features: {X_test.shape}")
print(f"Test labels: {y_test.shape}")

# Standardise the features: the scaling statistics are learned on the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Each classifier is configured and fitted on the scaled training data in one
# step (`fit` returns the estimator itself).

# k-nearest neighbours with k = 5.
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# Multi-layer perceptron with two hidden layers (512 and 256 units).
model_mlp = MLPClassifier(
    hidden_layer_sizes=(512, 256),
    activation="relu",
    solver="adam",
    max_iter=5000,
    random_state=42,
    learning_rate_init=0.001,
    early_stopping=True,
).fit(X, y)

# Random forest of 100 trees, each limited to depth 10.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X, y)

# Show the fitted random forest as the cell output.
model_rf

Confusion Matrix#

A confusion matrix is a table used to evaluate the performance of a classification model by comparing its predicted labels to the actual labels, showing how many predictions were correct and where errors occurred. Each row typically represents the actual class, and each column the predicted class, with the diagonal cells indicating correct predictions and off-diagonal cells showing misclassifications. This visualization helps identify not just overall accuracy but also specific types of errors, such as false positives and false negatives, enabling deeper analysis and improvement of the model.

Confusion Matrix
# Maps each numeric class label (0-9) to its class name; the position in the
# list below is the label.
class_dict = dict(
    enumerate(
        [
            "air_conditioner",
            "car_horn",
            "children_playing",
            "dog_bark",
            "drilling",
            "engine_idling",
            "gun_shot",
            "jackhammer",
            "siren",
            "street_music",
        ]
    )
)

def evaluate_model(model, X_test, y_test):
    """Print a classification report and plot the confusion matrix of a fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True labels belonging to ``X_test``.

    Note:
        Reads the notebook-level ``class_dict``. A later cell defines another
        ``evaluate_model`` (for the CNN) under the same name; re-run this cell
        before evaluating scikit-learn models again after that point.
    """
    y_pred = model.predict(X_test)

    # Pin the label set to the keys of class_dict. Without it scikit-learn
    # derives the classes from the data, so a class missing from both y_test
    # and y_pred would shrink the matrix and misalign it with the tick labels.
    class_ids = list(class_dict.keys())
    class_names = [class_dict[class_id] for class_id in class_ids]

    print(
        classification_report(
            y_test, y_pred, labels=class_ids, target_names=class_names
        )
    )

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Rows are true classes, columns are predicted classes.
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()
# Evaluate the KNN classifier on the test split. Uncomment one of the lines
# below to evaluate the MLP or the random forest in the same way.
evaluate_model(model_knn, X_test, y_test)
# evaluate_model(model_mlp, X_test, y_test)
# evaluate_model(model_rf, X_test, y_test)

UMAP#

Unsupervised method

import umap
import datamapplot

# Project the selected feature vectors of all clips down to two dimensions.
umap_settings = {
    "random_state": 42,
    "n_neighbors": 5,
    "min_dist": 0.5,
    "n_components": 2,
    "verbose": True,
}
reducer = umap.UMAP(**umap_settings)
embedding = reducer.fit_transform(audio_dict["feature"])

# Interactive scatter plot; the class names are passed both as labels and as
# hover text.
clip_classes = audio_dict["class"]
plot = datamapplot.create_interactive_plot(
    embedding, clip_classes, hover_text=clip_classes
)
plot

CNN#

# Folds 1-8 are used for training, folds 9 and 10 for testing.
fold_column = metadata["fold"]
train_metadata = metadata[fold_column.isin(range(1, 9))]
test_metadata = metadata[fold_column.isin([9, 10])]

print(f"Training set: {len(train_metadata)} examples")
print(f"Test set: {len(test_metadata)} examples")

# Show the training rows as the cell output.
train_metadata
from model_utils import SimpleCNN, train_model
from data_utils import AudioDataset, SpectrogramAugmentation
from config import (
    BATCH_SIZE,
    NUMBER_WORKERS,
    EPOCHS,
    LEARNING_RATE,
    EARLY_STOPPING_PATIENCE,
    SCHEDULER_STEP_SIZE,
    SCHEDULER_GAMMA,
    NUM_CLASSES,
)

# Only the training dataset gets the augmentation transform; the test dataset
# is created without any transform.
augmentation = SpectrogramAugmentation()
train_dataset = AudioDataset(train_metadata, data_path, transform=augmentation)
test_dataset = AudioDataset(test_metadata, data_path, transform=None)

# Options shared by both loaders; only the training loader shuffles.
loader_options = {"batch_size": BATCH_SIZE, "num_workers": NUMBER_WORKERS}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)
# Instantiate the CNN and move it to the selected device.
simple_cnn = SimpleCNN().to(device)
simple_cnn  # bare expression: shows the model's repr when it ends a notebook cell
# Loss function, optimizer and learning-rate schedule for training.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(simple_cnn.parameters(), lr=LEARNING_RATE)
# NOTE(review): `scheduler` is not passed to train_model anywhere in this
# notebook, so the step decay only takes effect if something else calls
# scheduler.step() — confirm whether this is intended.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)
# Train the CNN. `patience` is the number of epochs to wait for improvement.
training_outputs = train_model(
    simple_cnn,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    device,
    num_epochs=EPOCHS,
    patience=EARLY_STOPPING_PATIENCE,
)

# train_model hands back the model followed by the four per-epoch histories.
(
    model,
    train_losses,
    test_losses,
    train_accuracies,
    test_accuracies,
) = training_outputs
# Side-by-side training curves: loss on the left, accuracy on the right.
# Each entry: (train series, train label, test series, test label, y label, title).
curve_panels = [
    (
        train_losses,
        "Train Loss",
        test_losses,
        "Test Loss",
        "Loss",
        "Training and Test Loss",
    ),
    (
        train_accuracies,
        "Train Accuracy",
        test_accuracies,
        "Test Accuracy",
        "Accuracy (%)",
        "Training and Test Accuracy",
    ),
]

plt.figure(figsize=(12, 5))
for position, panel in enumerate(curve_panels, start=1):
    train_series, train_label, test_series, test_label, y_label, heading = panel

    plt.subplot(1, 2, position)
    plt.plot(train_series, label=train_label)
    plt.plot(test_series, label=test_label)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()
    plt.title(heading)

plt.tight_layout()
plt.show()
def evaluate_model(model, test_loader, num_classes=NUM_CLASSES):
    """Evaluate a PyTorch classifier on a data loader.

    Runs the model over every batch without gradient tracking, then prints a
    classification report and plots the confusion matrix.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        num_classes: Not used by this function; kept so existing calls that
            pass it keep working.

    Note:
        Reads the notebook-level ``device`` and ``class_dict``.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating"):
            # Only the inputs need to be on the model's device; the labels
            # are just collected for the metrics.
            outputs = model(inputs.to(device))
            # Predicted class = index of the largest output along dim 1.
            _, predicted = outputs.max(1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Pin the label set to the keys of class_dict. Passing target_names
    # without labels makes classification_report fail when a class is absent
    # from the data, and an unpinned confusion matrix would no longer line up
    # with the tick labels.
    class_ids = list(class_dict.keys())
    class_names = [class_dict[class_id] for class_id in class_ids]

    print("Classification Report:")
    print(
        classification_report(
            all_labels, all_preds, labels=class_ids, target_names=class_names
        )
    )

    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

    # Rows are true classes, columns are predicted classes.
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()
# Report test-set metrics for the CNN.
# NOTE(review): this evaluates `simple_cnn`, while the feature extraction that
# follows uses `model` as returned by train_model — confirm both refer to the
# intended weights.
print("Simple CNN:")
evaluate_model(simple_cnn, test_loader)
# Collect the output of the CNN's feature extractor for every training batch.
# NOTE: train_loader shuffles and its dataset applies SpectrogramAugmentation,
# so the features come from augmented clips in random order; the labels are
# gathered in the same loop and therefore stay aligned with the features.
model.eval()
all_outputs = []
all_labels = []

# Inference only: skip autograd bookkeeping so no graph is built per batch.
with torch.no_grad():
    for inputs, labels in tqdm(train_loader):
        outputs = model.feature_extractor(inputs.to(device))
        all_outputs.append(outputs.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())

# Stack the per-batch arrays into one feature array and one label vector.
all_outputs = np.vstack(all_outputs)
all_labels = np.concatenate(all_labels)

# Class name for every collected label.
class_names_labels = [class_dict[label] for label in all_labels]
import umap

# Project the CNN feature-extractor outputs down to two dimensions.
cnn_umap_settings = {
    "random_state": 42,
    "n_neighbors": 5,
    "min_dist": 0.5,
    "n_components": 2,
    "verbose": True,
}
reducer = umap.UMAP(**cnn_umap_settings)
embedding_post_training = reducer.fit_transform(all_outputs)

# Interactive scatter plot; the class names are passed both as labels and as
# hover text.
plot = datamapplot.create_interactive_plot(
    embedding_post_training, class_names_labels, hover_text=class_names_labels
)
plot