6.1 Supervised learning: reproducibility illustration with housing prices

We show how to reproduce the results of the chapter 6.2.1 - Application to supervised machine learning - Regression problem: housing price prediction of the book. We will compare the codpy model with other standard regression methods. The goal is to show the codpy model capacity to fit the training data.

Necessary Imports

import sys
import os
import time
from pathlib import Path

try:
    base_path = Path(__file__).parent.parent
except NameError:
    base_path = Path.cwd().parent

sys.path.append(str(base_path))

try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()
data_path = os.path.join(CURRENT_DIR, "data")
PARENT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))
sys.path.insert(0, PARENT_DIR)

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from utils.ch9.path_generation import California_data_generator
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

from codpy.kernel import Kernel
from codpy.plot_utils import multi_plot

Regression models

In the following methods we define the regression models to be used for the comparison. Each model gets evaluated on the training data based on the Mean Squared Error (MSE) loss. We also compute the Maximum Mean Discrepancy (MMD) for the codpy model, used to showcase the direct inverse relationship between MMD and Score.

To use CodPy for regression, we instanciate a Kernel object and pass x as the training data and fx as the target values. Calling the kernel with z will return the predictions, similar to how we would use predict().

def codpy_model(x, fx, z, fz):
    kernel = Kernel(
        x=x,
        fx=fx,
    )
    results = kernel(z)
    eval_loss = np.mean((results - fz) ** 2)
    mmd = np.sqrt(kernel.discrepancy(z))
    return eval_loss, mmd


# Different other standard models are defined below.
# The user can add these models to the lists of models to be evaluated.
def torch_model(x, fx, z, fz):
    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    z = input_scaler.transform(z)

    target_scaler = StandardScaler()
    fx = target_scaler.fit_transform(fx.reshape(-1, 1)).reshape(-1)
    fz_scaled = target_scaler.transform(fz.reshape(-1, 1)).reshape(-1)

    x = torch.tensor(x, dtype=torch.float32)
    fx = torch.tensor(fx, dtype=torch.float32)
    z = torch.tensor(z, dtype=torch.float32)
    fz = torch.tensor(fz_scaled, dtype=torch.float32)

    class FFN(nn.Module):
        def __init__(self, input_dim):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(input_dim, 64),
                nn.ReLU(),
                nn.Linear(64, 128),
                nn.ReLU(),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, 1),
            )

        def forward(self, x):
            return self.net(x)

    model = FFN(x.shape[1])
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)
    loss_fn = nn.MSELoss()

    n_samples = x.shape[0]
    batch_size = max(n_samples // 4, 32)

    start_time = time.perf_counter()

    model.train()
    for epoch in range(128):
        indices = torch.randperm(n_samples)
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_idx = indices[start:end]
            x_batch = x[batch_idx]
            fx_batch = fx[batch_idx]

            pred = model(x_batch).squeeze()
            loss = loss_fn(pred, fx_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        pred_scaled = model(z).squeeze().numpy()
        pred = target_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).reshape(-1)
        true = target_scaler.inverse_transform(fz.numpy().reshape(-1, 1)).reshape(-1)
        eval_loss = np.mean((pred - true) ** 2)

    return eval_loss, None


def random_forest_model(x, fx, z, fz):
    x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(x, fx)
    results = model.predict(z)
    eval_loss = np.mean((results - fz) ** 2)
    return eval_loss, None


def adaboost_model(x, fx, z, fz):
    x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
    from sklearn.ensemble import AdaBoostRegressor

    model = AdaBoostRegressor(n_estimators=50, learning_rate=1)
    model.fit(x, fx)
    results = model.predict(z)
    eval_loss = np.mean((results - fz) ** 2)
    return eval_loss, None


def xgboost_model(x, fx, z, fz):
    x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
    import xgboost as xgb

    model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(x, fx)
    results = model.predict(z)
    eval_loss = np.mean((results - fz) ** 2)
    return eval_loss, None


def svr_model(x, fx, z, fz):
    x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
    from sklearn.svm import SVR

    model = SVR(kernel="rbf", C=1.0, epsilon=0.1)
    model.fit(x, fx)
    results = model.predict(z)
    eval_loss = np.mean((results - fz) ** 2)
    return eval_loss, None


def gradient_boosting_model(x, fx, z, fz):
    x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
    from sklearn.ensemble import GradientBoostingRegressor

    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(x, fx)
    results = model.predict(z)
    eval_loss = np.mean((results - fz) ** 2)
    return eval_loss, None


def decision_tree_model(x, fx, z, fz):
    x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
    from sklearn.tree import DecisionTreeRegressor

    model = DecisionTreeRegressor(max_depth=5)
    model.fit(x, fx)
    results = model.predict(z)
    eval_loss = np.mean((results - fz) ** 2)
    return eval_loss, None

Gathering Results

Here we benchmark 3 of the models defined above. We use the California housing dataset, and train the models on different subsets of the training dataset. We always evaluate the models on the entire training set. Therefore, the last training procedure trains and evaluate on the exact same data.

results = {}
times = {}
mmds = {}
models = [torch_model, random_forest_model, codpy_model]
model_aliases = {
    "torch_model": "FFN",
    "codpy_model": "KRR",
    "random_forest_model": "RF",
}
data_generator_ = California_data_generator()
X, FX, X, FX, Z, FZ = data_generator_.get_data(-1, -1, -1, -1)
torch.manual_seed(0)
idxs = torch.randperm(len(X))
SIZE = 512
X, FX, Z, FZ = (
    X.values[idxs][:SIZE],
    FX.values[idxs][:SIZE],
    Z.values[idxs][:SIZE],
    FZ.values[idxs][:SIZE],
)
length_ = len(X)
# Different slices of the data to be used for x different training & evaluation procedures
scenarios_list = [
    (int(i), int(i), -1, -1) for i in np.arange(16, length_ + 1, (length_ - 16) / 10)
]
for scenario in scenarios_list:
    Nx, Nfx, Nz, Nfz = scenario
    x, fx, z, fz = X[:Nx], FX[:Nfx], Z[:Nz], FZ[:Nfz]
    results[len(x)] = {}
    times[len(x)] = {}
    mmds[len(x)] = {}
    for model in models:
        start_time = time.perf_counter()
        loss, mmd = model(x, fx, z, fz)
        mmds[len(x)][model.__name__] = mmd
        results[len(x)][model.__name__] = loss
        end_time = time.perf_counter()
        times[len(x)][model.__name__] = end_time - start_time
        print(
            f"Model: {model.__name__}, Time taken: {times[len(x)][model.__name__]:.4f} seconds"
        )
res = [{"data": results}, {"data": mmds}, {"data": times}]

Model: torch_model, Time taken: 0.0842 seconds
Model: random_forest_model, Time taken: 0.0352 seconds
Model: codpy_model, Time taken: 0.0023 seconds
Model: torch_model, Time taken: 0.2311 seconds
Model: random_forest_model, Time taken: 0.0439 seconds
Model: codpy_model, Time taken: 0.0033 seconds
Model: torch_model, Time taken: 0.3783 seconds
Model: random_forest_model, Time taken: 0.0567 seconds
Model: codpy_model, Time taken: 0.0040 seconds
Model: torch_model, Time taken: 0.4160 seconds
Model: random_forest_model, Time taken: 0.0688 seconds
Model: codpy_model, Time taken: 0.0050 seconds
Model: torch_model, Time taken: 0.4961 seconds
Model: random_forest_model, Time taken: 0.0721 seconds
Model: codpy_model, Time taken: 0.0060 seconds
Model: torch_model, Time taken: 0.4183 seconds
Model: random_forest_model, Time taken: 0.0785 seconds
Model: codpy_model, Time taken: 0.0079 seconds
Model: torch_model, Time taken: 0.5380 seconds
Model: random_forest_model, Time taken: 0.0938 seconds
Model: codpy_model, Time taken: 0.0112 seconds
Model: torch_model, Time taken: 0.5278 seconds
Model: random_forest_model, Time taken: 0.1019 seconds
Model: codpy_model, Time taken: 0.0164 seconds
Model: torch_model, Time taken: 0.4612 seconds
Model: random_forest_model, Time taken: 0.1089 seconds
Model: codpy_model, Time taken: 0.0176 seconds
Model: torch_model, Time taken: 0.5876 seconds
Model: random_forest_model, Time taken: 0.1306 seconds
Model: codpy_model, Time taken: 0.0211 seconds
Model: torch_model, Time taken: 0.5588 seconds
Model: random_forest_model, Time taken: 0.1346 seconds
Model: codpy_model, Time taken: 0.0268 seconds

Plotting

def plot_one(inputs):
    results = inputs["data"]
    ax = inputs["ax"]
    legend = inputs["legend"]

    model_aliases = {
        "torch_model": "FFN",
        "codpy_model": "KRR",
        "random_forest_model": "RF",
    }

    x_vals = sorted(results.keys())
    for model_name in next(iter(results.values())).keys():
        y_vals = [results[x][model_name] for x in x_vals]
        label = model_aliases.get(model_name, model_name)
        ax.plot(x_vals, y_vals, marker="o", label=label)

    ax.set_xlabel("Number of Training Examples")
    ax.set_ylabel(legend)
    ax.legend()
    ax.grid(True)

    return ax


multi_plot(
    res,
    plot_one,
    mp_nrows=1,
    mp_ncols=3,
    legends=["MSE", "MMD", "Times"],
    mp_figsize=(14, 10),
)
plt.show()
pass

Total running time of the script: (0 minutes 6.025 seconds)

Gallery generated by Sphinx-Gallery