.. DO NOT EDIT. .. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. .. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: .. "auto_ch6\ch6_housing.py" .. LINE NUMBERS ARE GIVEN BELOW. .. only:: html .. note:: :class: sphx-glr-download-link-note :ref:`Go to the end ` to download the full example code. .. rst-class:: sphx-glr-example-title .. _sphx_glr_auto_ch6_ch6_housing.py: ========================================================================= 6.1 Supervised learning: reproducibility illustration with housing prices ========================================================================= We show how to reproduce the results of the chapter 6.2.1 - Application to supervised machine learning - Regression problem: housing price prediction of the book. We will compare the codpy model with other standard regression methods. The goal is to show the codpy model capacity to fit the training data. .. GENERATED FROM PYTHON SOURCE LINES 12-15 Necessary Imports ------------------------ ######################################################################## .. GENERATED FROM PYTHON SOURCE LINES 15-47 .. code-block:: Python import sys import os import time from pathlib import Path try: base_path = Path(__file__).parent.parent except NameError: base_path = Path.cwd().parent sys.path.append(str(base_path)) try: CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) except NameError: CURRENT_DIR = os.getcwd() data_path = os.path.join(CURRENT_DIR, "data") PARENT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, "..")) sys.path.insert(0, PARENT_DIR) import matplotlib.pyplot as plt import numpy as np import torch import torch.nn as nn from utils.ch9.path_generation import California_data_generator from sklearn.ensemble import RandomForestRegressor from sklearn.preprocessing import StandardScaler from codpy.kernel import Kernel from codpy.plot_utils import multi_plot .. GENERATED FROM PYTHON SOURCE LINES 48-53 Regression models ------------------------ In the following methods we define the regression models to be used for the comparison. Each model gets evaluated on the training data based on the Mean Squared Error (MSE) loss. We also compute the Maximum Mean Discrepancy (MMD) for the codpy model, used to showcase the direct inverse relationship between MMD and Score. .. GENERATED FROM PYTHON SOURCE LINES 55-59 To use CodPy for regression, we instanciate a Kernel object and pass x as the training data and fx as the target values. Calling the kernel with z will return the predictions, similar to how we would use predict(). .. GENERATED FROM PYTHON SOURCE LINES 59-201 .. code-block:: Python def codpy_model(x, fx, z, fz): kernel = Kernel( x=x, fx=fx, ) results = kernel(z) eval_loss = np.mean((results - fz) ** 2) mmd = np.sqrt(kernel.discrepancy(z)) return eval_loss, mmd # Different other standard models are defined below. # The user can add these models to the lists of models to be evaluated. def torch_model(x, fx, z, fz): input_scaler = StandardScaler() x = input_scaler.fit_transform(x) z = input_scaler.transform(z) target_scaler = StandardScaler() fx = target_scaler.fit_transform(fx.reshape(-1, 1)).reshape(-1) fz_scaled = target_scaler.transform(fz.reshape(-1, 1)).reshape(-1) x = torch.tensor(x, dtype=torch.float32) fx = torch.tensor(fx, dtype=torch.float32) z = torch.tensor(z, dtype=torch.float32) fz = torch.tensor(fz_scaled, dtype=torch.float32) class FFN(nn.Module): def __init__(self, input_dim): super().__init__() self.net = nn.Sequential( nn.Linear(input_dim, 64), nn.ReLU(), nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1), ) def forward(self, x): return self.net(x) model = FFN(x.shape[1]) optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3) loss_fn = nn.MSELoss() n_samples = x.shape[0] batch_size = max(n_samples // 4, 32) start_time = time.perf_counter() model.train() for epoch in range(128): indices = torch.randperm(n_samples) for start in range(0, n_samples, batch_size): end = min(start + batch_size, n_samples) batch_idx = indices[start:end] x_batch = x[batch_idx] fx_batch = fx[batch_idx] pred = model(x_batch).squeeze() loss = loss_fn(pred, fx_batch) optimizer.zero_grad() loss.backward() optimizer.step() model.eval() with torch.no_grad(): pred_scaled = model(z).squeeze().numpy() pred = target_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).reshape(-1) true = target_scaler.inverse_transform(fz.numpy().reshape(-1, 1)).reshape(-1) eval_loss = np.mean((pred - true) ** 2) return eval_loss, None def random_forest_model(x, fx, z, fz): x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel() model = RandomForestRegressor(n_estimators=100, random_state=42) model.fit(x, fx) results = model.predict(z) eval_loss = np.mean((results - fz) ** 2) return eval_loss, None def adaboost_model(x, fx, z, fz): x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel() from sklearn.ensemble import AdaBoostRegressor model = AdaBoostRegressor(n_estimators=50, learning_rate=1) model.fit(x, fx) results = model.predict(z) eval_loss = np.mean((results - fz) ** 2) return eval_loss, None def xgboost_model(x, fx, z, fz): x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel() import xgboost as xgb model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1) model.fit(x, fx) results = model.predict(z) eval_loss = np.mean((results - fz) ** 2) return eval_loss, None def svr_model(x, fx, z, fz): x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel() from sklearn.svm import SVR model = SVR(kernel="rbf", C=1.0, epsilon=0.1) model.fit(x, fx) results = model.predict(z) eval_loss = np.mean((results - fz) ** 2) return eval_loss, None def gradient_boosting_model(x, fx, z, fz): x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel() from sklearn.ensemble import GradientBoostingRegressor model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1) model.fit(x, fx) results = model.predict(z) eval_loss = np.mean((results - fz) ** 2) return eval_loss, None def decision_tree_model(x, fx, z, fz): x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel() from sklearn.tree import DecisionTreeRegressor model = DecisionTreeRegressor(max_depth=5) model.fit(x, fx) results = model.predict(z) eval_loss = np.mean((results - fz) ** 2) return eval_loss, None .. GENERATED FROM PYTHON SOURCE LINES 202-207 Gathering Results ------------------------ Here we benchmark 3 of the models defined above. We use the California housing dataset, and train the models on different subsets of the training dataset. We always evaluate the models on the entire training set. Therefore, the last training procedure trains and evaluate on the exact same data. .. GENERATED FROM PYTHON SOURCE LINES 207-253 .. code-block:: Python results = {} times = {} mmds = {} models = [torch_model, random_forest_model, codpy_model] model_aliases = { "torch_model": "FFN", "codpy_model": "KRR", "random_forest_model": "RF", } data_generator_ = California_data_generator() X, FX, X, FX, Z, FZ = data_generator_.get_data(-1, -1, -1, -1) torch.manual_seed(0) idxs = torch.randperm(len(X)) SIZE = 512 X, FX, Z, FZ = ( X.values[idxs][:SIZE], FX.values[idxs][:SIZE], Z.values[idxs][:SIZE], FZ.values[idxs][:SIZE], ) length_ = len(X) # Different slices of the data to be used for x different training & evaluation procedures scenarios_list = [ (int(i), int(i), -1, -1) for i in np.arange(16, length_ + 1, (length_ - 16) / 10) ] for scenario in scenarios_list: Nx, Nfx, Nz, Nfz = scenario x, fx, z, fz = X[:Nx], FX[:Nfx], Z[:Nz], FZ[:Nfz] results[len(x)] = {} times[len(x)] = {} mmds[len(x)] = {} for model in models: start_time = time.perf_counter() loss, mmd = model(x, fx, z, fz) mmds[len(x)][model.__name__] = mmd results[len(x)][model.__name__] = loss end_time = time.perf_counter() times[len(x)][model.__name__] = end_time - start_time print( f"Model: {model.__name__}, Time taken: {times[len(x)][model.__name__]:.4f} seconds" ) res = [{"data": results}, {"data": mmds}, {"data": times}] .. rst-class:: sphx-glr-script-out .. code-block:: none Model: torch_model, Time taken: 0.0842 seconds Model: random_forest_model, Time taken: 0.0352 seconds Model: codpy_model, Time taken: 0.0023 seconds Model: torch_model, Time taken: 0.2311 seconds Model: random_forest_model, Time taken: 0.0439 seconds Model: codpy_model, Time taken: 0.0033 seconds Model: torch_model, Time taken: 0.3783 seconds Model: random_forest_model, Time taken: 0.0567 seconds Model: codpy_model, Time taken: 0.0040 seconds Model: torch_model, Time taken: 0.4160 seconds Model: random_forest_model, Time taken: 0.0688 seconds Model: codpy_model, Time taken: 0.0050 seconds Model: torch_model, Time taken: 0.4961 seconds Model: random_forest_model, Time taken: 0.0721 seconds Model: codpy_model, Time taken: 0.0060 seconds Model: torch_model, Time taken: 0.4183 seconds Model: random_forest_model, Time taken: 0.0785 seconds Model: codpy_model, Time taken: 0.0079 seconds Model: torch_model, Time taken: 0.5380 seconds Model: random_forest_model, Time taken: 0.0938 seconds Model: codpy_model, Time taken: 0.0112 seconds Model: torch_model, Time taken: 0.5278 seconds Model: random_forest_model, Time taken: 0.1019 seconds Model: codpy_model, Time taken: 0.0164 seconds Model: torch_model, Time taken: 0.4612 seconds Model: random_forest_model, Time taken: 0.1089 seconds Model: codpy_model, Time taken: 0.0176 seconds Model: torch_model, Time taken: 0.5876 seconds Model: random_forest_model, Time taken: 0.1306 seconds Model: codpy_model, Time taken: 0.0211 seconds Model: torch_model, Time taken: 0.5588 seconds Model: random_forest_model, Time taken: 0.1346 seconds Model: codpy_model, Time taken: 0.0268 seconds .. GENERATED FROM PYTHON SOURCE LINES 254-256 Plotting ------------------------ .. GENERATED FROM PYTHON SOURCE LINES 256-291 .. code-block:: Python def plot_one(inputs): results = inputs["data"] ax = inputs["ax"] legend = inputs["legend"] model_aliases = { "torch_model": "FFN", "codpy_model": "KRR", "random_forest_model": "RF", } x_vals = sorted(results.keys()) for model_name in next(iter(results.values())).keys(): y_vals = [results[x][model_name] for x in x_vals] label = model_aliases.get(model_name, model_name) ax.plot(x_vals, y_vals, marker="o", label=label) ax.set_xlabel("Number of Training Examples") ax.set_ylabel(legend) ax.legend() ax.grid(True) return ax multi_plot( res, plot_one, mp_nrows=1, mp_ncols=3, legends=["MSE", "MMD", "Times"], mp_figsize=(14, 10), ) plt.show() pass .. image-sg:: /auto_ch6/images/sphx_glr_ch6_housing_001.png :alt: ch6 housing :srcset: /auto_ch6/images/sphx_glr_ch6_housing_001.png :class: sphx-glr-single-img .. rst-class:: sphx-glr-timing **Total running time of the script:** (0 minutes 6.025 seconds) .. _sphx_glr_download_auto_ch6_ch6_housing.py: .. only:: html .. container:: sphx-glr-footer sphx-glr-footer-example .. container:: sphx-glr-download sphx-glr-download-jupyter :download:`Download Jupyter notebook: ch6_housing.ipynb ` .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: ch6_housing.py ` .. container:: sphx-glr-download sphx-glr-download-zip :download:`Download zipped: ch6_housing.zip ` .. only:: html .. rst-class:: sphx-glr-signature `Gallery generated by Sphinx-Gallery `_