.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_ch6\ch6_housing.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_ch6_ch6_housing.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_ch6_ch6_housing.py:


=========================================================================
6.1 Supervised learning: reproducibility illustration with housing prices
=========================================================================

We show how to reproduce the results of the chapter 6.2.1 - Application to supervised machine learning - Regression problem: housing price prediction of the book.
We will compare the codpy model with other standard regression methods.
The goal is to show the codpy model capacity to fit the training data.

.. GENERATED FROM PYTHON SOURCE LINES 12-15

Necessary Imports
 ------------------------
########################################################################

.. GENERATED FROM PYTHON SOURCE LINES 15-47

.. code-block:: Python


    import sys
    import os 
    import time
    from pathlib import Path

    try:
        base_path = Path(__file__).parent.parent
    except NameError:
        base_path = Path.cwd().parent

    sys.path.append(str(base_path))

    try:
        CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        CURRENT_DIR = os.getcwd()
    data_path = os.path.join(CURRENT_DIR, "data")
    PARENT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))
    sys.path.insert(0, PARENT_DIR)

    import matplotlib.pyplot as plt
    import numpy as np
    import torch
    import torch.nn as nn
    from utils.ch9.path_generation import California_data_generator
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.preprocessing import StandardScaler

    from codpy.kernel import Kernel
    from codpy.plot_utils import multi_plot


.. GENERATED FROM PYTHON SOURCE LINES 48-53

Regression models
------------------------
In the following methods we define the regression models to be used for the comparison.
Each model gets evaluated on the training data based on the Mean Squared Error (MSE) loss.
We also compute the Maximum Mean Discrepancy (MMD) for the codpy model, used to showcase the direct inverse relationship between MMD and Score.

.. GENERATED FROM PYTHON SOURCE LINES 55-59

To use CodPy for regression, we instanciate a Kernel object
and pass x as the training data and fx as the target values.
Calling the kernel with z will return the predictions,
similar to how we would use predict().

.. GENERATED FROM PYTHON SOURCE LINES 59-201

.. code-block:: Python

    def codpy_model(x, fx, z, fz):
        kernel = Kernel(
            x=x,
            fx=fx,
        )
        results = kernel(z)
        eval_loss = np.mean((results - fz) ** 2)
        mmd = np.sqrt(kernel.discrepancy(z))
        return eval_loss, mmd


    # Different other standard models are defined below.
    # The user can add these models to the lists of models to be evaluated.
    def torch_model(x, fx, z, fz):
        input_scaler = StandardScaler()
        x = input_scaler.fit_transform(x)
        z = input_scaler.transform(z)

        target_scaler = StandardScaler()
        fx = target_scaler.fit_transform(fx.reshape(-1, 1)).reshape(-1)
        fz_scaled = target_scaler.transform(fz.reshape(-1, 1)).reshape(-1)

        x = torch.tensor(x, dtype=torch.float32)
        fx = torch.tensor(fx, dtype=torch.float32)
        z = torch.tensor(z, dtype=torch.float32)
        fz = torch.tensor(fz_scaled, dtype=torch.float32)

        class FFN(nn.Module):
            def __init__(self, input_dim):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(input_dim, 64),
                    nn.ReLU(),
                    nn.Linear(64, 128),
                    nn.ReLU(),
                    nn.Linear(128, 64),
                    nn.ReLU(),
                    nn.Linear(64, 1),
                )

            def forward(self, x):
                return self.net(x)

        model = FFN(x.shape[1])
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)
        loss_fn = nn.MSELoss()

        n_samples = x.shape[0]
        batch_size = max(n_samples // 4, 32)

        start_time = time.perf_counter()

        model.train()
        for epoch in range(128):
            indices = torch.randperm(n_samples)
            for start in range(0, n_samples, batch_size):
                end = min(start + batch_size, n_samples)
                batch_idx = indices[start:end]
                x_batch = x[batch_idx]
                fx_batch = fx[batch_idx]

                pred = model(x_batch).squeeze()
                loss = loss_fn(pred, fx_batch)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            pred_scaled = model(z).squeeze().numpy()
            pred = target_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).reshape(-1)
            true = target_scaler.inverse_transform(fz.numpy().reshape(-1, 1)).reshape(-1)
            eval_loss = np.mean((pred - true) ** 2)

        return eval_loss, None


    def random_forest_model(x, fx, z, fz):
        x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(x, fx)
        results = model.predict(z)
        eval_loss = np.mean((results - fz) ** 2)
        return eval_loss, None


    def adaboost_model(x, fx, z, fz):
        x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
        from sklearn.ensemble import AdaBoostRegressor

        model = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        model.fit(x, fx)
        results = model.predict(z)
        eval_loss = np.mean((results - fz) ** 2)
        return eval_loss, None


    def xgboost_model(x, fx, z, fz):
        x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
        import xgboost as xgb

        model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
        model.fit(x, fx)
        results = model.predict(z)
        eval_loss = np.mean((results - fz) ** 2)
        return eval_loss, None


    def svr_model(x, fx, z, fz):
        x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
        from sklearn.svm import SVR

        model = SVR(kernel="rbf", C=1.0, epsilon=0.1)
        model.fit(x, fx)
        results = model.predict(z)
        eval_loss = np.mean((results - fz) ** 2)
        return eval_loss, None


    def gradient_boosting_model(x, fx, z, fz):
        x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
        from sklearn.ensemble import GradientBoostingRegressor

        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
        model.fit(x, fx)
        results = model.predict(z)
        eval_loss = np.mean((results - fz) ** 2)
        return eval_loss, None


    def decision_tree_model(x, fx, z, fz):
        x, fx, z, fz = np.array(x), np.array(fx).ravel(), np.array(z), np.array(fz).ravel()
        from sklearn.tree import DecisionTreeRegressor

        model = DecisionTreeRegressor(max_depth=5)
        model.fit(x, fx)
        results = model.predict(z)
        eval_loss = np.mean((results - fz) ** 2)
        return eval_loss, None


.. GENERATED FROM PYTHON SOURCE LINES 202-207

Gathering Results
------------------------
Here we benchmark 3 of the models defined above.
We use the California housing dataset, and train the models on different subsets of the training dataset.
We always evaluate the models on the entire training set. Therefore, the last training procedure trains and evaluate on the exact same data.

.. GENERATED FROM PYTHON SOURCE LINES 207-253

.. code-block:: Python


    results = {}
    times = {}
    mmds = {}
    models = [torch_model, random_forest_model, codpy_model]
    model_aliases = {
        "torch_model": "FFN",
        "codpy_model": "KRR",
        "random_forest_model": "RF",
    }
    data_generator_ = California_data_generator()
    X, FX, X, FX, Z, FZ = data_generator_.get_data(-1, -1, -1, -1)
    torch.manual_seed(0)
    idxs = torch.randperm(len(X))
    SIZE = 512
    X, FX, Z, FZ = (
        X.values[idxs][:SIZE],
        FX.values[idxs][:SIZE],
        Z.values[idxs][:SIZE],
        FZ.values[idxs][:SIZE],
    )
    length_ = len(X)
    # Different slices of the data to be used for x different training & evaluation procedures
    scenarios_list = [
        (int(i), int(i), -1, -1) for i in np.arange(16, length_ + 1, (length_ - 16) / 10)
    ]
    for scenario in scenarios_list:
        Nx, Nfx, Nz, Nfz = scenario
        x, fx, z, fz = X[:Nx], FX[:Nfx], Z[:Nz], FZ[:Nfz]
        results[len(x)] = {}
        times[len(x)] = {}
        mmds[len(x)] = {}
        for model in models:
            start_time = time.perf_counter()
            loss, mmd = model(x, fx, z, fz)
            mmds[len(x)][model.__name__] = mmd
            results[len(x)][model.__name__] = loss
            end_time = time.perf_counter()
            times[len(x)][model.__name__] = end_time - start_time
            print(
                f"Model: {model.__name__}, Time taken: {times[len(x)][model.__name__]:.4f} seconds"
            )
    res = [{"data": results}, {"data": mmds}, {"data": times}]


.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    Model: torch_model, Time taken: 0.0842 seconds
    Model: random_forest_model, Time taken: 0.0352 seconds
    Model: codpy_model, Time taken: 0.0023 seconds
    Model: torch_model, Time taken: 0.2311 seconds
    Model: random_forest_model, Time taken: 0.0439 seconds
    Model: codpy_model, Time taken: 0.0033 seconds
    Model: torch_model, Time taken: 0.3783 seconds
    Model: random_forest_model, Time taken: 0.0567 seconds
    Model: codpy_model, Time taken: 0.0040 seconds
    Model: torch_model, Time taken: 0.4160 seconds
    Model: random_forest_model, Time taken: 0.0688 seconds
    Model: codpy_model, Time taken: 0.0050 seconds
    Model: torch_model, Time taken: 0.4961 seconds
    Model: random_forest_model, Time taken: 0.0721 seconds
    Model: codpy_model, Time taken: 0.0060 seconds
    Model: torch_model, Time taken: 0.4183 seconds
    Model: random_forest_model, Time taken: 0.0785 seconds
    Model: codpy_model, Time taken: 0.0079 seconds
    Model: torch_model, Time taken: 0.5380 seconds
    Model: random_forest_model, Time taken: 0.0938 seconds
    Model: codpy_model, Time taken: 0.0112 seconds
    Model: torch_model, Time taken: 0.5278 seconds
    Model: random_forest_model, Time taken: 0.1019 seconds
    Model: codpy_model, Time taken: 0.0164 seconds
    Model: torch_model, Time taken: 0.4612 seconds
    Model: random_forest_model, Time taken: 0.1089 seconds
    Model: codpy_model, Time taken: 0.0176 seconds
    Model: torch_model, Time taken: 0.5876 seconds
    Model: random_forest_model, Time taken: 0.1306 seconds
    Model: codpy_model, Time taken: 0.0211 seconds
    Model: torch_model, Time taken: 0.5588 seconds
    Model: random_forest_model, Time taken: 0.1346 seconds
    Model: codpy_model, Time taken: 0.0268 seconds


.. GENERATED FROM PYTHON SOURCE LINES 254-256

Plotting
------------------------

.. GENERATED FROM PYTHON SOURCE LINES 256-291

.. code-block:: Python

    def plot_one(inputs):
        results = inputs["data"]
        ax = inputs["ax"]
        legend = inputs["legend"]

        model_aliases = {
            "torch_model": "FFN",
            "codpy_model": "KRR",
            "random_forest_model": "RF",
        }

        x_vals = sorted(results.keys())
        for model_name in next(iter(results.values())).keys():
            y_vals = [results[x][model_name] for x in x_vals]
            label = model_aliases.get(model_name, model_name)
            ax.plot(x_vals, y_vals, marker="o", label=label)

        ax.set_xlabel("Number of Training Examples")
        ax.set_ylabel(legend)
        ax.legend()
        ax.grid(True)

        return ax


    multi_plot(
        res,
        plot_one,
        mp_nrows=1,
        mp_ncols=3,
        legends=["MSE", "MMD", "Times"],
        mp_figsize=(14, 10),
    )
    plt.show()
    pass


.. image-sg:: /auto_ch6/images/sphx_glr_ch6_housing_001.png
   :alt: ch6 housing
   :srcset: /auto_ch6/images/sphx_glr_ch6_housing_001.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 6.025 seconds)


.. _sphx_glr_download_auto_ch6_ch6_housing.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: ch6_housing.ipynb <ch6_housing.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: ch6_housing.py <ch6_housing.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: ch6_housing.zip <ch6_housing.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_