.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_ch6\ch6_mnist.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_ch6_ch6_mnist.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_ch6_ch6_mnist.py:


=========================================================
6.2 Supervised learning: benchmarks of methods with MNIST
=========================================================

Here we reproduce the results of chapter 6.2.2 - Classification problem: handwritten digits.
We will compare different models with codpy for a classification task, scoring models on unseen data.

.. GENERATED FROM PYTHON SOURCE LINES 11-14

Necessary Imports
 ------------------------
########################################################################

.. GENERATED FROM PYTHON SOURCE LINES 14-48

.. code-block:: Python

    import random
    import time

    import matplotlib.pyplot as plt
    import numpy as np
    import scipy
    import scipy.fft
    import scipy.linalg
    from scipy import ndimage

    import torch
    import torch.nn as nn
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LinearRegression
    import torchvision.transforms.functional as tf
    from torch.utils.data import DataLoader, TensorDataset
    from torchvision import datasets, transforms
    from typing import List,Set,Dict,get_type_hints

    from PIL import Image

    import codpy.core as core
    import codpy.lalg as lalg
    from codpy.data_processing import hot_encoder
    from codpy.kernel import KernelClassifier
    from codpy.plot_utils import multi_plot
    from scipy.special import softmax
    from codpydll import *
    from codpy.kernel import Kernel,Sampler
    from codpy.sampling import get_uniforms,get_normals,get_qmc_uniforms,get_qmc_normals
    from codpy.permutation import lsap, map_invertion,Gromov_Monge,dic_invertion
    from codpy.conditioning import ConditionerKernel


.. GENERATED FROM PYTHON SOURCE LINES 49-53

MNIST Data Preparation
------------------------
We normalize pixel values and reshape data for processing.
Pixel data is used as flat vectors, and labels are one-hot encoded.

.. GENERATED FROM PYTHON SOURCE LINES 56-87

.. code-block:: Python

    def get_MNIST_data(N=-1, flatten=True, one_hot=True, seed=43):
        transform = transforms.ToTensor()

        train_data = datasets.MNIST(
            root=".", train=True, download=True, transform=transform
        )
        test_data = datasets.MNIST(
            root=".", train=False, download=True, transform=transform
        )

        x = train_data.data.numpy().astype(np.float32) / 255.0
        fx = train_data.targets.numpy()

        z = test_data.data.numpy().astype(np.float32) / 255.0
        fz = test_data.targets.numpy()

        if flatten:
            x = x.reshape(len(x), -1)
            z = z.reshape(len(z), -1)

        if one_hot:
            fx = np.eye(10)[fx]
            fz = np.eye(10)[fz]
        if N==-1:
            N = x.shape[0]
        random.seed(seed)
        indices = random.sample(range(x.shape[0]), min(N,x.shape[0]))
        x, fx = x[indices], fx[indices]
        
        return x, fx, z, fz


.. GENERATED FROM PYTHON SOURCE LINES 88-92

Classification Models
------------------------
Below we define the 3 classification models which will be used in our experiments.
We use the accuracy loss as a metric along with Maximum Mean Discrepancy (MMD) for the codpy model to showcase the inverse relationship between the score and MMD.

.. GENERATED FROM PYTHON SOURCE LINES 94-483

.. code-block:: Python


    # For classification tasks, use KernelClassifier.
    # This is used the same way as Kernel, but it returns a softmax distribution over classes.
    def RKHS_ridge_regression(x, fx, z, fz):
        kernel = KernelClassifier(
            x=x,
            fx=fx,
            clip=None,  # Clipping is used for non-probability inputs
        )
        preds = kernel(z)
        end_time = time.perf_counter()
        mmd = np.sqrt(kernel.discrepancy(z))
        return preds, fz, mmd, end_time


    class conv_classifier(KernelClassifier):
        def get_kernel(self) -> callable:
            if not hasattr(self, "kernel"):
                cd.set_kernel("tinv_poly_kernel",{"p": str(1)})
                # ones ="1;"*(x.shape[0]-1)+"1"
                # cd.set_kernel("tinv_maternnorm",{"weights" : ones})
                cd.kernel_interface.set_polynomial_order(0)
                cd.kernel_interface.set_regularization(1e-9)
                cd.kernel_interface.set_map("scale_to_unitcube")
                self.kernel = core.KerInterface.get_kernel_ptr()
            return self.kernel
            # pass

    def cut(x,N):
        out = np.array_split(x,N,axis=1)
        out = [np.concatenate([np.ones([o.shape[0],1])*i,o],axis=1) for i,o in enumerate(out)]
        return out

    def codpy_tr_model(x, fx, z, fz,N=28):
    
        kernels = []
        xs = []
        latentxs = []
        kernel = None
        x_splitted = cut(x,N)
        fx_splitted = np.repeat(fx,len(x_splitted),axis=1)
        x_splitted = [np.concatenate([x_split,fx],axis=1) for x_split,fx_split in zip(x_splitted,fx_splitted)]
        count = 0
        values = np.ones_like(fx)
        values /= values.sum(1)[:,None]
        for x_split,fx_split in zip(x_splitted,fx_splitted):
            test=np.linalg.norm(x_split[1:,:N+1])
            if test > 16:
                x_split[:,N+1:] = values
                kernel = KernelClassifier(x=x_split,fx = fx,reg=1e-9)
                values = kernel(kernel.get_x())
                kernels += [kernel]
                count = count + 1
                xs+= [kernel.get_x()]
            else:
                kernels += [None]
            # latentxs+= [kernel(kernel.get_x())]

        z_splitted = cut(z,N)
        values = np.ones_like(fz)
        values /= values.sum(1)[:,None]
        # values = fz
        for kernel,z_split in zip(kernels,z_splitted):
            if kernel is not None:
                z = np.concatenate([z_split,values],axis=1)
                values = kernel(z)
        preds = values
        mmd=0
        end_time = time.perf_counter()
        return preds, fz, mmd, end_time
    def get_weights2D(x,fx,sigma=.4):
        out = np.zeros(x.shape[1])
        for n in range(x.shape[1]):
           j = n//28 -14
           i = n%28 - 14
           out[(i+28*j)%x.shape[1]]+=np.exp(-(i*i+j*j)*sigma)
        #    out[(i+28*j)%x.shape[1]]+=max(1-np.sqrt(i*i+j*j)*sigma,0.)
        out /= out.sum()
        out = scipy.linalg.toeplitz(out)
        return out
    def rotate(x,angle=10):
        images = x.reshape(x.shape[0],28,28)
        out = x.copy()
        for n in range(x.shape[0]):
            image = images[n]
            # plt.imshow(image,cmap='gray')
            image = np.asarray(tf.rotate(Image.fromarray(image),angle))
            # plt.imshow(image,cmap='gray')
            out[n] = image.ravel()
        return out

    def get_weights1D(x,fx,sigma=4.):
        out = np.zeros(x.shape[1])
        for n in range(x.shape[1]):
            i=(n-x.shape[1]//2)
            out[i%x.shape[1]]=np.exp(-i*i*sigma)
        out /= out.sum()
        out = scipy.linalg.toeplitz(out)
        return out


    def RKHS_conv_model(x, fx, z, fz,get_weights=get_weights2D):
        sigma=.5
        weights = get_weights(x,fx,sigma)
        xcs = x@weights
        xrs1=rotate(xcs,-10.)
        xrs2=rotate(xcs,+10.)
        xs=np.concatenate([x,xrs1,xrs2])
        fxs=np.concatenate([fx,fx,fx])   
        classifier_kernel = KernelClassifier(x=xs,fx=fxs,clip=None)
        preds = classifier_kernel(z@weights)
        mmd=0
        end_time = time.perf_counter()
        return preds, fz, mmd, end_time


    def codpy_fft_model(x, fx, z, fz,get_weights=get_weights2D):
        xs=scipy.fft.fft(x,workers=-1)
        # xs = np.real(xs)
        xs = np.concatenate([x,np.real(xs)],axis=1)
        classifier_kernel = KernelClassifier(x=xs,fx=fx,clip=None)
        zs=scipy.fft.fft(z,workers=-1)
        zs = np.concatenate([z,np.real(zs)],axis=1)
        # zs = np.real(zs)
        preds = classifier_kernel(zs)
        mmd=0
        end_time = time.perf_counter()
        return preds, fz, mmd, end_time

    def get_fun(x,fx):
        left = core.KerOp.dnm(fx,fx,distance="norm22")
        right = core.KerOp.dnm(x,x,distance="norm22")
        J1 = (left * right).sum()
        J2 = (left - right)
        J2 = (J2*J2).sum()
        return J1,J2

    def get_J(x,fx,weights=[1,2,4,10,4,2,1]):
        test = get_fun(x,fx)
        out = core.KerOp.dnm(fx,fx,distance="norm22")
        out = out - np.diag(out.sum(1))
        out = out @ x
        out = x.T @ out
        out = np.identity(out.shape[0]) + out / (2.*np.max(np.fabs(out)))
        return out/out.sum(1)[:,None]
    def get_A(x,fx,weights=[1,2,4,10,4,2,1]):
        weights=[np.exp(-n*n*.5) for n in range(-5,5)]
        out = np.zeros(x.shape[1])
        for n,w in enumerate(weights):
           i,j = n-len(weights)//2,0
           out[(i+28*j)%x.shape[1]]+=w
           i,j = 0,n-len(weights)//2
           out[(i+28*j)%x.shape[1]]+=w
        #    i,j = n-len(weights)//2,n-len(weights)//2
        #    out[i+28*j]+=w
        #    i,j = n-len(weights)//2,len(weights)//2 - n
        #    out[i+28*j]+=w
        out = scipy.linalg.toeplitz(out)
        return out/out.sum(1)[:,None]

    def get_A(x,fx):
        # test = get_fun(x,fx)
        dfx = core.KerOp.dnm(fx,fx,distance="norm22")/(fx.shape[1]*fx.shape[1])
        dx = core.KerOp.dnm(x,x,distance="norm22")/(x.shape[1]*x.shape[1])
        D = dfx-dx
        B = -np.ones(x.shape[0])
        B = scipy.linalg.toeplitz(B)
        B *= D
        B -= np.diag(B.sum(1))

        xs = B@x
        out = x.T @ xs
        U,D = lalg.LAlg.self_adjoint_eigen_decomposition(out)
        D =np.array(D)
        D = np.exp(-D)
        out = U@np.diag(D)@U.T
        test = get_fun(x@out,fx)

        return out


    def codpy_convolutional_model(x, fx, z, fz):
        kernel = conv_classifier(
            x=x,
            fx=fx,
            clip=None,  # Clipping is used for non-probability inputs
        )
        preds = kernel(z)
        end_time = time.perf_counter()

        # mmd = np.sqrt(kernel.discrepancy(z))
        mmd=0

        return preds, fz, mmd, end_time

    def random_forest_model(x, fx, z, fz):
        # Convert inputs to numpy arrays and one-hot to labels
        x = np.array(x)
        fx_labels = np.argmax(np.array(fx), axis=1)
        z = np.array(z)
        fz_labels = np.argmax(np.array(fz), axis=1)

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(x, fx_labels)
        results = model.predict(z)

        num_classes = 10  # MNIST has 10 classes (0-9)
        results_oh = np.zeros((len(results), num_classes))
        results_oh[np.arange(len(results)), results] = 1

        # Also convert true labels to one-hot for consistent return
        fz_oh = np.zeros((len(fz_labels), num_classes))
        fz_oh[np.arange(len(fz_labels)), fz_labels] = 1

        return results_oh, fz_oh, None, time.perf_counter()


    def torch_model(x, fx, z, fz):
        x = torch.tensor(x, dtype=torch.float32)
        fx = torch.tensor(fx, dtype=torch.long).argmax(dim=1)
        z = torch.tensor(z, dtype=torch.float32)
        fz = torch.tensor(fz, dtype=torch.long)
        out_shape = fz.shape[1]

        class FFN(nn.Module):
            def __init__(self, input_dim, output_dim):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(input_dim, 256),
                    nn.BatchNorm1d(256),
                    nn.ReLU(),
                    nn.Dropout(0.3),
                    nn.Linear(256, 128),
                    nn.BatchNorm1d(128),
                    nn.ReLU(),
                    nn.Dropout(0.3),
                    nn.Linear(128, 64),
                    nn.BatchNorm1d(64),
                    nn.ReLU(),
                    nn.Dropout(0.3),
                    nn.Linear(64, output_dim),
                )

            def forward(self, x):
                return self.net(x)

        model = FFN(x.shape[1], out_shape)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        loss_fn = nn.CrossEntropyLoss()

        n_samples = x.shape[0]
        batch_size = max(n_samples // 4, 32)

        model.train()
        for epoch in range(30):
            indices = torch.randperm(n_samples)
            for start in range(0, n_samples, batch_size):
                end = min(start + batch_size, n_samples)
                batch_idx = indices[start:end]

                x_batch = x[batch_idx]
                fx_batch = fx[batch_idx]

                pred = model(x_batch)
                loss = loss_fn(pred, fx_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            results = model(z)

        return results.cpu(), fz.cpu(), None, time.perf_counter()


    def Perceptron(x, fx, z, fz):
        x = torch.tensor(x, dtype=torch.float32)
        fx = torch.tensor(fx, dtype=torch.float32).argmax(dim=1)
        z = torch.tensor(z, dtype=torch.float32)
        fz = torch.tensor(fz, dtype=torch.float32)
        out_shape = fz.shape[1]

        class FFN(nn.Module):
            def __init__(self, input_dim, output_dim):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(input_dim, 8),
                    nn.ReLU(),
                    nn.Linear(8, 32),
                    nn.ReLU(),
                    nn.Linear(32, 64),
                    nn.ReLU(),
                    nn.Linear(64, output_dim),
                )

            def forward(self, x):
                return self.net(x)

        model = FFN(x.shape[1], out_shape)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        loss_fn = nn.CrossEntropyLoss()

        n_samples = x.shape[0]
        batch_size = n_samples // 4
        for epoch in range(128):
            # Shuffle data each epoch
            indices = torch.randperm(n_samples)

            for start in range(0, n_samples, batch_size):
                end = min(start + batch_size, n_samples)
                batch_idx = indices[start:end]

                x_batch = x[batch_idx]
                fx_batch = fx[batch_idx]

                pred = model(x_batch)
                loss = loss_fn(pred, fx_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        with torch.no_grad():
            results = model(z)
        return results, fz, None, time.perf_counter()


    def vgg_torch_model(x, fx, z, fz):
        x = torch.tensor(x, dtype=torch.float32).view(-1, 1, 28, 28)
        fx = torch.tensor(fx, dtype=torch.float32).argmax(dim=1).long()

        z = torch.tensor(z, dtype=torch.float32).view(-1, 1, 28, 28)
        fz = torch.tensor(fz, dtype=torch.float32).long()  # .argmax(dim=1).long()
        out_shape = fz.shape[1]

        dataset = TensorDataset(x, fx)
        trainloader = DataLoader(dataset, batch_size=min(128, x.shape[0]), shuffle=True)

        class VGG(nn.Module):
            def __init__(self, input_dim=1, output_dim=10):
                super().__init__()
                self.features = nn.Sequential(
                    nn.Conv2d(
                        in_channels=input_dim, out_channels=32, kernel_size=3, padding=1
                    ),
                    nn.BatchNorm2d(32),
                    nn.ReLU(),
                    nn.Dropout2d(0.2),
                    nn.MaxPool2d(2),
                    nn.Conv2d(32, 64, 3, padding=1),
                    nn.BatchNorm2d(64),
                    nn.ReLU(),
                    nn.Dropout2d(0.3),
                    nn.MaxPool2d(2),
                )

                self.classifier = nn.Sequential(
                    nn.Flatten(),
                    nn.Linear(64 * 7 * 7, 128),
                    nn.ReLU(),
                    nn.Dropout(0.5),
                    nn.Linear(128, output_dim),
                )

            def forward(self, x):
                return self.classifier(self.features(x))

        model = VGG(x.shape[1], out_shape)
        opt = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
        loss_fn = nn.CrossEntropyLoss()

        for epoch in range(30):
            for x_batch, fx_batch in trainloader:
                pred = model(x_batch)
                loss = loss_fn(pred, fx_batch)
                opt.zero_grad()
                loss.backward()
                opt.step()

        with torch.no_grad():
            results = model(z)
        return results, fz, None, time.perf_counter()


.. GENERATED FROM PYTHON SOURCE LINES 484-488

Gathering Results
------------------------
We train the models on different subsets of the training dataset.
We always evaluate the models on the entire test set.

.. GENERATED FROM PYTHON SOURCE LINES 490-491

core.KerInterface.set_verbose()

.. GENERATED FROM PYTHON SOURCE LINES 491-546

.. code-block:: Python


    results = {}
    times = {}
    mmds = {}
    models = [
        RKHS_conv_model,
        RKHS_ridge_regression,
        # torch_model,
        Perceptron,
        # random_forest_model,
        vgg_torch_model,
    ]
    model_aliases = {
        "RKHS_conv_model": "K_CM",
        "RKHS_ridge_regression": "K_RR",
        # "torch_model": "FFN",
        "Perceptron": "NN_PM",
        # "random_forest_model": "RF",
        "vgg_torch_model": "NN_VGG",
    }

    X, FX, Z, FZ = get_MNIST_data()
    # torch.manual_seed(0)
    # idxs = torch.randperm(len(X))
    # idxs_z = torch.randperm(len(Z))
    SIZE = 2**11
    # X, FX, Z, FZ = X[idxs][:SIZE], FX[idxs][:SIZE], Z[idxs_z][:SIZE], FZ[idxs_z][:SIZE]
    length_ = len(X)
    scenarios_list = [
        (int(i), int(i), -1, -1) for i in 2 ** np.arange(8, 12)
    ]

    for scenario in scenarios_list:
        Nx, Nfx, Nz, Nfz = scenario
        x, fx, z, fz = X[:Nx], FX[:Nfx], Z[:Nz], FZ[:Nfz]
        # x, fx, z, fz = X[:Nx], FX[:Nfx], X[:Nx], FX[:Nfx]
        results[len(x)] = {}
        times[len(x)] = {}
        # mmds[len(x)] = {}
        for model in models:
            start_time = time.perf_counter()
            logits, target, mmd, end_time = model(x, fx, z, fz)
            # mmds[len(x)][model.__name__] = mmd
            pred_classes = np.argmax(logits, axis=1)
            true_classes = np.argmax(target, axis=1)
            accuracy = accuracy_score(true_classes, pred_classes)
            results[len(x)][model.__name__] = accuracy
            times[len(x)][model.__name__] = end_time - start_time
            print(
                f"Model: {model.__name__}, size: {Nx}, Time taken: {times[len(x)][model.__name__]:.4f} seconds, Accuracy: {results[len(x)][model.__name__]:.4f} %"
            )
    res = [{"data": results}, {"data": times}]
    # res = [{"data": results}, {"data": mmds}, {"data": times}]


.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    Model: RKHS_conv_model, size: 256, Time taken: 0.8587 seconds, Accuracy: 0.9119 %
    Model: RKHS_ridge_regression, size: 256, Time taken: 0.3842 seconds, Accuracy: 0.8578 %
    Model: Perceptron, size: 256, Time taken: 0.3200 seconds, Accuracy: 0.7780 %
    Model: vgg_torch_model, size: 256, Time taken: 1.4865 seconds, Accuracy: 0.8313 %
    Model: RKHS_conv_model, size: 512, Time taken: 1.5440 seconds, Accuracy: 0.9321 %
    Model: RKHS_ridge_regression, size: 512, Time taken: 0.5735 seconds, Accuracy: 0.8929 %
    Model: Perceptron, size: 512, Time taken: 0.3603 seconds, Accuracy: 0.7994 %
    Model: vgg_torch_model, size: 512, Time taken: 2.2845 seconds, Accuracy: 0.8823 %
    Model: RKHS_conv_model, size: 1024, Time taken: 3.3486 seconds, Accuracy: 0.9545 %
    Model: RKHS_ridge_regression, size: 1024, Time taken: 1.0122 seconds, Accuracy: 0.9238 %
    Model: Perceptron, size: 1024, Time taken: 0.4119 seconds, Accuracy: 0.8671 %
    Model: vgg_torch_model, size: 1024, Time taken: 3.8322 seconds, Accuracy: 0.9190 %
    Model: RKHS_conv_model, size: 2048, Time taken: 9.3553 seconds, Accuracy: 0.9663 %
    Model: RKHS_ridge_regression, size: 2048, Time taken: 2.0154 seconds, Accuracy: 0.9462 %
    Model: Perceptron, size: 2048, Time taken: 0.4969 seconds, Accuracy: 0.8767 %
    Model: vgg_torch_model, size: 2048, Time taken: 6.7345 seconds, Accuracy: 0.9432 %


.. GENERATED FROM PYTHON SOURCE LINES 547-550

Plotting
 ------------------------
########################################################################

.. GENERATED FROM PYTHON SOURCE LINES 550-578

.. code-block:: Python

    def plot_one(inputs):
        results = inputs["data"]
        ax = inputs["ax"]
        legend = inputs["legend"]

        x_vals = sorted(results.keys())
        for model_name in next(iter(results.values())).keys():
            y_vals = [results[x][model_name] for x in x_vals]
            label = model_aliases.get(model_name, model_name)
            ax.plot(x_vals, y_vals, marker="o", label=label)

        ax.set_xlabel("Number of Training Examples")
        ax.set_ylabel(legend)
        ax.legend()
        ax.grid(True)

        return ax


    multi_plot(
        res,
        plot_one,
        mp_nrows=1,
        mp_ncols=3,
        legends=["Accuracy", "Times"],
        mp_figsize=(14, 10),
    )
    plt.show()
    pass


.. image-sg:: /auto_ch6/images/sphx_glr_ch6_mnist_001.png
   :alt: ch6 mnist
   :srcset: /auto_ch6/images/sphx_glr_ch6_mnist_001.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 50.348 seconds)


.. _sphx_glr_download_auto_ch6_ch6_mnist.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: ch6_mnist.ipynb <ch6_mnist.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: ch6_mnist.py <ch6_mnist.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: ch6_mnist.zip <ch6_mnist.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_