.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_ch6\ch6_CreditCardFraud.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_ch6_ch6_CreditCardFraud.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_ch6_ch6_CreditCardFraud.py:


6.4 Unsupervised learning: Clustering - Fraud detection
=======================================================

We show how to reproduce the results of chapter 6.3.3 - Credit card fraud detection.
We will compare the codpy MMD minimization-based algorithm with scikit-learn k-means in an unsupervised setting.
The goal is to show the different scores as we increase the number of centroids Ny used for clustering.

.. GENERATED FROM PYTHON SOURCE LINES 11-14

Necessary Imports
-----------------

.. GENERATED FROM PYTHON SOURCE LINES 14-43

.. code-block:: Python

    import sys
    import os 
    import time
    import seaborn as sns

    import pandas as pd
    from sklearn.preprocessing import RobustScaler
    from sklearn.model_selection import train_test_split
    import kagglehub

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from codpy.clustering import GreedySearch, SharpDiscrepancy
    from codpy.kernel import *
    from codpy.data_processing import hot_encoder
    from ch6_Clustering import *
    from sklearn.cluster import KMeans

    # Directory containing this script. ``__file__`` is not defined in every
    # execution context, in which case the working directory is used instead.
    try:
        current_dir = os.path.dirname(__file__)
    except NameError:
        current_dir = os.getcwd()
    data_dir = os.path.join(current_dir, "data")

    # Put ``<cwd>/codpybook/utils`` at the front of the module search path.
    curr_f = os.path.join(os.getcwd(), "codpybook", "utils")
    sys.path.insert(0, curr_f)


.. GENERATED FROM PYTHON SOURCE LINES 44-48

CreditCardFraud Data Preparation
--------------------------------

We get the data from Kagglehub. We scale the ``Time`` and ``Amount`` columns using RobustScaler, which is robust to outliers.
This is useful for the CreditCardFraud dataset, which contains a very small percentage of fraudulent transactions.

.. GENERATED FROM PYTHON SOURCE LINES 48-88

.. code-block:: Python


    def get_dataset():
        """
        Fetch the ``mlg-ulb/creditcardfraud`` dataset through kagglehub and
        load its ``creditcard.csv`` file.

        :returns: the content of ``creditcard.csv`` as a pandas DataFrame.
        """
        dataset_dir = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
        print("Path to dataset files:", dataset_dir)

        csv_file = os.path.join(dataset_dir, "creditcard.csv")
        return pd.read_csv(csv_file)

    def prep_data(x,n):
        """
        Scale, truncate and split the credit card data into train and test sets.

        The ``Time`` and ``Amount`` columns are rescaled with ``RobustScaler``
        fitted on every row of ``x``; only afterwards are the first ``n`` rows
        kept. Rows with ``Class == 1`` and rows with ``Class == 0`` are each
        split 80/20 (``random_state=42``) and the two train parts, respectively
        the two test parts, are concatenated.

        The caller's DataFrame is not modified: all work is done on a copy.

        :param x: DataFrame holding at least the ``Time``, ``Amount`` and
            ``Class`` columns.
        :param n: number of leading rows of ``x`` to keep after scaling.
        :returns: ``(x, fx, z, fz)`` as NumPy arrays: train features, train
            labels encoded by ``hot_encoder``, test features, test labels
            encoded by ``hot_encoder``.
        """
        # Work on a copy so the scaled columns do not overwrite the caller's data.
        x = x.copy()

        rob_scaler = RobustScaler()
        x["Time"] = rob_scaler.fit_transform(x['Time'].values.reshape(-1,1))
        x["Amount"] = rob_scaler.fit_transform(x['Amount'].values.reshape(-1,1))

        # Truncate only after scaling, so the scaler statistics come from the full data.
        x = x[:n]
        frauds = x[x["Class"]==1]
        no_frauds = x[x["Class"]==0]
        train_size = 0.8

        # Split each class on its own, then merge the train and the test parts.
        x_train_fraud, z_test_fraud= train_test_split(frauds, train_size=train_size, random_state=42)
        x, z = train_test_split(no_frauds, train_size=train_size, random_state=42)
        x = pd.concat([x,x_train_fraud])
        z = pd.concat([z,z_test_fraud])

        # Separate the label column from the features.
        fx = x['Class']
        x = x.drop(['Class'],axis=1)
        fz = z['Class']
        z = z.drop(['Class'],axis=1)

        fx, fz = (
            hot_encoder(pd.DataFrame(data=fx.values), cat_cols_include=[0], sort_columns=True),
            hot_encoder(pd.DataFrame(data=fz.values), cat_cols_include=[0], sort_columns=True),
        )
        x, fx, z, fz = (x.to_numpy(), fx.to_numpy(), z.to_numpy(), fz.to_numpy())

        return x,fx,z,fz

    def fraud_generator(n):
        """
        Load the credit card dataset and prepare it, keeping its first ``n`` rows.

        :param n: number of rows forwarded to ``prep_data``.
        :returns: the ``(x, fx, z, fz)`` tuple produced by ``prep_data``.
        """
        raw_data = get_dataset()
        return prep_data(raw_data, n)


.. GENERATED FROM PYTHON SOURCE LINES 89-93

Running the Experiment
----------------------

This section runs the experiment to compare K-means and CodPy clustering.
We use the models defined in 6.3 Unsupervised learning: Clustering - MNIST.

.. GENERATED FROM PYTHON SOURCE LINES 93-145

.. code-block:: Python


    def one_experiment(X, fx, Ny, get_predictor, z, fz):
        """
        Fit one clustering method and score it.

        :param X: training samples.
        :param fx: training labels, passed through to ``get_predictor``.
        :param Ny: number of centroids requested from ``get_predictor``.
        :param get_predictor: callable ``(X, fx, Ny)`` returning the pair
            ``(cluster_centers, predictor)``.
        :param z: test samples given to ``predictor``.
        :param fz: test labels; the class index is taken as ``argmax`` over
            the last axis.
        :returns: ``(inertia, mmd, elapsed_time, conf_matrix)``, where
            ``elapsed_time`` is the time in seconds spent inside
            ``get_predictor`` only (scoring is not timed).
        """
        # perf_counter is monotonic and high resolution; time.time() follows
        # the system clock, which can be adjusted while the fit is running.
        start = time.perf_counter()
        cluster_centers, predictor = get_predictor(X, fx, Ny)
        elapsed_time = time.perf_counter() - start

        # Clustering quality of the centers with respect to the training samples.
        inertia = compute_inertia(X, cluster_centers)
        mmd = compute_mmd(X, cluster_centers)

        # Classification quality on the test set.
        predicted = predictor(z).argmax(1)
        ground_truth = fz.argmax(axis=-1)
        conf_matrix = confusion_matrix(ground_truth, predicted)

        return inertia, mmd, elapsed_time, conf_matrix

    def run_experiment(data_generator, Nx, Ny_values, get_predictors, labels, file_name=None):
        """
        Run every method for every number of centroids and collect the scores.

        :param data_generator: callable taking ``Nx`` and returning ``(x, fx, z, fz)``.
        :param Nx: value handed to ``data_generator``; also stored in each record.
        :param Ny_values: iterable of centroid counts to try.
        :param get_predictors: callables forwarded to ``one_experiment``.
        :param labels: method names, paired with ``get_predictors`` by position.
        :param file_name: if not ``None``, the result table is written there as CSV.
        :returns: ``(results, conf_matrices)``. ``results`` is a list with one dict
            per (Ny, method) run. ``conf_matrices`` is a list of ``{"data": matrix}``
            dicts, one per label, holding the matrix of the last ``Ny`` processed
            (earlier ones are overwritten).
        """
        records = []
        latest_conf_matrix = {}
        for Ny in Ny_values:
            # The data set is generated again for every value of Ny.
            x, fx, z, fz = data_generator(Nx)
            for get_predictor, label in zip(get_predictors, labels):
                scores = one_experiment(x, fx, Ny, get_predictor, z, fz)
                inertia, mmd, elapsed_time, conf_matrix = scores
                print(
                    "Method:", label,
                    "N_partition:", Ny,
                    " inertia:", inertia,
                    " mmd:", mmd,
                    " time:", elapsed_time,
                )
                record = {
                    "Method": label,
                    "Nx": Nx,
                    "Ny": Ny,
                    "Execution Time (s)": elapsed_time,
                    "inertia": inertia,
                    "mmd": mmd,
                }
                records.append(record)
                latest_conf_matrix[label] = conf_matrix

        table = pd.DataFrame(records)
        print(table)
        if file_name is not None:
            table.to_csv(file_name, index=False)

        wrapped_matrices = [{"data": matrix} for matrix in latest_conf_matrix.values()]
        return records, wrapped_matrices


.. GENERATED FROM PYTHON SOURCE LINES 146-149

Plotting
--------

This section formats the data and plots the results of the different experiments on a figure.

.. GENERATED FROM PYTHON SOURCE LINES 149-213

.. code-block:: Python

    def plot_experiment(inputs):
        """
        Regroup the experiment records per metric and draw one panel per metric.

        ``inputs`` is a list of dicts with the keys ``"Ny"``, ``"Method"``,
        ``"Execution Time (s)"``, ``"inertia"`` and ``"mmd"``. Three nested
        mappings ``{Ny: {method: value}}`` are built (mmd, inertia, time, in
        that order) and handed to ``multi_plot`` together with ``plot_one``.
        """
        mmd_by_ny, inertia_by_ny, time_by_ny = {}, {}, {}
        for record in inputs:
            n_centroids = record["Ny"]
            method_name = record["Method"]
            seconds = record["Execution Time (s)"]
            inertia_value = record["inertia"]
            mmd_value = record["mmd"]
            mmd_by_ny.setdefault(n_centroids, {})[method_name] = mmd_value
            inertia_by_ny.setdefault(n_centroids, {})[method_name] = inertia_value
            time_by_ny.setdefault(n_centroids, {})[method_name] = seconds

        panels = [{"data": mmd_by_ny}, {"data": inertia_by_ny}, {"data": time_by_ny}]

        def plot_one(inputs):
            # Draw, on the given axes, one curve per method against Ny.
            curves = inputs["data"]
            axis = inputs["ax"]
            y_label = inputs["legend"]
            # The methods are read from the first Ny entry.
            first_entry = next(iter(curves.values()))
            for model_name in first_entry.keys():
                ny_sorted = sorted(curves.keys())
                values = [curves[ny][model_name] for ny in ny_sorted]
                axis.plot(ny_sorted, values, marker='o', label=model_name)
            axis.set_xlabel('Ny')
            axis.set_ylabel(y_label)
            axis.legend()
            axis.grid(True)

            return axis

        multi_plot(
            panels,
            plot_one,
            mp_nrows=1,
            mp_ncols=4,
            mp_figsize=(14, 10),
            legends=["discrepancy_errors", "inertia", "execution_time"],
        )

    def plot_conf_matrix(inputs):
        """
        Draw one confusion matrix as an annotated heatmap.

        :param inputs: dict with the matrix under ``"data"``, the target axes
            under ``"ax"`` and the plot title under ``"legend"``.
        :returns: the axes that were drawn on.
        """
        matrix, axis, title = inputs["data"], inputs["ax"], inputs["legend"]
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=axis)
        axis.set_xlabel('Predicted labels')
        axis.set_ylabel('True labels')
        axis.set_title(title)
        return axis

    if __name__ == "__main__":
        # Clustering methods to compare, paired by position with their labels.
        get_predictors = [
            lambda X, fx, N: codpy_clustering(X, fx, N),
            lambda X, fx, N: kmeans_clustering(X, fx, N),
        ]
        labels = ["greedy", "kmeans"]

        # Run the experiment
        Nxs = 4096*4
        Nys = [10, 20]
        results, conf_matrices = run_experiment(
            fraud_generator, Nxs, Nys, get_predictors, labels
        )

        # First figure: the scores against Ny.
        plot_experiment(results)
        plt.show()

        # Second figure: one confusion matrix per method.
        multi_plot(
            conf_matrices,
            plot_conf_matrix,
            mp_nrows=1,
            mp_ncols=2,
            mp_figsize=(14, 10),
            legends=["MMD Codpy", "k-means"],
        )
        plt.show()


.. rst-class:: sphx-glr-horizontal


    *

      .. image-sg:: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_001.png
         :alt: ch6 CreditCardFraud
         :srcset: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_001.png
         :class: sphx-glr-multi-img

    *

      .. image-sg:: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_002.png
         :alt: MMD Codpy, k-means
         :srcset: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_002.png
         :class: sphx-glr-multi-img


.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    Path to dataset files: C:\Users\geoff\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3
    Method: greedy N_partition: 10  inertia: 464594.9838624175  mmd: 0.0948630994595838  time: 0.2972526550292969
    Method: kmeans N_partition: 10  inertia: 282535.826614884  mmd: 0.09666078993090986  time: 0.33942198753356934
    Path to dataset files: C:\Users\geoff\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3
    Method: greedy N_partition: 20  inertia: 426947.06153945025  mmd: 0.04651846029808986  time: 0.31640195846557617
    Method: kmeans N_partition: 20  inertia: 215497.90363048157  mmd: 0.048244749837085515  time: 0.4898262023925781
       Method     Nx  Ny  Execution Time (s)        inertia       mmd
    0  greedy  16384  10            0.297253  464594.983862  0.094863
    1  kmeans  16384  10            0.339422  282535.826615  0.096661
    2  greedy  16384  20            0.316402  426947.061539  0.046518
    3  kmeans  16384  20            0.489826  215497.903630  0.048245


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 6.787 seconds)


.. _sphx_glr_download_auto_ch6_ch6_CreditCardFraud.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: ch6_CreditCardFraud.ipynb <ch6_CreditCardFraud.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: ch6_CreditCardFraud.py <ch6_CreditCardFraud.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: ch6_CreditCardFraud.zip <ch6_CreditCardFraud.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_