6.4 Unsupervised learning: Clustering - Fraud detection

We show how to reproduce the results of chapter 6.3.3 - Credit card fraud detection. We will compare the codpy MMD minimization-based algorithm with scikit-learn k-means in an unsupervised setting. The goal is to show the different scores as we increase the number of centroids Ny used for clustering.

Necessary Imports

import sys
import os
import time
import seaborn as sns

import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import kagglehub

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from codpy.clustering import GreedySearch, SharpDiscrepancy
from codpy.kernel import *
from codpy.data_processing import hot_encoder
from ch6_Clustering import *
from sklearn.cluster import KMeans

# Locate the directory of this script. Interactive sessions do not define
# __file__, so fall back to the current working directory in that case.
try:
    current_dir = os.path.dirname(__file__)
except NameError:
    current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "data")

# Put the codpybook/utils folder (relative to the working directory) at the
# front of the module search path.
curr_f = os.path.join(os.getcwd(), "codpybook", "utils")
sys.path.insert(0, curr_f)

CreditCardFraud Data Preparation

We get the data from Kagglehub. We scale the `Time` and `Amount` columns using RobustScaler, which is robust to outliers. This is useful for the CreditCardFraud dataset, which contains a very small percentage of fraudulent transactions.

def get_dataset():
    """Fetch the Kaggle credit-card-fraud dataset and load it.

    The dataset location reported by ``kagglehub`` is printed before loading.

    Returns:
        The contents of ``creditcard.csv`` as a pandas DataFrame.
    """
    dataset_dir = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

    print("Path to dataset files:", dataset_dir)

    csv_file = os.path.join(dataset_dir, "creditcard.csv")
    return pd.read_csv(csv_file)

def prep_data(x, n):
    """Scale, truncate and split the credit-card data into train/test arrays.

    ``Time`` and ``Amount`` are rescaled with ``RobustScaler`` fitted on every
    row of ``x``; only afterwards are the first ``n`` rows kept. Rows with
    ``Class == 1`` and rows with ``Class == 0`` are each split 80/20 with a
    fixed seed, and the two training parts (resp. test parts) are concatenated.

    Args:
        x: DataFrame with at least ``Time``, ``Amount`` and ``Class`` columns.
            It is left unmodified; all work is done on a copy.
        n: Number of leading rows to keep after scaling.

    Returns:
        Tuple ``(x, fx, z, fz)`` of NumPy arrays: training features, training
        labels after ``hot_encoder``, test features, test labels after
        ``hot_encoder``.
    """
    # Work on a copy so the caller's DataFrame is not rescaled in place.
    x = x.copy()

    rob_scaler = RobustScaler()
    x["Time"] = rob_scaler.fit_transform(x["Time"].values.reshape(-1, 1))
    x["Amount"] = rob_scaler.fit_transform(x["Amount"].values.reshape(-1, 1))

    x = x[:n]
    frauds = x[x["Class"] == 1]
    no_frauds = x[x["Class"] == 0]
    train_size = 0.8

    # Split each class separately with the same ratio and seed.
    x_train_fraud, z_test_fraud = train_test_split(
        frauds, train_size=train_size, random_state=42
    )
    x, z = train_test_split(no_frauds, train_size=train_size, random_state=42)
    x = pd.concat([x, x_train_fraud])
    z = pd.concat([z, z_test_fraud])

    # Separate the label column from the features.
    fx = x["Class"]
    x = x.drop(["Class"], axis=1)
    fz = z["Class"]
    z = z.drop(["Class"], axis=1)

    # NOTE(review): train and test labels are encoded independently; this
    # presumably yields matching columns only when both sides contain both
    # classes - confirm against hot_encoder.
    fx, fz = (
        hot_encoder(pd.DataFrame(data=fx.values), cat_cols_include=[0], sort_columns=True),
        hot_encoder(pd.DataFrame(data=fz.values), cat_cols_include=[0], sort_columns=True),
    )
    x, fx, z, fz = (x.to_numpy(), fx.to_numpy(), z.to_numpy(), fz.to_numpy())

    return x, fx, z, fz

def fraud_generator(n):
    """Load the fraud dataset and return its prepared train/test arrays.

    Args:
        n: Forwarded to ``prep_data`` as its row-count argument.

    Returns:
        Whatever ``prep_data`` returns for the freshly loaded dataset.
    """
    raw = get_dataset()
    return prep_data(raw, n)

Running the Experiment

This section runs the experiment to compare K-means and CodPy clustering. We use the models defined in 6.3 Unsupervised learning: Clustering - MNIST.

def one_experiment(X, fx, Ny, get_predictor, z, fz):
    """Fit one clustering method, time the fit, and score the result.

    Args:
        X: Training samples, passed to ``get_predictor`` and to the
            inertia / MMD helpers.
        fx: Training labels, passed to ``get_predictor``.
        Ny: Number of centroids requested from ``get_predictor``.
        get_predictor: Callable ``(X, fx, Ny) -> (cluster_centers, predictor)``.
        z: Test samples fed to the returned ``predictor``.
        fz: Test labels; the class index is taken as ``argmax`` over the
            last axis.

    Returns:
        Tuple ``(inertia, mmd, elapsed_time, conf_matrix)``. ``elapsed_time``
        is the duration of the ``get_predictor`` call in seconds.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not affected by system clock adjustments as time.time() can be.
    start = time.perf_counter()
    cluster_centers, predictor = get_predictor(X, fx, Ny)
    elapsed_time = time.perf_counter() - start

    # Clustering quality on the training samples.
    inertia = compute_inertia(X, cluster_centers)
    mmd = compute_mmd(X, cluster_centers)

    # Confusion matrix on the test set: predicted vs. true class indices.
    predicted = predictor(z).argmax(1)
    ground_truth = fz.argmax(axis=-1)
    conf_matrix = confusion_matrix(ground_truth, predicted)

    return inertia, mmd, elapsed_time, conf_matrix

def run_experiment(data_generator, Nx, Ny_values, get_predictors, labels, file_name=None):
    """Run every clustering method for every centroid count and collect scores.

    Args:
        data_generator: Callable ``Nx -> (x, fx, z, fz)``; it is invoked once
            per entry of ``Ny_values``.
        Nx: Sample count handed to ``data_generator`` and recorded in the results.
        Ny_values: Iterable of centroid counts to try.
        get_predictors: Predictor factories, paired positionally with ``labels``.
        labels: Display name of each method.
        file_name: Optional CSV path; when given, the results table is written
            there without the index.

    Returns:
        Tuple ``(results, conf_matrices)``: the list of per-run result dicts,
        and a list of ``{"data": matrix}`` dicts holding, for each label, the
        confusion matrix of the last ``Ny`` processed.
    """
    records = []
    latest_conf = {}
    for Ny in Ny_values:
        x, fx, z, fz = data_generator(Nx)
        for get_predictor, label in zip(get_predictors, labels):
            inertia, mmd, elapsed_time, conf_matrix = one_experiment(
                x, fx, Ny, get_predictor, z, fz
            )
            print(
                "Method:", label,
                "N_partition:", Ny,
                " inertia:", inertia,
                " mmd:", mmd,
                " time:", elapsed_time,
            )
            record = {
                "Method": label,
                "Nx": Nx,
                "Ny": Ny,
                "Execution Time (s)": elapsed_time,
                "inertia": inertia,
                "mmd": mmd,
            }
            records.append(record)
            # Keyed by label only: a later Ny overwrites the earlier matrix.
            latest_conf[label] = conf_matrix

    table = pd.DataFrame(records)
    print(table)
    if file_name is not None:
        table.to_csv(file_name, index=False)

    wrapped = [{"data": matrix} for matrix in latest_conf.values()]
    return records, wrapped

Plotting

This section formats the data and plots the results of the different experiments on a figure.

def plot_experiment(inputs):
    """Plot MMD, inertia and execution time against Ny, one curve per method.

    Args:
        inputs: Iterable of result dicts carrying the keys ``"Ny"``,
            ``"Method"``, ``"Execution Time (s)"``, ``"inertia"`` and ``"mmd"``.
    """
    # One {Ny: {method: value}} table per metric, in plotting order.
    mmd_table, inertia_table, time_table = {}, {}, {}
    for record in inputs:
        ny = record["Ny"]
        method = record["Method"]
        duration = record["Execution Time (s)"]
        inertia = record["inertia"]
        mmd = record["mmd"]
        mmd_table.setdefault(ny, {})[method] = mmd
        inertia_table.setdefault(ny, {})[method] = inertia
        time_table.setdefault(ny, {})[method] = duration
    panels = [{"data": mmd_table}, {"data": inertia_table}, {"data": time_table}]

    def plot_one(inputs):
        """Draw one metric table on the axes supplied alongside it."""
        series = inputs["data"]
        ax = inputs["ax"]
        legend = inputs["legend"]
        # Method names are taken from the first Ny entry.
        first_row = next(iter(series.values()))
        for model_name in first_row:
            ny_sorted = sorted(series)
            values = [series[ny][model_name] for ny in ny_sorted]
            ax.plot(ny_sorted, values, marker='o', label=model_name)
        ax.set_xlabel('Ny')
        ax.set_ylabel(legend)
        ax.legend()
        ax.grid(True)

        return ax

    multi_plot(
        panels,
        plot_one,
        mp_nrows=1,
        mp_ncols=4,
        mp_figsize=(14, 10),
        legends=["discrepancy_errors", "inertia", "execution_time"],
    )

def plot_conf_matrix(inputs):
    """Draw one confusion matrix as an annotated heatmap.

    Args:
        inputs: Dict with ``"data"`` (the matrix), ``"ax"`` (axes to draw on)
            and ``"legend"`` (used as the plot title).

    Returns:
        The axes that were drawn on.
    """
    matrix, ax, title = inputs["data"], inputs["ax"], inputs["legend"]
    sns.heatmap(matrix, ax=ax, annot=True, fmt='d', cmap='Blues')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    return ax

if __name__ == "__main__":
    # Each method is listed once as (label, predictor factory); the two
    # parallel lists expected by run_experiment are derived from this table.
    methods = [
        ("greedy", lambda X, fx, N: codpy_clustering(X, fx, N)),
        ("kmeans", lambda X, fx, N: kmeans_clustering(X, fx, N)),
    ]
    labels = [name for name, _ in methods]
    get_predictors = [factory for _, factory in methods]

    # Sample count and the centroid counts to compare.
    Nxs = 4096 * 4
    Nys = [10, 20]

    # Run the experiment
    results, conf_matrices = run_experiment(
        fraud_generator, Nxs, Nys, get_predictors, labels
    )

    # Score curves first, then one confusion matrix per method.
    plot_experiment(results)
    plt.show()
    multi_plot(
        conf_matrices,
        plot_conf_matrix,
        mp_nrows=1,
        mp_ncols=2,
        mp_figsize=(14, 10),
        legends=["MMD Codpy", "k-means"],
    )
    plt.show()

Path to dataset files: C:\Users\geoff\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3
Method: greedy N_partition: 10  inertia: 464594.9838624175  mmd: 0.0948630994595838  time: 0.2972526550292969
Method: kmeans N_partition: 10  inertia: 282535.826614884  mmd: 0.09666078993090986  time: 0.33942198753356934
Path to dataset files: C:\Users\geoff\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3
Method: greedy N_partition: 20  inertia: 426947.06153945025  mmd: 0.04651846029808986  time: 0.31640195846557617
Method: kmeans N_partition: 20  inertia: 215497.90363048157  mmd: 0.048244749837085515  time: 0.4898262023925781
   Method     Nx  Ny  Execution Time (s)        inertia       mmd
0  greedy  16384  10            0.297253  464594.983862  0.094863
1  kmeans  16384  10            0.339422  282535.826615  0.096661
2  greedy  16384  20            0.316402  426947.061539  0.046518
3  kmeans  16384  20            0.489826  215497.903630  0.048245

Total running time of the script: (0 minutes 6.787 seconds)

Gallery generated by Sphinx-Gallery