.. DO NOT EDIT. .. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. .. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: .. "auto_ch6\ch6_CreditCardFraud.py" .. LINE NUMBERS ARE GIVEN BELOW. .. only:: html .. note:: :class: sphx-glr-download-link-note :ref:`Go to the end <sphx_glr_download_auto_ch6_ch6_CreditCardFraud.py>` to download the full example code. .. rst-class:: sphx-glr-example-title .. _sphx_glr_auto_ch6_ch6_CreditCardFraud.py: 6.4 Unsupervised learning: Clustering - Fraud detection ======================================================= We show how to reproduce the results of chapter 6.3.3 - Credit card fraud detection. We will compare the codpy MMD minimization-based algorithm with scikit-learn k-means in an unsupervised setting. The goal is to show the different scores as we increase the number of centroids Ny used for clustering. .. GENERATED FROM PYTHON SOURCE LINES 11-14 Necessary Imports ------------------------ ######################################################################## .. GENERATED FROM PYTHON SOURCE LINES 14-43 .. code-block:: Python import sys import os import time import seaborn as sns import pandas as pd from sklearn.preprocessing import RobustScaler from sklearn.model_selection import train_test_split import kagglehub import matplotlib.pyplot as plt import numpy as np import pandas as pd from codpy.clustering import GreedySearch, SharpDiscrepancy from codpy.kernel import * from codpy.data_processing import hot_encoder from ch6_Clustering import * from sklearn.cluster import KMeans try: current_dir = os.path.dirname(__file__) data_dir = os.path.join(current_dir, "data") except NameError: current_dir = os.getcwd() data_dir = os.path.join(current_dir, "data") curr_f = os.path.join(os.getcwd(), "codpybook", "utils") sys.path.insert(0, curr_f) .. GENERATED FROM PYTHON SOURCE LINES 44-48 CreditCardFraud Data Preparation -------------------------------- We get the data from Kagglehub. We scale the values using RobustScaler, which is robust to outliers. 
This is useful for the CreditCardFraud dataset, which contains a very small percentage of fraudulent transactions. .. GENERATED FROM PYTHON SOURCE LINES 48-88 .. code-block:: Python def get_dataset(): path = kagglehub.dataset_download("mlg-ulb/creditcardfraud") print("Path to dataset files:", path) data = pd.read_csv(os.path.join(path, "creditcard.csv")) return data def prep_data(x,n): rob_scaler = RobustScaler() x["Time"] = rob_scaler.fit_transform(x['Time'].values.reshape(-1,1)) x["Amount"] = rob_scaler.fit_transform(x['Amount'].values.reshape(-1,1)) x = x[:n] frauds = x[x["Class"]==1] no_frauds = x[x["Class"]==0] train_size = 0.8 x_train_fraud, z_test_fraud= train_test_split(frauds, train_size=train_size, random_state=42) x, z = train_test_split(no_frauds, train_size=train_size, random_state=42) x = pd.concat([x,x_train_fraud]) z = pd.concat([z,z_test_fraud]) fx = x['Class'] x = x.drop(['Class'],axis=1) fz = z['Class'] z = z.drop(['Class'],axis=1) fx, fz = ( hot_encoder(pd.DataFrame(data=fx.values), cat_cols_include=[0], sort_columns=True), hot_encoder(pd.DataFrame(data=fz.values), cat_cols_include=[0], sort_columns=True), ) x, fx, z, fz = (x.to_numpy(), fx.to_numpy(), z.to_numpy(), fz.to_numpy()) return x,fx,z,fz def fraud_generator(n): return prep_data(get_dataset(),n) .. GENERATED FROM PYTHON SOURCE LINES 89-93 Running the Experiment ------------------------ This section runs the experiment to compare K-means and CodPy clustering. We use the models defined in 6.3 Unsupervised learning: Clustering - MNIST. .. GENERATED FROM PYTHON SOURCE LINES 93-145 .. 
code-block:: Python def one_experiment(X, fx, Ny, get_predictor, z, fz): def get_score(X, cluster_centers, predictor): inertia = compute_inertia(X, cluster_centers) mmd = compute_mmd(X, cluster_centers) f_z = predictor(z) f_z = f_z.argmax(1) ground_truth = fz.argmax(axis=-1) out = confusion_matrix(ground_truth, f_z) return inertia, mmd, out elapsed_time = time.time() cluster_centers, predictor = get_predictor(X, fx, Ny) elapsed_time = time.time() - elapsed_time inertia, mmd, conf_matrix = get_score(X, cluster_centers, predictor) return inertia, mmd, elapsed_time, conf_matrix def run_experiment(data_generator, Nx, Ny_values, get_predictors, labels, file_name=None): results = [] conf_matrices = {} for Ny in Ny_values: N_MNIST_pics = Nx x, fx, z, fz = data_generator(N_MNIST_pics) for get_predictor, label in zip(get_predictors, labels): inertia, mmd, elapsed_time, conf_matrix = one_experiment(x, fx, Ny, get_predictor, z, fz) print( "Method:",label, "N_partition:",Ny, " inertia:",inertia, " mmd:",mmd, " time:",elapsed_time, ) results.append( { "Method": label, "Nx": Nx, "Ny": Ny, "Execution Time (s)": elapsed_time, "inertia": inertia, "mmd": mmd, } ) conf_matrices[label] = conf_matrix out = pd.DataFrame(results) print(out) if file_name is not None: out.to_csv(file_name, index=False) conf_matrices = [{"data": conf_mat} for label, conf_mat in conf_matrices.items()] return results, conf_matrices .. GENERATED FROM PYTHON SOURCE LINES 146-149 Plotting ------------------------ This section formats the data and plots the different experiments on a figure. .. GENERATED FROM PYTHON SOURCE LINES 149-213 .. code-block:: Python def plot_experiment(inputs): """ This is mainly boilerplate formatting the data for plotting. 
""" results = [{"data": {}} for _ in range(3)] for res in inputs: ny = res["Ny"] method = res["Method"] t = res["Execution Time (s)"] inertia = res["inertia"] mmd = res["mmd"] results[0]["data"].setdefault(ny, {})[method] = mmd results[1]["data"].setdefault(ny, {})[method] = inertia results[2]["data"].setdefault(ny, {})[method] = t def plot_one(inputs): results = inputs["data"] ax = inputs["ax"] legend = inputs["legend"] for model_name in next(iter(results.values())).keys(): x_vals = sorted(results.keys()) y_vals = [results[x][model_name] for x in x_vals] ax.plot(x_vals, y_vals, marker='o', label=model_name) ax.set_xlabel('Ny') ax.set_ylabel(legend) ax.legend() ax.grid(True) return ax multi_plot( results, plot_one, mp_nrows=1, mp_ncols=4, mp_figsize=(14, 10), legends=["discrepancy_errors", "inertia", "execution_time"], ) def plot_conf_matrix(inputs): conf_matrix = inputs["data"] ax = inputs["ax"] legend = inputs["legend"] sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax) ax.set_xlabel('Predicted labels') ax.set_ylabel('True labels') ax.set_title(legend) return ax if __name__ == "__main__": get_predictors = [ lambda X, fx, N: codpy_clustering(X, fx, N), lambda X, fx, N: kmeans_clustering(X, fx, N), ] labels = ["greedy", "kmeans"] # Run the experiment Nxs, Nys = 4096*4, [10, 20] results, conf_matrices = run_experiment(fraud_generator, Nxs, Nys, get_predictors, labels) plot_experiment(results) plt.show() multi_plot(conf_matrices, plot_conf_matrix, mp_nrows=1, mp_ncols=2, mp_figsize=(14, 10), legends=["MMD Codpy", "k-means"]) plt.show() .. rst-class:: sphx-glr-horizontal * .. image-sg:: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_001.png :alt: ch6 CreditCardFraud :srcset: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_001.png :class: sphx-glr-multi-img * .. image-sg:: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_002.png :alt: MMD Codpy, k-means :srcset: /auto_ch6/images/sphx_glr_ch6_CreditCardFraud_002.png :class: sphx-glr-multi-img .. 
rst-class:: sphx-glr-script-out .. code-block:: none Path to dataset files: C:\Users\geoff\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3 Method: greedy N_partition: 10 inertia: 464594.9838624175 mmd: 0.0948630994595838 time: 0.2972526550292969 Method: kmeans N_partition: 10 inertia: 282535.826614884 mmd: 0.09666078993090986 time: 0.33942198753356934 Path to dataset files: C:\Users\geoff\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3 Method: greedy N_partition: 20 inertia: 426947.06153945025 mmd: 0.04651846029808986 time: 0.31640195846557617 Method: kmeans N_partition: 20 inertia: 215497.90363048157 mmd: 0.048244749837085515 time: 0.4898262023925781 Method Nx Ny Execution Time (s) inertia mmd 0 greedy 16384 10 0.297253 464594.983862 0.094863 1 kmeans 16384 10 0.339422 282535.826615 0.096661 2 greedy 16384 20 0.316402 426947.061539 0.046518 3 kmeans 16384 20 0.489826 215497.903630 0.048245 .. rst-class:: sphx-glr-timing **Total running time of the script:** (0 minutes 6.787 seconds) .. _sphx_glr_download_auto_ch6_ch6_CreditCardFraud.py: .. only:: html .. container:: sphx-glr-footer sphx-glr-footer-example .. container:: sphx-glr-download sphx-glr-download-jupyter :download:`Download Jupyter notebook: ch6_CreditCardFraud.ipynb <ch6_CreditCardFraud.ipynb>` .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: ch6_CreditCardFraud.py <ch6_CreditCardFraud.py>` .. container:: sphx-glr-download sphx-glr-download-zip :download:`Download zipped: ch6_CreditCardFraud.zip <ch6_CreditCardFraud.zip>` .. only:: html .. rst-class:: sphx-glr-signature `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_