Credit Card Fraud Detection using snap-ml-local¶
In this example we will train a Logistic Regression model on a credit card fraud dataset, using snap-ml-local.
Getting Data¶
For this example we use the dataset from the Kaggle credit card fraud detection competition. To prepare the data, first, create a new directory:
mkdir data
cd data
and then download the data from the Kaggle webpage into the data directory and unzip it:
unzip creditcardfraud.zip
cd ../
Data Preprocessing¶
Before doing the training, we show how to preprocess the dataset and dump it into NumPy binary format for fast loading.
"""Preprocess the Kaggle credit-card-fraud CSV and dump stratified
train/test splits as NumPy binary files for fast loading."""
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np

# Import the data from csv format
data = pd.read_csv("data/creditcard.csv")

# Standardize the feature columns (indices 1..28) by removing the mean
# and scaling to unit variance
data.iloc[:, 1:29] = StandardScaler().fit_transform(data.iloc[:, 1:29])

# Convert the data frame to its NumPy-array representation.
# BUG FIX: DataFrame.as_matrix() was deprecated and removed from pandas
# (gone as of 0.25); to_numpy() is the supported replacement.
data_matrix = data.to_numpy()

# Features are columns 1..28; the label lives in column 30
X = data_matrix[:, 1:29]
y = data_matrix[:, 30]

# Normalize each sample to unit L1 norm
X = normalize(X, norm="l1")

# Split the data in train and test; stratification preserves the
# (highly imbalanced) class ratio in both sets
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_index, test_index in strat_split.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Save the dense matrices
np.save("data/creditcard.X_train", X_train)
np.save("data/creditcard.X_test", X_test)
# Save the labels
np.save("data/creditcard.y_train", y_train)
np.save("data/creditcard.y_test", y_test)
Training using Snap ML¶
After preprocessing the data, you are ready to train a logistic regression classifier using snap-ml-local.
"""Train a Snap ML (pai4sk) logistic-regression classifier on the
preprocessed credit-card-fraud data and report the average precision."""
import numpy as np
from scipy.sparse import load_npz
import time
import sys
import argparse
import pandas as pd
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.utils import compute_class_weight
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import normalize

# timing
t0 = time.time()

# Import the data (the .npy files written by the preprocessing step)
X_train = np.load("data/creditcard.X_train.npy")
X_test = np.load("data/creditcard.X_test.npy")
y_train = np.load("data/creditcard.y_train.npy")
y_test = np.load("data/creditcard.y_test.npy")
print("Data load time (s): {0:.2f}".format(time.time() - t0))

# specify whether to use GPUs for training or not
use_gpu = True
device_ids = []
if use_gpu:
    num_threads = 256
    cpu_gpu = "GPU"
    # specify how many and which GPUs to use
    device_ids = [0, 1, 2, 3]
else:
    num_threads = 8
    cpu_gpu = "CPU"

# specify whether to balance class weights (the dataset is highly imbalanced)
use_balanced_class_weights = True
class_weight = "balanced" if use_balanced_class_weights else None

# Import the LogisticRegression classifier from pai4sk
from pai4sk import LogisticRegression
# Alternatively you can also use the LogisticRegression classifier from pai4sk.linear_model
# from pai4sk.linear_model import LogisticRegression
lr = LogisticRegression(use_gpu=use_gpu, device_ids=device_ids,
                        num_threads=num_threads, class_weight=class_weight,
                        fit_intercept=True, regularizer=100)

# Training
t0 = time.time()
lr.fit(X_train, y_train)
# BUG FIX: the original format string passed cpu_gpu but never referenced
# {0}, so the CPU/GPU label was silently dropped from the output.
print("[pai4sk {0}] Training time (s): {1:.2f}".format(cpu_gpu, time.time() - t0))

# set num_threads to use for inference
num_threads_inference = 2

# Evaluate on the test set using the positive-class probabilities
pred = lr.predict_proba(X_test, num_threads=num_threads_inference)[:, 1]

from sklearn.metrics import average_precision_score
acc_snap = average_precision_score(y_test, pred)
# Same {0} fix as above: include the CPU/GPU label in the report line.
print("[pai4sk {0}] Average Precision Score : {1:.4f}".format(cpu_gpu, acc_snap))
© Copyright IBM Corporation 2018