Credit Card Fraud Detection using snap-ml-local
In this example we will train a Logistic Regression model on a credit card fraud dataset, using snap-ml-local.
Getting Data
For this example we use the dataset from the Kaggle credit card fraud detection competition. To prepare the data, first, create a new directory:
# Create the data directory (-p: do not fail if it already exists) and enter it
mkdir -p data
cd data
and then download the data from the Kaggle webpage into the data directory and unzip it:
# Extract the downloaded archive inside data/
unzip creditcardfraud.zip
# Go back up to the directory the scripts are run from
cd ..
Data Preprocessing
Before training, we show how to preprocess the dataset and save it in NumPy binary format for fast loading:
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np
# Preprocess the credit card CSV and dump train/test splits as .npy files.

# Import the data from csv format
data = pd.read_csv("data/creditcard.csv")

# Column positions used below.
# NOTE(review): presumably columns 1..28 are the V1..V28 features and
# column 30 is the class label of the Kaggle file -- confirm against the CSV.
feature_cols = slice(1, 29)
label_col = 30

# Standardize features by removing the mean and scaling to unit variance.
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split below, so test rows influence the scaling statistics.
data.iloc[:, feature_cols] = StandardScaler().fit_transform(data.iloc[:, feature_cols])

# Convert the data frame to its Numpy-array representation.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array and is available on both old and new pandas versions.
data_matrix = data.values
X = data_matrix[:, feature_cols]
y = data_matrix[:, label_col]

# Normalize each sample (row) to unit L1 norm
X = normalize(X, norm="l1")

# Split the data in train and test (single stratified split, 25% test)
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) index pair
train_index, test_index = next(stratSplit.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Save the dense matrices (np.save appends the ".npy" extension)
np.save("data/creditcard.X_train", X_train)
np.save("data/creditcard.X_test",  X_test)
# Save the labels
np.save("data/creditcard.y_train", y_train)
np.save("data/creditcard.y_test", y_test)
Training using Snap ML
After preprocessing the data, you are ready to train a logistic regression classifier using snap-ml-local.
import numpy as np
from scipy.sparse import load_npz
import time
import sys
import argparse
import pandas as pd
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.utils import compute_class_weight
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import normalize
# Load the preprocessed train/test arrays and report how long loading took.
load_start = time.time()

# Feature matrices
X_train = np.load("data/creditcard.X_train.npy")
X_test  = np.load("data/creditcard.X_test.npy")
# Label vectors
y_train = np.load("data/creditcard.y_train.npy")
y_test  = np.load("data/creditcard.y_test.npy")

load_seconds = time.time() - load_start
print("Data load time (s): {0:.2f}".format(load_seconds))
# Training configuration: device selection and class weighting.

# Toggle GPU training; the settings below are derived from this flag
use_gpu = True
if use_gpu:
    # GPU run: thread count, label, and the list of GPU device ids to use
    num_threads, cpu_gpu, device_ids = 256, "GPU", [0, 1, 2, 3]
else:
    # CPU run: no devices are selected
    num_threads, cpu_gpu, device_ids = 8, "CPU", []

# Toggle balanced class weights ("balanced" when enabled, otherwise None)
use_balanced_class_weights = True
class_weight = "balanced" if use_balanced_class_weights else None
# Train the pai4sk logistic regression and score it on the test set.

# LogisticRegression is taken from the pai4sk top-level package;
# alternatively it can be imported from pai4sk.linear_model:
# from pai4sk.linear_model import LogisticRegression
from pai4sk import LogisticRegression
from sklearn.metrics import average_precision_score

# Gather the estimator settings in one place, then build the classifier
lr_params = {
    "use_gpu": use_gpu,
    "device_ids": device_ids,
    "num_threads": num_threads,
    "class_weight": class_weight,
    "fit_intercept": True,
    "regularizer": 100,
}
lr = LogisticRegression(**lr_params)

# Training: only the call to fit() is timed
t0 = time.time()
lr.fit(X_train, y_train)
train_seconds = time.time() - t0
# cpu_gpu is passed as argument 0, but the template only prints argument 1
print("[pai4sk] Training time (s):  {1:.2f}".format(cpu_gpu, train_seconds))

# Number of threads to use for inference
num_threads_inference = 2

# Evaluate average precision on the test set, using column 1 of the
# predicted probabilities (presumably the fraud class -- confirm)
pred = lr.predict_proba(X_test, num_threads=num_threads_inference)[:, 1]
acc_snap = average_precision_score(y_test, pred)
print("[pai4sk] Average Precision Score :   {1:.4f}".format(cpu_gpu, acc_snap))
© Copyright IBM Corporation 2018