Transaction Detection Project
# -*- coding: utf-8 -*-
# 전처리 & 정규화
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the transaction table from the mounted Drive folder.
FRAUD_CSV_PATH = '/content/drive/MyDrive/ml/data/Fraud.csv'
df = pd.read_csv(FRAUD_CSV_PATH)

# First look at the data: leading rows, column dtypes, missing values per
# column, class balance of the target, and summary statistics.
df.head()
df.info()
df.isnull().sum()
df['isFraud'].value_counts()
df.describe()
Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
# Column Dtype
--- ------ -----
0 step int64
1 type object
2 amount float64
3 nameOrig object
4 oldbalanceOrg float64
5 newbalanceOrig float64
6 nameDest object
7 oldbalanceDest float64
8 newbalanceDest float64
9 isFraud int64
10 isFlaggedFraud int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
step | amount | oldbalanceOrg | newbalanceOrig | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | |
---|---|---|---|---|---|---|---|---|
count | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 |
mean | 2.433972e+02 | 1.798619e+05 | 8.338831e+05 | 8.551137e+05 | 1.100702e+06 | 1.224996e+06 | 1.290820e-03 | 2.514687e-06 |
std | 1.423320e+02 | 6.038582e+05 | 2.888243e+06 | 2.924049e+06 | 3.399180e+06 | 3.674129e+06 | 3.590480e-02 | 1.585775e-03 |
min | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
25% | 1.560000e+02 | 1.338957e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
50% | 2.390000e+02 | 7.487194e+04 | 1.420800e+04 | 0.000000e+00 | 1.327057e+05 | 2.146614e+05 | 0.000000e+00 | 0.000000e+00 |
75% | 3.350000e+02 | 2.087215e+05 | 1.073152e+05 | 1.442584e+05 | 9.430367e+05 | 1.111909e+06 | 0.000000e+00 | 0.000000e+00 |
max | 7.430000e+02 | 9.244552e+07 | 5.958504e+07 | 4.958504e+07 | 3.560159e+08 | 3.561793e+08 | 1.000000e+00 | 1.000000e+00 |
# Fraud / non-fraud row counts for each transaction type.
df.groupby(['isFraud', 'type']).size().unstack(fill_value=0)

# One-hot encode the transaction type.
types = pd.get_dummies(df['type'])
types.head()

# Integer-encode the two account-name columns. The same encoder object is
# refit for each column, so afterwards it only holds the nameDest mapping.
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
for id_column in ('nameOrig', 'nameDest'):
    df[id_column] = label.fit_transform(df[id_column])

# Append the indicator columns, then remove the raw 'type' and 'step' columns.
df = pd.concat([df, types], axis=1).drop(columns=['type', 'step'])
df.head()

# Separate the feature matrix from the target column.
X = df.drop(columns='isFraud')
y = df['isFraud']
X.shape, y.shape
((6362620, 13), (6362620,))
RANDOM_SEED = 42
from imblearn.under_sampling import RandomUnderSampler

# Resample the full data set with RandomUnderSampler and report the
# resulting class counts.
rus = RandomUnderSampler(random_state=0)
X_under, y_under = rus.fit_resample(X, y)
under_class_counts = pd.Series(y_under).value_counts()
print('lable : ')
print(under_class_counts)

# Hold out 20% of the resampled data for evaluation.
# NOTE(review): the hold-out rows come from the already resampled data, so
# their class ratio is not that of the original table - confirm this is the
# intended evaluation setting.
under_split = train_test_split(
    X_under, y_under, test_size=0.2, random_state=RANDOM_SEED
)
X_train_under, X_test_under, y_train_under, y_test_under = under_split
print('RandomUnderSampling train set : ', X_train_under.shape, y_train_under.shape)
lable :
0 8213
1 8213
Name: isFraud, dtype: int64
RandomUnderSampling train set : (13140, 13) (13140,)
RANDOM_SEED = 42
from imblearn.over_sampling import SMOTE

# Split first, so that only the training portion is resampled below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Resample the training portion with SMOTE and report the class counts.
smote = SMOTE(random_state=RANDOM_SEED)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
over_class_counts = pd.Series(y_train_over).value_counts()
print('lable : ')
print(over_class_counts)
print('SMOTE train set : ', X_train_over.shape, y_train_over.shape)
lable :
0 5083503
1 5083503
Name: isFraud, dtype: int64
SMOTE train set : (10167006, 13) (10167006,)
from sklearn.preprocessing import StandardScaler
# Standardize using statistics learned from the training split only, then
# apply that same transform to the held-out split.
std_scaler_under = StandardScaler()
X_train_under_scaled = std_scaler_under.fit_transform(X_train_under)
X_test_under_scaled = std_scaler_under.transform(X_test_under)
print(X_train_under_scaled[:10])
[[-0.42507879 1.54064314 -0.30105755 -0.11212719 1.14811629 -0.19977104
-0.28031565 -0.0302337 -0.35450618 -0.86354816 -0.05796382 2.25963447
-0.64541371]
[-0.44141933 0.36421259 -0.38509374 -0.20875718 -0.7010858 -0.0492685
-0.13909182 -0.0302337 -0.35450618 -0.86354816 17.25214084 -0.44254945
-0.64541371]
[ 0.39482084 -0.1013111 0.09868026 -0.21203341 -0.34565313 -0.1812047
0.08001191 -0.0302337 -0.35450618 1.15801301 -0.05796382 -0.44254945
-0.64541371]
[-0.31563093 1.31085544 -0.38824743 -0.11636098 -0.23917395 -0.19977104
-0.28031565 -0.0302337 2.82082527 -0.86354816 -0.05796382 -0.44254945
-0.64541371]
[-0.44235006 0.45614636 -0.36612323 -0.18346092 1.70885871 -0.19977104
-0.28031565 -0.0302337 -0.35450618 -0.86354816 -0.05796382 2.25963447
-0.64541371]
[-0.39182926 0.44071992 -0.35878255 -0.21203341 0.04773663 -0.19977104
-0.28031565 -0.0302337 -0.35450618 -0.86354816 -0.05796382 -0.44254945
1.54939379]
[ 0.36728988 1.45972186 0.0826701 -0.21203341 -0.86991861 -0.19977104
-0.28031565 -0.0302337 -0.35450618 -0.86354816 -0.05796382 -0.44254945
1.54939379]
[-0.40164861 -0.34698807 -0.38825659 -0.21203341 -0.72674583 -0.02443463
-0.09957331 -0.0302337 -0.35450618 1.15801301 -0.05796382 -0.44254945
-0.64541371]
[-0.27283985 1.50068044 0.72053295 1.35339186 -0.75495367 0.5263188
0.32935342 -0.0302337 2.82082527 -0.86354816 -0.05796382 -0.44254945
-0.64541371]
[ 0.50427538 -0.23453883 0.16233166 -0.21203341 0.02459493 -0.19977104
-0.28031565 -0.0302337 -0.35450618 -0.86354816 -0.05796382 -0.44254945
1.54939379]]
from sklearn.preprocessing import StandardScaler
# Standardize the SMOTE training set and reuse its statistics for the
# untouched test split.
std_scaler_over = StandardScaler()
X_train_over_scaled = std_scaler_over.fit_transform(X_train_over)
X_test_over_scaled = std_scaler_over.transform(X_test)
from keras import backend as K
def recall(y_target, y_pred):
    """Return recall, TP / (TP + FN), computed with Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so every element counts
    as either 0 (negative) or 1 (positive).
    """
    actual = K.round(K.clip(y_target, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))

    # The element-wise product is 1 only where both are positive.
    true_positives = K.sum(actual * predicted)
    # TP + FN is simply every actual positive.
    actual_positives = K.sum(actual)

    # The epsilon keeps the division defined when there are no positives.
    return true_positives / (actual_positives + K.epsilon())
def precision(y_target, y_pred):
    """Return precision, TP / (TP + FP), computed with Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so every element counts
    as either 0 (negative) or 1 (positive).
    """
    predicted = K.round(K.clip(y_pred, 0, 1))
    actual = K.round(K.clip(y_target, 0, 1))

    # The element-wise product is 1 only where both are positive.
    true_positives = K.sum(actual * predicted)
    # TP + FP is simply every predicted positive.
    predicted_positives = K.sum(predicted)

    # The epsilon keeps the division defined when nothing is predicted positive.
    return true_positives / (predicted_positives + K.epsilon())
def f1score(y_target, y_pred):
    """Return the F1 score: the harmonic mean of recall and precision."""
    rec = recall(y_target, y_pred)
    prec = precision(y_target, y_pred)
    # The epsilon keeps the division defined when both terms are zero.
    return (2 * rec * prec) / (rec + prec + K.epsilon())
def plot_learningCurve(history, epoch):
    """Plot validation metrics and train/validation loss per epoch.

    Two figures are shown: the validation accuracy, recall, precision and
    f1score curves, then the training and validation loss curves.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping from metric name to a per-epoch
        sequence, with the keys 'val_accuracy', 'val_recall',
        'val_precision', 'val_f1score', 'loss' and 'val_loss'
        (presumably the object returned by Keras ``model.fit`` - confirm).
    epoch : int
        Maximum number of epochs to draw. If fewer values were recorded
        (for example because training stopped early), only the recorded
        ones are drawn instead of failing on an x/y length mismatch.
    """
    records = history.history

    def _draw(key):
        # Build the x-axis from the values actually present (capped at
        # `epoch`) so x and y always have the same length.
        values = records[key][:epoch]
        plt.plot(range(1, len(values) + 1), values)

    # Validation metrics.
    for key in ('val_accuracy', 'val_recall', 'val_precision', 'val_f1score'):
        _draw(key)
    plt.title('Model f1score')
    plt.ylabel('score')
    plt.xlabel('Epoch')
    plt.legend(['accuracy', 'recall', 'precision', 'f1score'], loc='upper left')
    plt.show()

    # Training and validation loss.
    for key in ('loss', 'val_loss'):
        _draw(key)
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
def print_score(y_test, y_pred):
    """Print recall, precision, accuracy and F1 for the given predictions.

    Each metric is computed and printed in turn, one per line.
    """
    rec = recall_score(y_test, y_pred)
    print(f"Recall Score: {rec}")
    prec = precision_score(y_test, y_pred)
    print(f"Precision Score: {prec}")
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy Score: {acc}")
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1}")
from sklearn.model_selection import learning_curve
def plot_acc(estimator, X, y):
    """Plot mean training and cross-validation accuracy against training-set size.

    Calls ``learning_curve`` with 5-fold cross-validation, accuracy scoring
    and 50 training-set fractions evenly spaced from 0.01 to 1.0, then
    shows one figure with both mean curves.

    Parameters
    ----------
    estimator : estimator object passed to ``learning_curve``.
    X : feature matrix.
    y : target vector.
    """
    sizes, training_scores, testing_scores = learning_curve(
        estimator, X, y, cv=5, scoring='accuracy',
        train_sizes=np.linspace(0.01, 1.0, 50),
    )

    # Collapse axis 1 to get one mean score per training-set size.
    mean_training = np.mean(training_scores, axis=1)
    mean_testing = np.mean(testing_scores, axis=1)

    # Dashed blue line: training score; green line: cross-validation score.
    plt.plot(sizes, mean_training, '--', color="b", label="Training score")
    plt.plot(sizes, mean_testing, color="g", label="Cross-validation score")

    plt.title("Accuracy")
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy Score")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
from sklearn.model_selection import GridSearchCV, ShuffleSplit, learning_curve, cross_val_score
def plot_learning_curve(estimator, x, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 20), s='neg_log_loss'):
    """Plot mean train and test score against training-set size.

    Parameters
    ----------
    estimator : estimator object passed to ``learning_curve``.
    x : feature matrix.
    y : target vector.
    cv : cross-validation setting forwarded to ``learning_curve``.
    n_jobs : parallelism setting forwarded to ``learning_curve``.
    train_sizes : training-set sizes to evaluate; the default is 20
        fractions evenly spaced from 0.1 to 1.0.
    s : scoring name forwarded to ``learning_curve``. When the name
        contains 'neg' the scores are sign-flipped so they read as a loss.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=s
    )

    # Negated scorers return -loss; flip the sign to plot the loss itself.
    negated = 'neg' in s
    if negated:
        train_scores, test_scores = -train_scores, -test_scores

    # nanmean ignores NaN entries when averaging along axis 1.
    train_scores_mean = np.nanmean(train_scores, axis=1)
    test_scores_mean = np.nanmean(test_scores, axis=1)

    plt.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124", label="Train score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff", label="Test score")
    plt.xlabel('Training size')
    # Only call the axis 'Loss' when a negated (loss-type) scorer was used;
    # otherwise label it with the scorer name.
    plt.ylabel('Loss' if negated else s)
    # Show grid lines.
    plt.grid(True)
    # Let matplotlib pick the legend position.
    plt.legend(loc="best")
    plt.show()
# SVM
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
# Coarse search over the regularization strength C with 3-fold CV.
# LinearSVC requires a strictly positive C, so 0 is not a valid candidate
# and is left out of the grid.
parameters = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100]}
grid_svm = GridSearchCV(LinearSVC(), parameters, cv=3, return_train_score=True, n_jobs=-1)
grid_svm.fit(X_train_under_scaled, y_train_under)
GridSearchCV(cv=3, estimator=LinearSVC(), n_jobs=-1,
param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0, 1, 10, 50, 100]},
return_train_score=True)
# Report the C value selected by the grid search.
best_svm_params = grid_svm.best_params_
print("Best Parameter", best_svm_params)
Best Parameter {'C': 10}
# Finer search over every integer C from 2 to 20, again with 3-fold CV.
parameters = {'C': list(range(2, 21))}
grid_svm = GridSearchCV(
    LinearSVC(),
    parameters,
    cv=3,
    return_train_score=True,
    n_jobs=-1,
)
grid_svm.fit(X_train_under_scaled, y_train_under)
GridSearchCV(cv=3, estimator=LinearSVC(), n_jobs=-1,
param_grid={'C': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20]},
return_train_score=True)
# Report the C value selected by the grid search.
best_svm_params = grid_svm.best_params_
print("Best Parameter", best_svm_params)
Best Parameter {'C': 20}
# Learning curves for the grid-searched SVM.
# NOTE(review): plot_learning_curve scores with 'neg_log_loss' by default;
# LinearSVC does not appear to provide predict_proba, so this curve may come
# out as NaN - confirm, or pass a scorer that LinearSVC supports.
plot_acc(grid_svm, X_train_under_scaled, y_train_under)
plot_learning_curve(grid_svm, X_train_under_scaled, y_train_under)

# Final SVM as a scaler + classifier pipeline.
# NOTE(review): C=13 is hard-coded here while the search above reported a
# different best value - confirm which one is intended.
svm_under = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=13))
])
# The pipeline standardizes its input itself, so it is fitted and evaluated
# on the raw (unscaled) splits rather than standardizing the data twice.
svm_under.fit(X_train_under, y_train_under)
svm_y_pred_under = svm_under.predict(X_test_under)
print_score(y_test_under, svm_y_pred_under)
Recall Score: 0.9743433109346366
Precision Score: 0.9562350119904077
Accuracy Score: 0.9650030432136336
F1 Score: 0.9652042360060515
# Accuracy and loss learning curves for the final SVM pipeline.
plot_acc(estimator=svm_under, X=X_train_under_scaled, y=y_train_under)
plot_learning_curve(estimator=svm_under, x=X_train_under_scaled, y=y_train_under)
# DecisionTree
from sklearn.tree import DecisionTreeClassifier
# Search tree-size limits (leaf count and depth) with 3-fold CV.
parameters = {'max_leaf_nodes': [10, 11, 12, 13, 14, 15], 'max_depth': [2, 3, 4, 5, 6, 7]}
grid_rf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True, n_jobs=-1)
grid_rf.fit(X_train_under_scaled, y_train_under)
print("Best Parameter", grid_rf.best_params_)

# Refit a tree with fixed hyper-parameters and a fixed seed.
tree_clf = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=14, random_state=0).fit(X_train_under_scaled, y_train_under)
# The tree was trained on standardized features, so the test features must
# be standardized as well; predicting on the raw split compares raw values
# against thresholds learned in standardized units.
tree_y_pred = tree_clf.predict(X_test_under_scaled)
print_score(y_test_under, tree_y_pred)
plot_acc(tree_clf, X_train_under_scaled, y_train_under)
plot_learning_curve(tree_clf, X_train_under_scaled, y_train_under)
GridSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_jobs=-1,
param_grid={'max_depth': [2, 3, 4, 5, 6, 7],
'max_leaf_nodes': [10, 11, 12, 13, 14, 15]},
return_train_score=True)
# Report the tree-size limits selected by the grid search.
best_tree_params = grid_rf.best_params_
print("Best Parameter", best_tree_params)
Best Parameter {'max_depth': 5, 'max_leaf_nodes': 14}
# Fit a tree with fixed hyper-parameters and a fixed seed on the
# standardized training split.
tree_clf = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=14, random_state=0).fit(X_train_under_scaled, y_train_under)
# Predict on the standardized test split: the split thresholds were learned
# in standardized units, so raw test features would be misclassified.
tree_y_pred = tree_clf.predict(X_test_under_scaled)
print_score(y_test_under, tree_y_pred)
Recall Score: 0.9993891264508247
Precision Score: 0.4980213089802131
Accuracy Score: 0.497869750456482
F1 Score: 0.6647704185290532
# Accuracy and loss learning curves for the decision tree.
plot_acc(estimator=tree_clf, X=X_train_under_scaled, y=y_train_under)
plot_learning_curve(estimator=tree_clf, x=X_train_under_scaled, y=y_train_under)
!jupyter nbconvert --to markdown "/content/drive/MyDrive/ml/cnn_under.ipynb"
[NbConvertApp] Converting notebook /content/drive/MyDrive/ml/cnn_under.ipynb to markdown
[NbConvertApp] Support files will be in cnn_under_files/
[NbConvertApp] Making directory /content/drive/MyDrive/ml/cnn_under_files
[NbConvertApp] Making directory /content/drive/MyDrive/ml/cnn_under_files
[NbConvertApp] Making directory /content/drive/MyDrive/ml/cnn_under_files
[NbConvertApp] Making directory /content/drive/MyDrive/ml/cnn_under_files
[NbConvertApp] Writing 39330 bytes to /content/drive/MyDrive/ml/cnn_under.md
고양이 사진~~~~
댓글남기기