We first use credit card fraud detection to introduce the concepts of stacking and meta labeling, and then transfer them to financial data.
'''Main'''
import numpy as np
import pandas as pd
'''Data Viz'''
import matplotlib.pyplot as plt
import seaborn as sns
#plt.style.use('seaborn')
plt.rcParams['figure.figsize'] = [16, 9]
plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.size'] = 20
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['font.family'] = 'serif'
%matplotlib inline
'''Data Prep'''
from sklearn import preprocessing as pp
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
'''Metrics'''
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
'''Algos'''
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
def view(data, num=5):
    print('The shape is', data.shape)
    # DataFrame.append is deprecated/removed in recent pandas; concatenate head and tail instead
    return pd.concat([data.head(num), data.tail(num)])
data_original = pd.read_csv('creditcard.csv')
view(data_original)
The dataset contains 28 anonymized features, one amount feature, one time feature, and one target variable (Class). It covers two days of transactions, with 492 frauds out of 284,807 transactions. The features were anonymized to protect customer privacy and are the output of a PCA transformation; only Amount and Time were not PCA-transformed.
data_original.info()
# see how the features cluster and correlate with the class
def plot_corr(data=data_original, target=data_original.Class):
    # bar plot of each feature's correlation with the target
    ax1 = data.corrwith(target).plot.bar(figsize=(20, 10),
                                         title="Correlation with class",
                                         fontsize=18, color='r',
                                         rot=45, grid=True)
    ax1.title.set_size(28)
    sns.set(style="white")
    cmap = sns.diverging_palette(220, 20, as_cmap=True)
    corr = data.corr()
    # hierarchically clustered heatmap of the feature correlation matrix
    sns.clustermap(corr, cmap=cmap,
                   linewidths=1, linecolor='w')
plot_corr();
For meta labeling we need a class-imbalanced dataset, one where precision and recall cannot both be high at the same time.
val_counts = data_original['Class'].value_counts()
ax = sns.barplot(x=val_counts.index,
                 y=val_counts / len(data_original))
ax.set(title='Frequency Percentage by Class',
       xlabel='Class',
       ylabel='Frequency Percentage');
So an ordinary algorithm can easily fool us: accuracy will be extremely high simply because roughly 99.8% of the transactions are non-frauds. Our goal, however, is to find the frauds, so that accuracy figure is misleading, and we need better ways of measuring performance!
Let's briefly introduce these metrics.
ROC and AUC are closely related, so we cover them together.
The ROC curve plots TPR on the y-axis against FPR on the x-axis, showing how TPR responds as FPR changes. AUC is the area under the ROC curve and summarizes the trade-off between TPR and FPR; the larger, the better.
Unlike the precision-recall curve, the ROC curve is more informative on balanced data.
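As a quick illustration (a minimal sketch on made-up labels and scores, not the credit card data): the AUC computed from roc_curve matches roc_auc_score.
import numpy as np
from sklearn.metrics import roc_curve, auc, roc_auc_score

# toy labels and scores, for illustration only
y_toy = np.array([0, 0, 1, 0, 1, 1, 0, 1])
score_toy = np.array([0.2, 0.3, 0.4, 0.45, 0.6, 0.7, 0.75, 0.9])

fpr_toy, tpr_toy, _ = roc_curve(y_toy, score_toy)
print(auc(fpr_toy, tpr_toy), roc_auc_score(y_toy, score_toy))  # the two AUC values are identical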
Let's start with the precision-recall curve. To evaluate a classifier we usually look at how precision and recall change as the decision threshold moves; a good classifier keeps precision high as recall increases.
The figure shows that up to roughly 40% recall no precision is sacrificed, but reaching 100% recall costs about 50% of the precision.
Average precision is a single number that summarizes the classifier's performance: the area under the precision-recall curve, i.e. precision integrated over all values of recall,
$$\int_{0}^{1} p(r)\,dr$$
In practice, the integral is replaced by a sum over discrete values,
$$\sum_{k=1}^{N} p(k)\,\Delta r(k), \qquad \text{where } \Delta r(k) \text{ is the change in recall}$$
https://sanchom.wordpress.com/tag/average-precision/
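To make the discrete sum concrete, here is a minimal sketch (again on made-up values): summing precision times the change in recall reproduces sklearn's average_precision_score.
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score

y_toy = np.array([0, 0, 1, 0, 1, 1, 0, 1])
score_toy = np.array([0.1, 0.3, 0.35, 0.4, 0.55, 0.6, 0.7, 0.9])

precision_toy, recall_toy, _ = precision_recall_curve(y_toy, score_toy)
# discrete version of the integral: sum of precision * change in recall
# (recall is returned in decreasing order, hence the minus sign)
ap_manual = -np.sum(np.diff(recall_toy) * precision_toy[:-1])
print(ap_manual, average_precision_score(y_toy, score_toy))  # the two values agree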
The F-score (also called the F-measure) is a commonly used measure of a test's accuracy and is often used to judge an algorithm's performance.
The general form is
$$F_\beta = (1+\beta^{2})\cdot\frac{\mathrm{precision}\cdot\mathrm{recall}}{\beta^{2}\cdot\mathrm{precision}+\mathrm{recall}}$$
With $\beta=1$ it becomes the F1 score. The ideal F-score is close to 1, which requires both precision and recall to be high; if both equal 1, then ${\displaystyle 2\cdot {\frac {1}{2}}=1}$, so the F-score is 1 (100%) and the algorithm has the best possible accuracy.
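A quick sanity check of the formula against sklearn (toy predictions, illustrative only):
from sklearn.metrics import fbeta_score, f1_score, precision_score, recall_score

y_true_toy = [0, 1, 1, 1, 0, 1]
y_pred_toy = [0, 1, 0, 1, 0, 1]

p = precision_score(y_true_toy, y_pred_toy)
r = recall_score(y_true_toy, y_pred_toy)
beta = 1.0
f_manual = (1 + beta**2) * p * r / (beta**2 * p + r)
# all three printed values agree
print(f_manual, fbeta_score(y_true_toy, y_pred_toy, beta=beta), f1_score(y_true_toy, y_pred_toy))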
The confusion matrix on its own is not very informative for highly imbalanced data, so we will not dwell on it.
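To see why accuracy (and a raw confusion matrix) can mislead on data this imbalanced, consider a minimal sketch with toy numbers mimicking the ~0.17% fraud rate: a "model" that never flags fraud still scores about 99.8% accuracy while catching nothing.
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

# 10,000 toy transactions with 17 frauds (~0.17%, similar to the real ratio)
y_true_toy = np.zeros(10_000, dtype=int)
y_true_toy[:17] = 1
y_pred_toy = np.zeros(10_000, dtype=int)   # predict "non-fraud" for everything

print(accuracy_score(y_true_toy, y_pred_toy))    # ~0.998, looks great
print(recall_score(y_true_toy, y_pred_toy))      # 0.0, not a single fraud caught
print(confusion_matrix(y_true_toy, y_pred_toy))  # all frauds end up as false negatives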
def metrics_summary(true_label, prediction_prob, Threshold=0.5):
    # sklearn provides all of the metric functions we need
    average_precision = average_precision_score(true_label,
                                                prediction_prob)
    fpr, tpr, thresholds = roc_curve(true_label, prediction_prob)
    areaUnderROC = auc(fpr, tpr)
    prediction_int = prediction_prob > Threshold
    accuracy = accuracy_score(true_label, prediction_int)
    print(f'accuracy: {accuracy}')
    print(f'average_precision: {average_precision}')
    print(f'areaUnderROC--AUC: {areaUnderROC} \n')
    print('*' * 60)
    print(' ' * 20, 'classification_report')
    print('*' * 60, "\n")
    print(classification_report(true_label, prediction_int))
    print('*' * 60)
    print(' ' * 20, 'confusion_matrix \n')
    print('*' * 60, "\n")
    display(confusion_matrix(true_label, prediction_int))
    print("\n")
    # precision-recall curve (left) and ROC curve (right)
    precision, recall, thresholds = precision_recall_curve(
        true_label, prediction_int)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 9))
    ax1.step(recall, precision, color='k', alpha=0.7, where='post')
    ax1.fill_between(recall, precision, step='post',
                     alpha=0.3, color='k')
    ax1.set_xlabel('Recall', fontname="Arial", fontsize=24)
    ax1.set_ylabel('Precision', fontname="Arial", fontsize=24)
    ax1.tick_params(labelsize=20)
    ax1.set_title(f'Precision-Recall curve: Average Precision={average_precision:0.2f}',
                  fontsize=24, fontname="Arial")
    ax2.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
    ax2.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
    ax2.tick_params(labelsize=20)
    ax2.set_xlabel('False Positive Rate', fontname="Arial",
                   fontsize=24)
    ax2.set_ylabel('True Positive Rate', fontname="Arial",
                   fontsize=24)
    ax2.set_title(f'areaUnderROC = {areaUnderROC:0.2f}',
                  fontsize=24, fontname="Arial")
    ax2.legend(loc="lower right", fontsize=24, fancybox=True)
# Normalize training and testing data
def scale_data(x_train, x_test=None):
    features_to_scale = x_train.copy().columns
    scaler = pp.StandardScaler()
    print(scaler.fit(x_train[features_to_scale]))
    x_train.loc[:, features_to_scale] = \
        scaler.transform(x_train[features_to_scale])
    if x_test is not None:
        # normalize the test set with the mean and std of the training set
        x_test.loc[:, features_to_scale] = \
            scaler.transform(x_test[features_to_scale])
    return x_train, x_test
# separate inputs and labels
def get_x_y(data=data_original):
    data_x = data.copy().drop(['Class', 'Time'], axis=1)
    data_y = data['Class'].copy()
    return data_x, data_y
# split into train and test data
def data_split(data_x, data_y):
    x_train, x_test, y_train, y_test = \
        train_test_split(data_x, data_y, test_size=0.25,
                         stratify=data_y, random_state=2020)
    # stratify keeps the class ratio in both splits (important for imbalanced data)
    return x_train, x_test, y_train, y_test
# put it all together
def data_process(data=data_original):
    data_x, data_y = get_x_y(data)
    x_train, x_test, y_train, y_test \
        = data_split(data_x, data_y)
    # do not touch the test data by any means!!!!
    x_train, x_test = scale_data(x_train, x_test)
    return x_train, x_test, y_train, y_test
x_train, x_test_original, y_train, y_test_original \
    = data_process(data_original)
x_train.shape, x_test_original.shape, \
    y_train.shape, y_test_original.shape
print(f'No. of fraud in test dataset: '
      f'{x_test_original[y_test_original==1].shape[0]}')
def build_model_1(x_train, y_train):
    # hyper-parameters to be tuned
    logitreg_parameters = {'C': np.power(10.0, np.arange(-9, 1)),
                           'solver': ('lbfgs', 'liblinear')}
    model_1 = LogisticRegression(
        class_weight='balanced',  # use the values of y to automatically adjust class weights
        warm_start=True,          # reuse the solution of the previous fit as initialization
        max_iter=300,             # maximum number of iterations for the solver to converge
        random_state=2020,        # so results can be reproduced
    )
    logitreg_grid = GridSearchCV(model_1, param_grid=logitreg_parameters,
                                 scoring='f1', n_jobs=1, cv=5)
    logitreg_grid.fit(x_train, y_train)
    return logitreg_grid
model_1 = build_model_1(x_train, y_train)
model_1.best_estimator_
model_1.best_score_
# predict_proba returns the probabilities of class 0 and class 1; keep class 1 (fraud)
y_pred_prob_test_1 = model_1.predict_proba(x_test_original)[:,1]
# number of fraud is 123 in test dataset
Threshold = 0.5
y_pred_int_test_1 = y_pred_prob_test_1 > Threshold
pd.Series(y_pred_int_test_1).value_counts()
metrics_summary(y_test_original, y_pred_int_test_1)
# prepare data
x_train_, x_cv, y_train_, y_cv = \
    train_test_split(x_train, y_train,
                     test_size=0.25,
                     stratify=y_train,
                     random_state=2020)
def build_model_2(x_train, y_train, x_cv, y_cv):
    # most of the parameters are left at (or near) their defaults
    params_lightGB = {
        'task': 'train',
        'application': 'binary',
        'num_class': 1,
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'metric_freq': 50,
        'is_training_metric': False,
        'max_depth': 4,
        'num_leaves': 10,
        'learning_rate': 0.01,
        'feature_fraction': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'bagging_seed': 2018,
        'verbose': -1,
        'num_threads': 16
    }
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_cv, y_cv, reference=lgb_train)
    # note: LightGBM >= 4.0 expects callbacks=[lgb.early_stopping(200),
    # lgb.log_evaluation(0)] instead of the two keyword arguments below
    model_2 = lgb.train(params_lightGB, lgb_train,
                        num_boost_round=2000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=200,
                        verbose_eval=False)
    return model_2
x_train_.shape, y_train_.shape, x_cv.shape, y_cv.shape
model_2 = build_model_2(x_train_, y_train_, x_cv, y_cv)
y_pred_prob_test_2 = model_2.predict(x_test_original)
y_pred_int_test_2 = y_pred_prob_test_2 > Threshold
pd.DataFrame(y_pred_int_test_2).value_counts()
metrics_summary(y_test_original, y_pred_int_test_2)
As expected, accuracy is very high, but recall = 77% is only mediocre.
callbacks = [EarlyStopping(monitor='loss', patience=3),
             ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                               patience=3, min_lr=0.001)]
def build_model_3(x_train, y_train, x_cv, y_cv, input_dim=29):
    model_3 = Sequential([
        Dense(input_dim=input_dim, units=32, activation='relu'),
        Dense(units=16, activation='relu'),
        Dropout(0.5),
        Dense(units=8, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ])
    model_3.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    model_3.fit(x_train, y_train,
                validation_data=(x_cv, y_cv),
                batch_size=64,
                epochs=50,
                callbacks=callbacks)
    return model_3
model_3 = build_model_3(x_train_, y_train_,
                        x_cv, y_cv, input_dim=29)
y_pred_prob_test_3 = model_3.predict(x_test_original)
y_pred_int_test_3 = y_pred_prob_test_3 > Threshold
y_pred_int_test_3.shape
pd.DataFrame(y_pred_int_test_3).value_counts()
metrics_summary(y_test_original, y_pred_int_test_3)
Now let's stack these models.
Stacking is essentially like adding new features to the original data, where the new features are the predictions of the first-level models. So let's do that feature engineering and stack all the data.
def data_stack(x, y, m_1=model_1, m_2=model_2, m_3=model_3):
    # All required parameters must be placed before any default arguments.
    '''
    x: features
    y: labels
    m_1, m_2, m_3: the 3 first-level models
    '''
    # a container to hold the predictions from all 3 models
    pred_all = pd.DataFrame(data=[], index=y.index)
    pred_1 = m_1.predict_proba(x)[:, 1]
    pred_1_df = pd.DataFrame(pred_1, index=y.index)
    pred_2 = m_2.predict(x, num_iteration=m_2.best_iteration)
    pred_2_df = pd.DataFrame(pred_2, index=y.index)
    pred_3 = m_3.predict(x).reshape(x.shape[0])  # flatten to 1D
    pred_3_df = pd.DataFrame(pred_3, index=y.index)
    # join all the predictions together
    pred_all = pred_all.join(pred_1_df.astype(float),
                             how='left', rsuffix="0") \
                       .join(pred_2_df.astype(float),
                             how='left', rsuffix="1") \
                       .join(pred_3_df.astype(float),
                             how='left', rsuffix="2")
    pred_all.columns = ['pred_1', 'pred_2', 'pred_3']
    # the final training data is the original features merged with all the predictions
    x_pred = x.merge(pred_all,
                     left_index=True, right_index=True)
    print(x_pred.shape)
    return x_pred
x_train_stack = data_stack(x_train, y_train)
x_test_stack = data_stack(x_test_original, y_test_original)
plot_corr(data=x_train_stack, target=y_train)
The predictions coming from the first-level models are highly correlated, which is no surprise. Information leaks from the 1st-level models into the 2nd-level model because they share the same training data. As long as the test set is left untouched, we actually want more information flowing into the 2nd-level model so that it predicts better.
Let's continue processing the stacked data.
# normalize the stacked training and testing data
x_train_stack, x_test_stack = scale_data(x_train_stack, x_test_stack)
# split the training data into train and validation sets
x_train_stack_, x_cv_stack, y_train_, y_cv_ = \
    train_test_split(x_train_stack, y_train,
                     test_size=0.25,
                     stratify=y_train,
                     random_state=2020)
# stratify keeps each class's percentage the same as in the full data
x_train_stack_.shape, x_cv_stack.shape, y_train_.shape, y_cv_.shape
Note:
If the individual algorithms all perform well, combining them usually does even better; if one of them is far stronger than the rest, combining adds little and is at best only as good as the strongest one.
model_2_stack = build_model_2(x_train_stack_, y_train_, x_cv_stack, y_cv_)
y_pred_prob_test_2_stack = model_2_stack.predict(x_test_stack)
y_pred_int_test_2_stack = y_pred_prob_test_2_stack > Threshold
pd.DataFrame(y_pred_int_test_2_stack).value_counts()
metrics_summary(y_test_original, y_pred_int_test_2_stack)
Recall improves from 0.77 to 0.85, and the F1 score and average precision also improve somewhat.
model_3_stack = build_model_3(x_train_stack_, y_train_, \
x_cv_stack, y_cv_, input_dim=32)
y_pred_prob_test_3_stack = model_3_stack.predict(x_test_stack)
y_pred_int_test_3_stack = y_pred_prob_test_3_stack > Threshold
y_pred_int_test_3_stack.shape
pd.DataFrame(y_pred_int_test_3_stack).value_counts()
metrics_summary(y_test_original, y_pred_int_test_3_stack)
Precision increases, but recall drops.
model_1_stack = build_model_1(x_train_stack, y_train)
model_1_stack.best_score_
Actually, not bad at all.
y_pred_prob_test_1_stack = model_1_stack.predict_proba(x_test_stack)[:,1]  # keep the class-1 probability
y_pred_int_test_1_stack = y_pred_prob_test_1_stack > Threshold
pd.Series(y_pred_int_test_1_stack).value_counts()
metrics_summary(y_test_original, y_pred_int_test_1_stack)
After stacking, logistic regression improves somewhat over its earlier results, and it is no worse than lightGBM and the DNN.
Since meta labeling requires adding new features to both the inputs and the labels, the earlier code needs a few adjustments.
def data_meta(id, x, y, model):
    # get the prediction from the primary model
    Threshold = 0.5
    pred_prob_meta = model.predict_proba(x)[:, 1]
    pred_prob_meta = pd.Series(pred_prob_meta,
                               index=x.index,
                               name=f'pred_{id}_meta')
    pred_int_meta = pred_prob_meta > Threshold
    # meta label: 1 only when the true label AND the primary prediction are both positive
    y_meta = pd.Series(y & pred_int_meta, name=f'y_train_meta_{id}')
    x_meta = x.join(pred_int_meta)
    return x_meta, y_meta
x_train_meta_1, y_train_meta_1 = \
data_meta(1, x_train, y_train, model_1)
x_train_meta_1.shape, y_train_meta_1.shape
plot_corr(x_train_meta_1, y_train_meta_1);
As we can see, the prediction coming from the first-level model is strongly correlated with the label.
# test data
x_test_meta_1, y_test_meta_1 = \
data_meta(1, x_test_original, y_test_original, model_1)
x_test_meta_1.shape, y_test_meta_1.shape
x_train_meta_1, x_test_meta_1 = scale_data( \
x_train_meta_1, x_test_meta_1)
x_train_meta_1_, x_cv_meta_1, y_train_meta_1_, y_cv_meta_1 = \
train_test_split(x_train_meta_1, y_train_meta_1,
test_size=0.25,
stratify=y_train_meta_1,
random_state=2020)
# stratify keeps each class's percentage the same as in the full data
x_train_meta_1_.shape, x_cv_meta_1.shape, y_train_meta_1_.shape, y_cv_meta_1.shape
model_2_meta_1 = build_model_2( \
x_train_meta_1_, y_train_meta_1_, x_cv_meta_1, y_cv_meta_1)
y_pred_prob_test_2_meta_1 = model_2_meta_1.predict(x_test_meta_1)
y_pred_int_test_2_meta_1 = y_pred_prob_test_2_meta_1 > Threshold
pd.DataFrame(y_pred_int_test_2_meta_1).value_counts()
After getting predictions from the meta model, we combine them with the primary model's predictions.
final_pred_2_meta_1 = y_pred_int_test_2_meta_1 & y_pred_int_test_1
pd.DataFrame(final_pred_2_meta_1).value_counts()
metrics_summary(y_test_original, final_pred_2_meta_1)
Overall things look much the same, but breaking the result down by metric shows that precision and recall are now more balanced.
#if you receive an error message, try to run the data process again.
model_3_meta_1 = build_model_3( \
x_train_meta_1_, y_train_meta_1_, \
x_cv_meta_1, y_cv_meta_1, input_dim=30)
y_pred_prob_test_3_meta_1 = model_3_meta_1.predict(x_test_meta_1)
y_pred_int_test_3_meta_1 = y_pred_prob_test_3_meta_1 > Threshold
pd.DataFrame(y_pred_int_test_3_meta_1).value_counts()
# combine the meta prediction with primary prediction
final_pred_3_meta_1 = y_pred_int_test_3_meta_1.flatten() & y_pred_int_test_1
final_pred_3_meta_1.shape
metrics_summary(y_test_original, final_pred_3_meta_1)
Precision seems higher, at the cost of a slightly lower recall.
def data_meta_2(id, x, y, m_1, m_2):
    '''
    id: the id of the new columns
    x: input features
    y: labels
    m_1: primary model 1, here logistic regression
    m_2: primary model 2
    '''
    pred_prob_meta_1 = m_1.predict_proba(x)[:, 1]
    pred_prob_meta_1 = pd.Series(pred_prob_meta_1,
                                 index=x.index,
                                 name=f'pred_{id}_meta')
    pred_int_meta_1 = pred_prob_meta_1 > Threshold
    pred_prob_meta_2 = m_2.predict(x)
    # the DNN returns a 2D prediction that must be flattened to 1D
    # before it can be combined
    pred_prob_meta_2 = pd.Series(pred_prob_meta_2.flatten(),
                                 index=x.index,
                                 name=f'pred_{id+1}_meta')
    pred_int_meta_2 = pred_prob_meta_2 > Threshold
    # meta label: 1 only when the true label and both primary predictions are positive
    y_meta = pd.Series(y & pred_int_meta_1 & pred_int_meta_2,
                       name=f'y_train_meta_{id}')
    x_meta = x.join(pred_int_meta_1).join(pred_int_meta_2)
    return x_meta, y_meta
# meta_1_2: meta data built from model 1 and model 2
x_train_meta_1_2, y_train_meta_1_2 = \
data_meta_2(1, x_train, y_train, model_1, model_2)
x_test_meta_1_2, y_test_meta_1_2 = \
data_meta_2(1, x_test_original, y_test_original, model_1, model_2)
x_train_meta_1_2, x_test_meta_1_2 = \
scale_data(x_train_meta_1_2, x_test_meta_1_2)
x_train_meta_1_2_, x_cv_meta_1_2, y_train_meta_1_2_, y_cv_meta_1_2 = \
train_test_split(x_train_meta_1_2, y_train_meta_1_2,
test_size=0.25,
stratify=y_train_meta_1_2,
random_state=2020)
# stratify keeps each class's percentage the same as in the full data
x_train_meta_1_2_.shape, x_cv_meta_1_2.shape, \
y_train_meta_1_2_.shape, y_cv_meta_1_2.shape
model_3_meta_1_2 = build_model_3( \
x_train_meta_1_2_, y_train_meta_1_2_, \
x_cv_meta_1_2, y_cv_meta_1_2, input_dim=31)
y_pred_prob_test_3_meta_1_2 = model_3_meta_1_2.predict(x_test_meta_1_2)
y_pred_int_test_3_meta_1_2 = y_pred_prob_test_3_meta_1_2 > Threshold
pd.DataFrame(y_pred_int_test_3_meta_1_2).value_counts()
# combine the meta prediction with primary prediction
final_pred_3_meta_1_2 = \
y_pred_int_test_3_meta_1_2.flatten() & \
y_pred_int_test_1 & y_pred_int_test_2
pd.Series(final_pred_3_meta_1_2).value_counts()
The final combination step changes nothing here; let's look at the detailed metrics.
metrics_summary(y_test_original, y_pred_int_test_3_meta_1_2)
Taken together the result is still decent, but recall is a little low.
Since lightGBM seems to work better as the 2nd (secondary) model, let's try that instead.
# meta_1_3: meta data built from model 1 and model 3
#process the train dataset
x_train_meta_1_3, y_train_meta_1_3 = \
data_meta_2(1, x_train, y_train, model_1, model_3)
#meta_1_3: meta data from 1st model and 3rd model
#process the test dataset
x_test_meta_1_3, y_test_meta_1_3 = \
data_meta_2(1, x_test_original, y_test_original, model_1, model_3)
#normalize the dataset
x_train_meta_1_3, x_test_meta_1_3 = \
scale_data(x_train_meta_1_3, x_test_meta_1_3)
#do a train, validation split
x_train_meta_1_3_, x_cv_meta_1_3, y_train_meta_1_3_, y_cv_meta_1_3 = \
train_test_split(x_train_meta_1_3, y_train_meta_1_3,
test_size=0.25,
stratify=y_train_meta_1_3,
random_state=2020)
model_2_meta_1_3 = build_model_2( \
x_train_meta_1_3_, y_train_meta_1_3_, \
x_cv_meta_1_3, y_cv_meta_1_3)
y_pred_prob_test_2_meta_1_3 = model_2_meta_1_3.predict(x_test_meta_1_3)
y_pred_int_test_2_meta_1_3 = y_pred_prob_test_2_meta_1_3 > Threshold
# combine the meta prediction with primary prediction
final_pred_2_meta_1_3 = \
y_pred_int_test_2_meta_1_3 & \
y_pred_int_test_1 & y_pred_int_test_3.flatten()
pd.Series(final_pred_2_meta_1_3).value_counts()
metrics_summary(y_test_original, final_pred_2_meta_1_3)
Precision reaches 93% and recall 81%, better than everything before!
We know that stacking and meta labeling act a bit like feature engineering, adding new features to the original training set. But are these new features really more important than the original ones? It is worth checking: LightGBM's feature_importance() method lets us see how important each feature is.
def plot_feature_importance(model, X, importance_type='split'):
    # 'split': total number of times the feature is used to split across all trees
    # 'gain': total gain contributed by the splits on the feature across all trees
    feature_imp = pd.DataFrame({'Value': model.feature_importance(importance_type),
                                'Feature': X.columns})
    f, ax = plt.subplots(figsize=(40, 30))
    ax.set_title(f'LightGBM Feature Importance by {importance_type}',
                 fontsize=75, fontname="Arial")
    ax.set_xlabel('Features', fontname="Arial", fontsize=70)
    ax.set_ylabel('Importance', fontname="Arial", fontsize=70)
    ax.tick_params(labelsize=50)
    sns.barplot(x="Value", y="Feature",
                data=feature_imp.sort_values(by="Value",
                                             ascending=False), ax=ax)
plot_feature_importance(model_2_meta_1_3, x_train_meta_1_3_)
plot_feature_importance(model_2_meta_1_3, x_train_meta_1_3_, 'gain')
In both cases the meta features turn out to be far more important than the original features.
plot_feature_importance(model_2_meta_1, x_train_meta_1_)
plot_feature_importance(model_2_meta_1, x_train_meta_1_, 'gain')
We know there is information leakage from the primary models into the secondary model, but that is exactly what we want, as long as the integrity of the test set is preserved.
If, instead, information leaks from the test set into the training set, the effect is amplified; a DNN is especially good at exploiting that flaw to score unrealistically well.
data_original.columns[:-1]
#normalize all the data in one go.
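# NOTE: fitting the scaler on the full dataset before splitting leaks test-set
# statistics into the training data -- the test-to-train leakage discussed above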
features_to_scale = data_original.columns[1:-1]
scaler = pp.StandardScaler()
data_original.loc[:, features_to_scale] = scaler.fit_transform(data_original[features_to_scale])
#split training and testing dataset afterwards.
x_train_cv, x_test, y_train_cv, y_test \
= train_test_split(data_original.loc[:, features_to_scale], data_original.Class, test_size=0.25,\
stratify=data_original.Class,random_state=2020)
Labeling financial asset data is a two-step process:
Determine the base label y_base, using the triple-barrier method introduced in the earlier post on that topic.
Determine the meta label y_meta, i.e. whether to actually trade in the direction of the position (a small sketch follows below).
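As a minimal, hypothetical sketch of step 2 (the names y_base, side, and meta_label are my own, and y_base is assumed to come from the triple-barrier step with values in {-1, 0, 1}): the meta label is 1 when acting on the primary model's predicted side would have been right, and 0 otherwise.
import pandas as pd

def meta_label(y_base: pd.Series, side: pd.Series) -> pd.Series:
    """y_meta = 1 when the primary model's side agrees with the realized
    base label (the trade would have paid off), else 0."""
    return ((y_base * side) > 0).astype(int)

# toy example: base labels from the triple-barrier step and predicted sides
y_base = pd.Series([1, -1, 0, 1, -1])
side = pd.Series([1, 1, 1, -1, -1])
print(meta_label(y_base, side).tolist())  # [1, 0, 0, 0, 1]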
By analogy with the example above, we first build a primary model (remember to push recall as high as possible) to predict the direction of the position; but raising the true positives also raises the false positives, which is the last thing we want to see in trading (a predicted trade that turns out wrong loses real money).
We then apply meta labels to the positives predicted by the primary model and build a secondary model to improve precision. The main purpose of this model is to filter, one more time, the opportunities that have already been selected.
Things get interesting when the secondary model is a machine learning model: an ML classifier returns not only the class but also the probability of that class. The higher the probability, the more confident the prediction, so why not increase the position size accordingly when trading?
Meta labeling can be viewed as a secondary model, and its beauty is that you can layer it on top of any primary model, whether that is
a machine learning model,
an econometric formula,
fundamental analysis,
technical analysis,
or a human's discretionary view.
Meta labeling also brings several practical advantages: for example, because the secondary model only decides whether (and how heavily) to act on the primary model's signal rather than the direction itself, the damage from overfitting is contained and the primary model can remain a simple, interpretable one.
To sum up the experiments above, the best result used meta labeling with lightGBM as the secondary model and logistic regression plus the DNN as the primary models.
For financial asset data, meta labeling means that once a first model has determined the direction of a position, a second model is used to determine the size of that position.
The predicted class probability is a by-product of any machine-learning classifier; in scikit-learn it is obtained with predict_proba().
Suppose a random forest predicts a probability p for taking the trade. In actual trading, one possible decision rule is sketched below.
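The concrete rule is not spelled out here; purely as an illustration (following the bet-sizing idea in López de Prado's Advances in Financial Machine Learning, not necessarily what the original text had in mind), one can map the predicted probability p to a position size through a z-statistic and the Gaussian CDF:
import numpy as np
from scipy.stats import norm

def bet_size_from_prob(p, eps=1e-6):
    """Map the predicted probability p of the 'take the trade' class to a
    position size in [0, 1]; size is 0 at p = 0.5 and grows with confidence.
    (One possible rule only -- a hypothetical illustration.)"""
    p = np.clip(p, eps, 1 - eps)
    z = (p - 0.5) / np.sqrt(p * (1 - p))  # test statistic against p = 0.5
    return 2 * norm.cdf(z) - 1

for p in [0.5, 0.55, 0.7, 0.9]:
    print(p, round(bet_size_from_prob(p), 3))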
When the primary model is a human's subjective view and the secondary model is driven by objective data, this style of investing is called quantamental investing (Quantitative + Fundamental).
Quantamental refers to an investment strategy that combines quantitative approaches using computers, mathematical models, and big data with fundamental methods that analyze individual company cash flows, growth, and risk to generate better risk-adjusted returns.
Quantamental investing is thus a fusion of fundamental and quantitative investing, combining computer algorithms with human analysis into a new style of investing where 1 + 1 > 2.