import pandas as pd
import numpy as np
import warnings
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from scipy import stats
from dateutil.relativedelta import relativedelta
import sklearn as skr
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
import warnings
warnings.filterwarnings('ignore')
# 设置数据集路径
dataset_path = 'Dataset/'
# 读取数据
data_balance = pd.read_csv(dataset_path+'user_balance_table.csv')
# 为数据集添加时间戳
data_balance['date'] = pd.to_datetime(data_balance['report_date'], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday
data_balance.head()
# 聚合时间数据
total_balance = data_balance.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum().reset_index()
# 生成测试集区段数据(时间区段为2014/9/1之后)
start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,1):
temp = [start, np.nan, np.nan]
testdata.append(temp)
start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns
# 拼接数据集
total_balance = pd.concat([total_balance, testdata], axis = 0)
# 为数据集添加时间戳
total_balance['day'] = total_balance['date'].dt.day
total_balance['month'] = total_balance['date'].dt.month
total_balance['year'] = total_balance['date'].dt.year
total_balance['week'] = total_balance['date'].dt.week
total_balance['weekday'] = total_balance['date'].dt.weekday
import matplotlib.pylab as plt
# 画出每日总购买与赎回量的时间序列图
fig = plt.figure(figsize=(20,6))
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],label='purchase')
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],label='redeem')
plt.legend(loc='best')
plt.title("The lineplot of total amount of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 画出4月份以后的时间序列图
total_balance_1 = total_balance[total_balance['date'] >= np.datetime64(datetime.date(2014,4,1))]
fig = plt.figure(figsize=(20,6))
plt.plot(total_balance_1['date'], total_balance_1['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_1['date'], total_balance_1['total_redeem_amt'], label = 'redeem')
plt.legend(loc = 'best')
plt.title("The lineplot of total amount of Purchase and Redeem from April.14 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 分别画出每个月中每天购买赎回量的时间序列图
fig = plt.figure(figsize=(15,15))
plt.subplot(4,1,1)
plt.title("The time series of total amount of Purchase and Redeem for August, July, June, May respectively")
total_balance_2 = total_balance[total_balance['date'] >= np.datetime64(datetime.date(2014,8,1))]
plt.plot(total_balance_2['date'], total_balance_2['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_2['date'], total_balance_2['total_redeem_amt'], label = 'redeem')
plt.legend()
total_balance_3 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,7,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,8,1)))]
plt.subplot(4,1,2)
plt.plot(total_balance_3['date'], total_balance_3['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_3['date'], total_balance_3['total_redeem_amt'], label = 'redeem')
plt.legend()
total_balance_4 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,6,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,7,1)))]
plt.subplot(4,1,3)
plt.plot(total_balance_4['date'], total_balance_4['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_4['date'], total_balance_4['total_redeem_amt'], label = 'redeem')
plt.legend()
total_balance_5 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,1)))]
plt.subplot(4,1,4)
plt.plot(total_balance_5['date'], total_balance_5['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_5['date'], total_balance_5['total_redeem_amt'], label = 'redeem')
plt.legend()
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 分别画出13年8月与9月每日购买赎回量的时序图
fig = plt.figure(figsize=(15,9))
total_balance_last8 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,8,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,1)))]
plt.subplot(2,1,1)
plt.plot(total_balance_last8['date'], total_balance_last8['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_last8['date'], total_balance_last8['total_redeem_amt'], label = 'redeem')
plt.legend()
total_balance_last9 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,10,1)))]
plt.subplot(2,1,2)
plt.plot(total_balance_last9['date'], total_balance_last9['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_last9['date'], total_balance_last9['total_redeem_amt'], label = 'redeem')
plt.legend()
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 画出每个翌日的数据分布于整体数据的分布图
plt.figure(figsize=(15,10))
scatter_para = {'marker':'.', 's':3, 'alpha':0.3}
line_kws = {'color':'k'}
plt.subplot(2,2,1)
plt.title('The distrubution of total purchase')
sns.violinplot(x='weekday', y='total_purchase_amt', data = total_balance_1, scatter_kws=scatter_para, line_kws=line_kws)
plt.subplot(2,2,2)
plt.title('The distrubution of total purchase')
sns.distplot(total_balance_1['total_purchase_amt'].dropna())
plt.subplot(2,2,3)
plt.title('The distrubution of total redeem')
sns.violinplot(x='weekday', y='total_redeem_amt', data = total_balance_1, scatter_kws=scatter_para, line_kws=line_kws)
plt.subplot(2,2,4)
plt.title('The distrubution of total redeem')
sns.distplot(total_balance_1['total_redeem_amt'].dropna())
# 按翌日对数据聚合后取均值
week_sta = total_balance_1[['total_purchase_amt', 'total_redeem_amt', 'weekday']].groupby('weekday', as_index=False).mean()
# 分析翌日的中位数特征
plt.figure(figsize=(12, 5))
ax = plt.subplot(1,2,1)
plt.title('The barplot of average total purchase with each weekday')
ax = sns.barplot(x="weekday", y="total_purchase_amt", data=week_sta, label='Purchase')
ax.legend()
ax = plt.subplot(1,2,2)
plt.title('The barplot of average total redeem with each weekday')
ax = sns.barplot(x="weekday", y="total_redeem_amt", data=week_sta, label='Redeem')
ax.legend()
# 画出翌日的箱型图 (箱形图最大的优点就是不受异常值的影响,可以以一种相对稳定的方式描述数据的离散分布情况。)
plt.figure(figsize=(12, 5))
ax = plt.subplot(1,2,1)
plt.title('The boxplot of total purchase with each weekday')
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=total_balance_1,palette = 'RdBu')
ax = plt.subplot(1,2,2)
plt.title('The boxplot of total redeem with each weekday')
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=total_balance_1, palette = 'RdBu')
# 使用OneHot方法将翌日特征划分,获取划分后特征
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
total_balance = total_balance.reset_index()
week_feature = encoder.fit_transform(total_balance['weekday'].ravel().reshape(-1, 1)).toarray()
# list to array --> https://zhuanlan.zhihu.com/p/261636002
# reshape是在不改变数据内容的情况下,改变一个数组的格式,参数及返回值,reshape(-1,1)令计算机计算出展开所有的行数,然后转换。
week_feature = pd.DataFrame(week_feature,columns=['weekday_onehot']*len(week_feature[0])) # 字符串复制
feature = pd.concat([total_balance, week_feature], axis = 1)[['total_purchase_amt', 'total_redeem_amt','weekday_onehot','date']]
feature.columns = list(feature.columns[0:2]) + [x+str(i) for i,x in enumerate(feature.columns[2:-1])] + ['date']
feature.head()
# 画出划分后翌日特征与标签的斯皮尔曼相关性
f, ax = plt.subplots(figsize = (20, 6))
plt.subplot(1,2,1)
plt.title('The spearman coleration between total purchase and each weekday')
sns.heatmap(feature[[x for x in feature.columns if x not in ['total_redeem_amt', 'date'] ]].corr('spearman'),linewidths = 0.1, vmax = 0.2, vmin=-0.2)
plt.subplot(1,2,2)
plt.title('The spearman coleration between total redeem and each weekday')
sns.heatmap(feature[[x for x in feature.columns if x not in ['total_purchase_amt', 'date'] ]].corr('spearman'),linewidths = 0.1, vmax = 0.2, vmin=-0.2)
# 测试翌日特征与标签的独立性 Ref: https://github.com/ChuanyuXue/MVTest
# pip install mvtest
from mvtpy.mvtest import mvtest
mv = mvtest()
mv.test(total_balance_1['total_purchase_amt'], total_balance_1['weekday'])
# 画出每个月的购买总量分布估计图(kdeplot)
plt.figure(figsize=(15,10))
plt.title('The Probability Density of total purchase amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
for i in range(7, 12):
sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,i+1,1)))]['total_purchase_amt'],label='13年'+str(i)+'月')
for i in range(1, 9):
sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14年'+str(i)+'月')
# 画出每个月的赎回总量分布估计图(kdeplot)
plt.figure(figsize=(15,10))
plt.title('The Probability Density of total redeem amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
for i in range(7, 12):
sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,i+1,1)))]['total_redeem_amt'],label='13年'+str(i)+'月')
for i in range(1, 9):
sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14年'+str(i)+'月')
# 画出14年五六七八月份的分布估计图
plt.figure(figsize=(12,10))
ax = plt.subplot(2,1,1)
plt.title('The Probability Density of total purchase and redeem amount from May.14 to August.14')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_2['total_purchase_amt'],label='八月')
ax = sns.kdeplot(total_balance_3['total_purchase_amt'],label='七月')
ax = sns.kdeplot(total_balance_4['total_purchase_amt'],label='六月')
ax = sns.kdeplot(total_balance_5['total_purchase_amt'],color='Black',label='五月')
ax = plt.subplot(2,1,2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_2['total_redeem_amt'],label='八月')
ax = sns.kdeplot(total_balance_3['total_redeem_amt'],label='七月')
ax = sns.kdeplot(total_balance_4['total_redeem_amt'],label='六月')
ax = sns.kdeplot(total_balance_5['total_redeem_amt'],color='Black',label='五月')
# 画出13年八月到九月份的分布估计图
total_balance_last_7 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,7,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,8,1)))]
total_balance_last_8 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,8,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,1)))]
total_balance_last_9 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,10,1)))]
total_balance_last_10 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,10,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,11,1)))]
plt.figure(figsize=(12,10))
ax = plt.subplot(2,1,1)
plt.title('The Probability Density of total purchase and redeem amount from Aug.13 to Sep.13')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_purchase_amt'],label='August')
ax = sns.kdeplot(total_balance_last_7['total_purchase_amt'],label='July')
ax = sns.kdeplot(total_balance_last_9['total_purchase_amt'],color='Red',label='September')
ax = plt.subplot(2,1,2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_redeem_amt'],label='August')
ax = sns.kdeplot(total_balance_last_7['total_redeem_amt'],label='July')
ax = sns.kdeplot(total_balance_last_9['total_redeem_amt'],color='Red',label='September')
ax = sns.kdeplot(total_balance_last_10['total_redeem_amt'],color='Black',label='Novermber')
# 按照每天聚合数据集
day_sta = total_balance_2[['total_purchase_amt', 'total_redeem_amt', 'day']].groupby('day', as_index=False).mean()
# 获取聚合后每月购买分布的柱状图
plt.figure(figsize = (12,5))
ax = sns.barplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase',palette='cividis')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', color = 'red')
ax.legend()
plt.title("The total Purchase in Aug.14")
# 获取聚合后每月赎回分布的柱状图
plt.figure(figsize = (12,5))
ax = sns.barplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', palette='cividis')
ax = sns.lineplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', color = 'red')
ax.legend()
plt.title("The total Redeem in Aug.14")
# 画出13年九月份的分布图
plt.figure(figsize=(12,10))
day_sta = total_balance_last_9[['total_purchase_amt', 'total_redeem_amt', 'day']].groupby('day', as_index=False).mean()
plt.subplot(2,1,1)
plt.title("The total Purchase in Sep.13")
ax = sns.barplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', palette= 'cividis')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', color = 'red')
plt.subplot(2,1,2)
plt.title("The total Redeem in Sep.13")
bx = sns.barplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', palette= 'cividis')
bx = sns.lineplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', color = 'red')
bx.legend()
We find that the data from last year in Sep has very limited week feature
There are some strange day in Sep:
# 画出历史所有天的热力图
test = np.zeros((max(total_balance_1['week']) - min(total_balance_1['week']) + 1, 7))
test[total_balance_1['week'] - min(total_balance_1['week']), total_balance_1['weekday']] = total_balance_1['total_purchase_amt']
f, ax = plt.subplots(figsize = (10, 4))
sns.heatmap(test,linewidths = 0.1, ax=ax)
ax.set_title("Purchase")
ax.set_xlabel('weekday')
ax.set_ylabel('week')
test = np.zeros((max(total_balance_1['week']) - min(total_balance_1['week']) + 1, 7))
test[total_balance_1['week'] - min(total_balance_1['week']), total_balance_1['weekday']] = total_balance_1['total_redeem_amt']
f, ax = plt.subplots(figsize = (10, 4))
sns.heatmap(test,linewidths = 0.1, ax=ax)
ax.set_title("Redeem")
ax.set_xlabel('weekday')
ax.set_ylabel('week')
From the heat map we find that the data of week 4 and weekday 6 is very strange, and week 12 weekday 2 either
# 对于热力图中异常点的数据分析.1
total_balance_1[(total_balance_1['week'] == 4 + min(total_balance_1['week'])) & (total_balance_1['weekday'] == 6)]
2014-5-4 is a special day in China, It is the first workday after the Labour day!
# 对于热力图中异常点的数据分析.2
total_balance_1[(total_balance_1['week'] == 12 + min(total_balance_1['week'])) & (total_balance_1['weekday'] == 2)]
In 2016-06-25 They Redeemed a lot but Purchase a little:
Others
# 获取节假日的数据
qingming = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,5))) & (total_balance['date'] < np.datetime64(datetime.date(2014,4,8)))]
labour = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,5,4)))]
duanwu = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,31))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,3)))]
data618 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,6,10))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,20)))]
# 画出节假日与平时的均值
fig = plt.figure(figsize = (12,5))
index_list = ['清明','劳动节','端午','618','平均']
label_list = [np.mean(qingming['total_purchase_amt']), np.mean(labour['total_purchase_amt']),np.mean(duanwu['total_purchase_amt']),np.mean(data618['total_purchase_amt']),np.mean(total_balance_1['total_purchase_amt'])]
plt.bar(index_list, label_list, label="Purchase")
index_list = ['清明.','劳动.','端午.','618.','平均.']
label_list = [np.mean(qingming['total_redeem_amt']), np.mean(labour['total_redeem_amt']),np.mean(duanwu['total_redeem_amt']),np.mean(data618['total_redeem_amt']),np.mean(total_balance_1['total_redeem_amt'])]
plt.bar(index_list, label_list, label="Redeem")
plt.title("The average of different holiday")
plt.ylabel("Amount")
plt.legend()
plt.show()
# 画出节假日购买量与其所处翌日的对比
plt.figure(figsize = (12,5))
import numpy as np
import matplotlib.pyplot as plt
size = 4
x = np.arange(size)
total_width, n = 0.8, 2
width = total_width / n
x = x - (total_width - width) / 2
a = [176250006, 167825284, 162844282,321591063]
b = [225337516, 241859315, 225337516,307635449]
plt.bar(x, a, width=width, label='Holiday_Purchase')
plt.bar(x + width, b, width=width, label='Normal_Purchase')
plt.xticks(x + width / 2, ('清明', '劳动节', '端午', '618'))
plt.title("节假日购买量与其所处翌日的对比")
plt.legend()
plt.show()
# 画出节假日赎回量与其所处翌日的对比
plt.figure(figsize = (12,5))
import numpy as np
import matplotlib.pyplot as plt
size = 4
x = np.arange(size)
total_width, n = 0.8, 2
width = total_width / n
x = x - (total_width - width) / 2
a = [159914308, 154717620, 154366940,291016763]
b = [235439685, 240364238, 235439685,313310347]
plt.bar(x, a, width=width, label='Holiday_Redeem')
plt.bar(x + width, b, width=width, label='Normal_Redeem')
plt.xticks(x + width / 2, ('清明', '劳动节', '端午', '618'))
plt.title("节假日赎回量与其所处翌日的对比")
plt.legend()
plt.show()
# 画出清明节与周边日期的时序图
plt.figure(figsize = (12,5))
qingming_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,4,13)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=qingming_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=qingming_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=qingming, ax = ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=qingming, ax=ax)
plt.title("清明节与周边日期的时序图")
ax.legend()
# 画出劳动节与周边日期的时序图
plt.figure(figsize = (12,5))
labour_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,25))) & (total_balance['date'] < np.datetime64(datetime.date(2014,5,10)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=labour_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=labour_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=labour, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=labour, ax=ax)
plt.title("劳动节与周边日期的时序图")
ax.legend()
# 画出端午节与周边日期的时序图
plt.figure(figsize = (12,5))
duanwu_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,25))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,7)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=duanwu_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=duanwu_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=duanwu, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=duanwu, ax=ax)
plt.title("端午节与周边日期的时序图")
ax.legend()
# 画出中秋与周边日期的时序图
plt.figure(figsize = (12,5))
zhongqiu = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,19))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,22)))]
zhongqiu_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,14))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,28)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=zhongqiu_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=zhongqiu_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=zhongqiu, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=zhongqiu, ax=ax)
plt.title("中秋与周边日期的时序图(in 2013)")
ax.legend()
# 画出用户交易纪录的箱型图
sns.boxplot(data_balance['total_purchase_amt'])
plt.title("The abnormal value of total purchase")
# 对于购买2e8的用户的交易行为分析
data_balance[data_balance['user_id'] == 14592].sort_values(by = 'total_redeem_amt',axis = 0,ascending = False).head()
1311 Bought 2E Seal 1E
1312 Seal 1E
1405 Bought 0.9E
1406 Seal 1E
# 画出单笔交易为2e8的那天的总交易量及附近几天的交易量
plt.figure(figsize = (12,5))
e2 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,11,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,11,10)))]
ax = sns.barplot(x="day", y="total_purchase_amt", data=e2, label='2E')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=e2, label='2E')
plt.title("The influence of the big deal with 200 million purchasing(Red Bar)")
ax.legend()
# 画出每日单笔最大交易的时序图
plt.figure(figsize=(20, 6))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max(), label='最大购买')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max(), label='最大赎回')
plt.title("The Biggest deal happend in each day")
# 画出每日单笔最大交易以及总交易额的时序图
plt.figure(figsize=(25, 8))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max(), label='MAX_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max(), label='MAX_REDEEM')
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).sum(), label='TOTAL_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).sum(), label='TOTAL_REDEEM')
# 画出每个月大额交易的频次直方图
plt.figure(figsize=(12, 5))
big_frequancy = data_balance[(data_balance['total_purchase_amt'] > 10000000) | (data_balance['total_redeem_amt'] > 10000000)][['month','year','user_id']].groupby(['year','month'], as_index=False).count()
big_frequancy['i'] = big_frequancy['year'] + big_frequancy['month'] / 100
ax = sns.barplot(x="i", y="user_id", data=big_frequancy)
plt.title("The frequency of super big deal(larger than 100 million) in each month")
# 获取大额交易的数据集
data_balance['big_purchase'] = 0
data_balance.loc[data_balance['total_purchase_amt'] > 1000000, 'big_purchase'] = 1
data_balance['big_redeem'] = 0
data_balance.loc[data_balance['total_redeem_amt'] > 1000000, 'big_redeem'] = 1
# 对大额交易按每天做聚合操作
big_purchase = data_balance[data_balance['big_purchase'] == 1].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
small_purchase = data_balance[data_balance['big_purchase'] == 0].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
big_redeem = data_balance[data_balance['big_redeem'] == 1].groupby(['date'], as_index=False)['total_redeem_amt'].sum()
small_redeem = data_balance[data_balance['big_redeem'] == 0].groupby(['date'], as_index=False)['total_redeem_amt'].sum()
# 画出大额交易与小额交易的时序分布图
fig = plt.figure(figsize=(20,6))
plt.plot(big_purchase['date'], big_purchase['total_purchase_amt'],label='big_purchase', color = 'red')
plt.plot(big_redeem['date'], big_redeem['total_redeem_amt'],label='big_redeem', color = 'orange')
plt.plot(small_purchase['date'], small_purchase['total_purchase_amt'],label='small_purchase', color = 'blue')
plt.plot(small_redeem['date'], small_redeem['total_redeem_amt'],label='small_redeem', color = 'dodgerblue')
plt.legend(loc='best')
plt.title("The time series of big deal of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 画出大额交易与小额交易的分布估计图
plt.figure(figsize=(15,10))
plt.subplot(2,2,1)
for i in range(4, 9):
sns.kdeplot(big_purchase[(big_purchase['date'] >= np.datetime64(datetime.date(2014,i,1))) & (big_purchase['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14Y,'+str(i)+'M')
plt.title('BIG PURCHASE')
plt.subplot(2,2,2)
for i in range(4, 9):
sns.kdeplot(small_purchase[(small_purchase['date'] >= np.datetime64(datetime.date(2014,i,1))) & (small_purchase['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14Y,'+str(i)+'M')
plt.title('SMALL PURCHASE')
plt.subplot(2,2,3)
for i in range(4, 9):
sns.kdeplot(big_redeem[(big_redeem['date'] >= np.datetime64(datetime.date(2014,i,1))) & (big_redeem['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14Y,'+str(i)+'M')
plt.title('BIG REDEEM')
plt.subplot(2,2,4)
for i in range(4, 9):
sns.kdeplot(small_redeem[(small_redeem['date'] >= np.datetime64(datetime.date(2014,i,1))) & (small_redeem['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14Y,'+str(i)+'M')
plt.title('SMALL REDEEM')
# 添加时间戳
big_purchase['weekday'] = big_purchase['date'].dt.weekday
small_purchase['weekday'] = small_purchase['date'].dt.weekday
big_redeem['weekday'] = big_redeem['date'].dt.weekday
small_redeem['weekday'] = small_redeem['date'].dt.weekday
# 分析大额小额的翌日分布
plt.figure(figsize=(15, 10))
ax = plt.subplot(2,2,1)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=big_purchase[big_purchase['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('BIG PURCHASE')
ax = plt.subplot(2,2,2)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=big_redeem[big_redeem['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('BIG REDEEM')
ax = plt.subplot(2,2,3)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=small_purchase[small_purchase['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('SMALL PURCHASE')
ax = plt.subplot(2,2,4)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=small_redeem[small_redeem['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('SMALL REDEEM')
# 截断数据集
data_balance_1 = data_balance[data_balance['date'] > datetime.datetime(2014,4,1)]
# 画出用户交易纪录表中其他变量与标签的相关性图
plt.figure(figsize = (12,5))
feature = ['total_purchase_amt','total_redeem_amt', 'report_date', 'tBalance', 'yBalance',
'direct_purchase_amt', 'purchase_bal_amt', 'purchase_bank_amt',
'consume_amt', 'transfer_amt', 'tftobal_amt',
'tftocard_amt', 'share_amt']
sns.heatmap(data_balance_1[feature].corr(), linewidths = 0.1)
plt.title("The coleration between each feature in User_Balance_Table")
# 读取银行利率并添加时间戳
bank = pd.read_csv(dataset_path + "mfd_bank_shibor.csv")
bank = bank.rename(columns = {'mfd_date': 'date'})
bank_features = [x for x in bank.columns if x not in ['date']]
bank['date'] = pd.to_datetime(bank['date'], format= "%Y%m%d")
bank['day'] = bank['date'].dt.day
bank['month'] = bank['date'].dt.month
bank['year'] = bank['date'].dt.year
bank['week'] = bank['date'].dt.week
bank['weekday'] = bank['date'].dt.weekday
# 读取支付宝利率并添加时间戳
share = pd.read_csv(dataset_path + 'mfd_day_share_interest.csv')
share = share.rename(columns = {'mfd_date': 'date'})
share_features = [x for x in share.columns if x not in ['date']]
share['date'] = pd.to_datetime(share['date'], format= "%Y%m%d")
share['day'] = share['date'].dt.day
share['month'] = share['date'].dt.month
share['year'] = share['date'].dt.year
share['week'] = share['date'].dt.week
share['weekday'] = share['date'].dt.weekday
# 画出上一天银行及支付宝利率与标签的相关性图
bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration between each lastday bank rate and total purchase")
temp = pd.merge(bank[['last_date']+bank_features], total_balance, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
plt.subplot(1,3,3)
plt.title("The coleration between each lastday bank rate and total redeem")
temp = pd.merge(bank[['last_date']+bank_features], total_balance, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
# 画出上一星期银行及支付宝利率与标签的相关性图
bank['last_week'] = bank['week'] + 1
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration between each last week bank rate and total purchase")
temp = pd.merge(bank[['last_week','weekday']+bank_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
plt.subplot(1,3,3)
plt.title("The coleration between each last week bank rate and total redeem")
temp = pd.merge(bank[['last_week','weekday']+bank_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
# 分别画出上一星期银行及支付宝利率与大额小额数据的相关性图
bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration of Small Rate purchase")
temp = pd.merge(bank[['last_date']+bank_features], small_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
plt.subplot(1,3,3)
plt.title("The coleration of Small Rate redeem")
temp = pd.merge(bank[['last_date']+bank_features], small_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration of Big Rate purchase")
temp = pd.merge(bank[['last_date']+bank_features], big_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
plt.subplot(1,3,3)
plt.title("The coleration of Big Rate redeem")
temp = pd.merge(bank[['last_date']+bank_features], big_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
# 画出银行利率的时序图
plt.figure(figsize=(15,5))
for i in bank_features:
plt.plot(bank['date'], bank[[i]] ,label=i)
plt.legend()
plt.title("The time series of bank rate")
plt.xlabel("Time")
plt.ylabel("Rate")
# 画出部分银行利率与购买量的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],'b',label="Interest_3_M")
plt.plot(bank['date'], bank['Interest_6_M'],'cyan',label="Interest_6_M")
plt.plot(bank['date'], bank['Interest_9_M'],'skyblue',label="Interest_9_M")
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],'g',label="Total purchase")
plt.legend(loc=2)
plt.title("The time series of bank rate and purchase")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")
plt.show()
# 画出部分银行利率与赎回量的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],'b',label="Interest_3_M")
plt.plot(bank['date'], bank['Interest_6_M'],'cyan',label="Interest_6_M")
plt.plot(bank['date'], bank['Interest_9_M'],'skyblue',label="Interest_9_M")
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],'g',label="Total redeem")
plt.legend(loc=2)
plt.title("The time series of bank rate and redeem")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")
plt.show()
# 画出支付宝利率与标签的相关性图
share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_date']+share_features], total_balance, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)
plt.subplot(1,3,3)
temp = pd.merge(share[['last_date']+share_features], total_balance, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)
# 画出银行利率与标签的相关性图
share['last_week'] = share['week'] + 1
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_week','weekday']+share_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)
plt.subplot(1,3,3)
temp = pd.merge(share[['last_week','weekday']+share_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)
# 画出支付宝利率与购买量的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
for i in share_features:
plt.plot(share['date'], share[i],'b',label=i)
break
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],'g',label="Total purchase")
plt.legend(loc=2)
plt.show()
# 画出支付宝利率与赎回量的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
for i in share_features:
plt.plot(share['date'], share[i],'b',label=i)
break
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],'g',label="Total redeem")
plt.legend(loc=2)
plt.show()
# 画出大额小额数据与支付宝利率的相关性图
share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_date']+share_features], small_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)
plt.title("SMALL PURCHASE")
plt.subplot(1,3,3)
plt.title("SMALL REDEEM")
temp = pd.merge(share[['last_date']+share_features], small_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)
share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("BIG PURCHASE")
temp = pd.merge(share[['last_date']+share_features], big_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)
plt.subplot(1,3,3)
plt.title("BIG REDEEM")
temp = pd.merge(share[['last_date']+share_features], big_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)
# 画出银行利率与支付宝利率的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],c='g',label= 'BANK')
plt.legend()
ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],label='SHARE')
plt.legend(loc=2)
plt.show()
It seems that:
前面的分析可以看出,某些用户的交易额很大,对于日交易量很明显,这里统计四月份后依旧活跃的大额用户(大于100万)
# 获得大额用户的集合
temp = data_balance[(data_balance['total_purchase_amt'] >= 1000000) | (data_balance['total_redeem_amt'] >= 1000000)]
big_users_set = set(temp[temp['date'] >= datetime.datetime(2014,4,1)]['user_id'])
len(big_users_set)
max(data_balance['user_id'])
len(big_users_set)/max(data_balance['user_id'])*100
大额用户占3903个,占总数的13.92%
# 标记大额用户
data_balance['big_user'] = 0
data_balance.loc[data_balance['user_id'].isin(big_users_set), 'big_user'] = 1
# 统计大额用户与小额用户的日总交易额的区别
total_balance_bigNsmall = data_balance.groupby(['date','big_user'],
as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
# 画出大额用户与小额用户交易的日总交易量图
fig = plt.figure(figsize=(20,8))
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_purchase_amt'],label='big_purchase')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_redeem_amt'],label='big_redeem')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_purchase_amt'],label='small_purchase')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_redeem_amt'],label='small_redeem')
plt.legend(loc='best')
plt.title("The time series of big and small user of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 统计大额小额用户购买量占比
np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_purchase_amt']) / np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_purchase_amt'])
# 统计大额小额用户赎回量占比
np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_redeem_amt']) / np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_redeem_amt'])
# 画出非0交易的分布图
frequency = data_balance[(data_balance['direct_purchase_amt'] != 0) | (data_balance['total_redeem_amt'] != 0)][['user_id','tBalance']].groupby('user_id', as_index=False).count()
sns.distplot(frequency['tBalance'])
# 获取频繁交易用户集合
hot_users_set = set(frequency[frequency['tBalance'] > 30]['user_id'])
# 获取频繁用户的交易纪录
data_balance['is_hot_users'] = 0
data_balance.loc[data_balance['user_id'].isin(hot_users_set) , 'is_hot_users'] = 1
# 统计频繁用户与非频繁用户的日总交易额的区别
total_balance_hotNcold = data_balance.groupby(['date','is_hot_users'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
# 绘制频繁用户与非频繁用户总购买赎回量的时序图
fig = plt.figure(figsize=(20,8))
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['total_purchase_amt'],label='hot_purchase')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['total_redeem_amt'],label='hot_redeem')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['total_purchase_amt'],label='cold_purchase')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['total_redeem_amt'],label='cold_redeem')
plt.legend(loc='best')
plt.title("The time series of big and small user of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 画出冷启动用户与老用户交易的箱型图
temp = data_balance[['year','month','user_id','total_purchase_amt','total_redeem_amt']].groupby(['year','month','user_id'], as_index=False).sum()
user_old_set = set()
plt.figure(figsize=(10,30))
for i in range(1, 9):
newset = set(temp[(temp['year'] == 2014) & (temp['month'] == i)]['user_id'])
this_month = data_balance[(data_balance['year'] == 2014) & (data_balance['month'] == i)]
this_month['cold'] = 0
this_month.loc[this_month['user_id'].isin(newset - user_old_set), 'cold'] = 1
plt.subplot(4,2,i)
plt.title('This month : ' + str(i))
sns.boxplot(x="cold", y="total_purchase_amt" , data=this_month[(this_month['direct_purchase_amt'] != 0) | (this_month['total_redeem_amt'] != 0)])
user_old_set = user_old_set | newset
plt.show()
# 用户的其他属性
users = pd.read_csv('Dataset/user_profile_table.csv')
users.head()
# 添加城市、星座、性别
data_balance = pd.merge(data_balance, users, on='user_id')
# 统计每个城市用户的日总交易额的区别并绘制分布估计图
fig = plt.figure(figsize=(10,5))
for i in np.unique(data_balance['city']):
temp = data_balance.groupby(['date','city'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
ax = sns.kdeplot( temp[temp['city'] == i]['total_purchase_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of different city of Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
fig = plt.figure(figsize=(10,5))
for i in np.unique(data_balance['city']):
temp = data_balance.groupby(['date','city'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
ax = sns.kdeplot( temp[temp['city'] == i]['total_redeem_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of different city of Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 统计每个性别用户的日总交易额的区别,并绘制时序图
temp = data_balance.groupby(['date','sex'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
fig = plt.figure(figsize=(20,6))
plt.plot(temp[temp['sex'] == 1]['date'], temp[temp['sex'] == 1]['total_purchase_amt'],label='Male')
plt.plot(temp[temp['sex'] == 0]['date'], temp[temp['sex'] == 0]['total_purchase_amt'],label='Female')
plt.legend(loc='best')
plt.title("The time series of two gender user of Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
fig = plt.figure(figsize=(20,6))
plt.plot(temp[temp['sex'] == 1]['date'], temp[temp['sex'] == 1]['total_purchase_amt'],label='Male')
plt.plot(temp[temp['sex'] == 0]['date'], temp[temp['sex'] == 0]['total_redeem_amt'],label='Female')
plt.legend(loc='best')
plt.title("The time series of two gender user of Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
data_balance['constellation']
# 统计每个星座用户的日总交易额的区别 并绘制分布估计图
fig = plt.figure(figsize=(12,5))
for i in np.unique(data_balance['constellation']):
temp = data_balance.groupby(['date','constellation'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
ax = sns.kdeplot( temp[temp['constellation'] == i]['total_purchase_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of small deal of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
fig = plt.figure(figsize=(12,5))
for i in np.unique(data_balance['constellation']):
temp = data_balance.groupby(['date','constellation'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
ax = sns.kdeplot( temp[temp['constellation'] == i]['total_redeem_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of small deal of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 统计每个性别用户的日总交易额的区别
temp = data_balance.groupby(['date'], as_index=False)['direct_purchase_amt','share_amt'].sum()
# 画出每日利息的增长/直接购买量的时序图
fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['share_amt'] / temp['direct_purchase_amt'] ,label='Rate')
plt.legend(loc='best')
plt.title("The rate of Share / Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 加载支付宝利率数据
share = pd.read_csv('Dataset/mfd_day_share_interest.csv')
share = share.rename(columns = {'mfd_date': 'date'})
share_features = [x for x in share.columns if x not in ['date']]
share['date'] = pd.to_datetime(share['date'], format= "%Y%m%d")
share['day'] = share['date'].dt.day
share['month'] = share['date'].dt.month
share['year'] = share['date'].dt.year
share['week'] = share['date'].dt.week
share['weekday'] = share['date'].dt.weekday
# 绘制支付宝利率与交易额的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(temp['date'], temp['share_amt'],'b',label="Share_amt")
plt.legend()
ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],'g',label="Share rate")
plt.legend(loc=2)
plt.title("The correlation between share rate and share amount")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")
plt.show()
# 支付宝利率与每日利息的增长/直接购买量的时序图
fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(temp['date'], temp['share_amt'] / temp['direct_purchase_amt'],'b',label="Share_amt / Direct_amt")
plt.legend()
ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],'g',label="Share rate")
plt.legend(loc=2)
plt.title("The correlation between share rate and Share/Purchase")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")
plt.show()
直接购买里面有两种
# 聚合两种不同购买方式
temp = data_balance.groupby(['date'], as_index=False)['purchase_bal_amt','purchase_bank_amt'].sum()
# 画出不同购买方式日购买量的时序图
fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['purchase_bal_amt'],label='Bal')
plt.plot(temp['date'], temp['purchase_bank_amt'],label='Bank')
plt.legend(loc='best')
plt.title("The purchase of bal and bank")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
# 画出不同赎回方式日赎回量的时序图
temp = data_balance.groupby(['date'], as_index=False)['tftobal_amt','tftocard_amt'].sum()
fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['tftobal_amt'],label='Bal')
plt.plot(temp['date'], temp['tftocard_amt'],label='Bank')
plt.legend(loc='best')
plt.title("The redeem of bal and bank")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
It seems that:
# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
data_balance = pd.read_csv(path)
data_balance = add_timestamp(data_balance)
return data_balance.reset_index(drop=True)
# add tiemstamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
data_balance = data.copy()
data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday
return data_balance.reset_index(drop=True)
# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
df_tmp = data.copy()
df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()
df_tmp.reset_index(inplace=True)
return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)
# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
total_balance = data.copy()
start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,15):
temp = [start, np.nan, np.nan]
testdata.append(temp)
start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns
total_balance = pd.concat([total_balance, testdata], axis = 0)
total_balance = total_balance.reset_index(drop=True)
return total_balance.reset_index(drop=True)
# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
return pd.read_csv(path)
# 载入数据
balance_data = load_data('Dataset/user_balance_table.csv')
balance_data = add_timestamp(balance_data)
total_balance = get_total_balance(balance_data, date = '2014-03-01')
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')
# 创建数据的深层拷贝
data = total_balance.copy()
# 定义生成时间序列规则预测结果的方法
def generate_base(df: pd.DataFrame, month_index: int)->pd.DataFrame:
# 选中固定时间段的数据集
total_balance = df.copy()
total_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]
total_balance = total_balance[(total_balance['date'] >= datetime.datetime(2014,3,1)) & (total_balance['date'] < datetime.datetime(2014, month_index, 1))]
# 加入时间戳
total_balance['weekday'] = total_balance['date'].dt.weekday
total_balance['day'] = total_balance['date'].dt.day
total_balance['week'] = total_balance['date'].dt.week
total_balance['month'] = total_balance['date'].dt.month
# 统计翌日因子
mean_of_each_weekday = total_balance[['weekday']+['total_purchase_amt','total_redeem_amt']].groupby('weekday',as_index=False).mean()
for name in ['total_purchase_amt','total_redeem_amt']:
mean_of_each_weekday = mean_of_each_weekday.rename(columns={name: name+'_weekdaymean'})
mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])
mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])
# 合并统计结果到原数据集
total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')
# 分别统计翌日在(1~31)号出现的频次
weekday_count = total_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()
weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')
# 依据频次对翌日因子进行加权,获得日期因子
weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / len(np.unique(total_balance['month']))
weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / len(np.unique(total_balance['month']))
day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()
# 将训练集中所有日期的均值剔除日期残差得到base
day_mean = total_balance[['day'] + ['total_purchase_amt','total_redeem_amt']].groupby('day',as_index=False).mean()
day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']
# 生成测试集数据
for index, row in day_pre.iterrows():
if month_index in (2,4,6,9) and row['day'] == 31:
break
day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))
# 基于base与翌日因子获得最后的预测结果
day_pre['weekday'] = day_pre.date.dt.weekday
day_pre = day_pre[['date','weekday']+['total_purchase_amt','total_redeem_amt']]
day_pre = pd.merge(day_pre, mean_of_each_weekday,on='weekday')
day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']
day_pre = day_pre.sort_values('date')[['date']+['total_purchase_amt','total_redeem_amt']]
return day_pre
# 生成预测结果(以及残差)
base_list = []
for i in range(4, 10):
base_list.append(generate_base(data, i).reset_index(drop=True))
base = pd.concat(base_list).reset_index(drop=True)
for i in ['total_purchase_amt','total_redeem_amt']:
base = base.rename(columns={i: i+'_base'})
data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)
data['purchase_residual'] = data['total_purchase_amt'] / data['total_purchase_amt_base']
data['redeem_residual'] = data['total_redeem_amt'] / data['total_redeem_amt_base']
# 对结果表重命名
data = data[['date','purchase_residual','redeem_residual','total_purchase_amt_base', 'total_redeem_amt_base']]
for i in data.columns:
if i == 'date':
data[i] = data[i].astype(str)
data[i] = data[i].str.replace('-','')
data.columns = [['date'] + ['total_purchase_amt','total_redeem_amt'] + ['total_purchase_predicted_by_cycle','total_redeem_predicted_by_cycle'] ]
# 保存预测结果到本地
data.to_csv('Dataset/base.csv',index=False)
library(plyr)
library(dplyr)
library(forecast)
library(data.table)
library(tseries)
options(warn=-1)
user_balance=read.csv("Dataset/user_balance_table.csv")
user_balance$report_date=as.Date(as.character(user_balance$report_date),format="%Y%m%d")
temp=order(user_balance$report_date)
user_balance=user_balance[temp,]
data=ddply(user_balance,.(report_date),function(D){colwise(sum)(D[,c(-1,-2)])})
train_data = data[274:396,]
test_data = data[397:427,]
head(train_data)
head(test_data)
get_score<- function(purchase_pred,redeem_pred,purchase_true,redeem_true,h=0.3){
out = data.frame(purchase_pred ,redeem_pred,purchase_true,redeem_true )
out$purchase_mape = abs(out$purchase_pred-out$purchase_true)/
out$purchase_true
out$redeem_mape = abs(out$redeem_pred-out$redeem_true)/
out$redeem_true
score = sum(exp(-out$purchase_mape/h)*10)* 0.45 +
sum(exp(-out$redeem_mape/h)*10)* 0.55
return(score)
}
# purchase
purchase_ts=ts(train_data$total_purchase_amt,
frequency=7,start=c(1,1)) ## 生成时间序列
purchase_stl_res = stl(purchase_ts,s.window="periodic",robust=TRUE) ## stl分解
purchase_pred_test = forecast(purchase_stl_res,h = 31)$mean ## 得到测试集预测结果
# redeem
redeem_ts = ts(train_data$total_redeem_amt,
frequency=7,start=c(1,1))
redeem_stl_res = stl(redeem_ts,s.window="periodic",robust=TRUE)
redeem_pred_test = forecast(redeem_stl_res,h = 31)$mean
get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
test_data$total_redeem_amt)
purchase_del_ts=ts(purchase_ts-purchase_stl_res$time.series[,1],
frequency=7,start=c(1,1))
plot(purchase_del_ts)
acf(purchase_del_ts)
adf.test(purchase_del_ts)
for(i in 1:2){print(Box.test(purchase_del_ts,type = "Box-Pierce",
lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(purchase_del_ts,type = "Ljung-Box",
lag = i*6)$p.value)}
pacf(purchase_del_ts)
acf(purchase_del_ts)
# redeem
redeem_del_ts=ts(redeem_ts-redeem_stl_res$time.series[,1],
frequency=7,start=c(1,1))
plot(redeem_del_ts)
acf(redeem_del_ts)
adf.test(redeem_del_ts)
plot(diff(redeem_del_ts)) # 差分后的时序图
acf(diff(redeem_del_ts))
adf.test(diff(redeem_del_ts))
for(i in 1:2){print(Box.test(diff(redeem_del_ts),type = "Box-Pierce",
lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(diff(redeem_del_ts),type = "Ljung-Box",
lag = i*6)$p.value)}
pacf(diff(redeem_del_ts))#拖尾
acf(diff(redeem_del_ts)) #截尾
# purchase
purchase_del_fit = arima(purchase_del_ts/1000,order = c(1,0,1))
purchase_del_fit
# 显著性检验
for(i in 1:2){print(Box.test(purchase_del_fit$residuals,type = "Ljung-Box",
lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(purchase_del_fit$residuals,type = 'Box-Pierce',
lag = i*6)$p.value)}
purchase_pred_test = forecast(purchase_stl_res$time.series[,1],31)$mean+
predict(purchase_del_fit,n.ahead = 31)$pred*1000
# redeem
redeem_del_fit = arima(redeem_del_ts,order = c(0,1,1))
# 显著性检验
for(i in 1:2){print(Box.test(redeem_del_fit$residuals,
type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(redeem_del_fit$residuals,
type = 'Box-Pierce',lag = i*6)$p.value)}
redeem_pred_test = predict(redeem_del_fit,n.ahead = 31)$pred+
forecast(redeem_stl_res$time.series[,1],h=31)$mean
get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
test_data$total_redeem_amt)
#####去除周期后 auto.arima
#####purchase======
autoFit = auto.arima(purchase_del_ts/1000,d=0,seasonal = F,trace = T )
for(i in 1:2){print(Box.test(autoFit$residuals,type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(autoFit$residuals,type = 'Box-Pierce',lag = i*6)$p.value)}
purchase_pred_test = forecast(purchase_stl_res$time.series[,1],31)$mean+
forecast(autoFit,h=31)$mean*1000
autoFit = auto.arima(redeem_del_ts,d=1,seasonal = F,trace = TRUE)
###模型显著性检验====
for(i in 1:2){print(Box.test(autoFit$residuals,type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(autoFit$residuals,type = 'Box-Pierce',lag = i*6)$p.value)}
redeem_pred_test = forecast(autoFit,h=31)$mean+
forecast(redeem_stl_res$time.series[,1],h=31)$mean
get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
test_data$total_redeem_amt)
# prophet预测
holidays_and_effect_days <- data_frame(
holiday = 'holidays_and_effect_days',
ds = as.Date(c('2014-04-03', '2014-04-04',
'2014-04-05', '2014-04-06', '2014-04-07',
'2014-04-08', '2014-04-09',
'2014-04-29', '2014-04-30',
'2014-05-01', '2014-05-02', '2014-05-03',
'2014-05-04', '2014-05-05',
'2014-05-29', '2014-05-30',
'2014-05-31', '2014-06-01', '2014-06-02',
'2014-06-03', '2014-06-04')),
lower_window = 0,
upper_window = 0
)
holidays <- data_frame(
holiday = 'true_holidays',
ds = as.Date(c('2014-04-05', '2014-04-06', '2014-04-07',
'2014-05-01', '2014-05-02', '2014-05-03',
'2014-05-31', '2014-06-01', '2014-06-02')),
lower_window = 0,
upper_window = 0
)
holidays <- bind_rows(holidays_and_effect_days, holidays)
###purchase=========
Myprophet_purchase = prophet::prophet(changepoint_prior_scale=0.5,weekly_seasonality=T,
holidays= holidays)
df_purchase = data.frame(train_data$report_date ,train_data$total_purchase_amt )
colnames(df_purchase) <-c('ds','y')
Myprophet_purchase = prophet::fit.prophet(Myprophet_purchase,df_purchase)
future_purchase = prophet::make_future_dataframe(Myprophet_purchase,periods=31,
include_history=F)
forecast_purchase = predict(Myprophet_purchase, future_purchase)
purchase_pred_test = forecast_purchase$yhat
###redeem=====
Myprophet_redeem = prophet::prophet(changepoint_prior_scale=0.5,weekly_seasonality=True,
holidays= holidays)
df_redeem = data.frame(train_data$report_date ,train_data$total_redeem_amt )
colnames(df_redeem) <-c('ds','y')
df_redeem
Myprophet_redeem = prophet::fit.prophet(Myprophet_redeem,df_redeem)
future_redeem = prophet::make_future_dataframe(Myprophet_redeem,periods=31,
include_history=F)
forecast_redeem = predict(Myprophet_redeem, future_redeem)
redeem_pred_test = forecast_redeem$yhat
get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
test_data$total_redeem_amt)