资金流入流出预测-挑战Baseline (上）¶

数据分析（1）¶

数据准备
时间序列分析
翌日特征分析
月份特诊分析
日期特征分析
节假日分析
节假日周围日期分析
异常值分析
用户交易纪录表中其他变量相关分析
银行、支付宝利率分析

数据分析（2）¶

大小额用户分析
用户交易频次分析
用户其他属性分析
其他分析
小结

时间序列规则¶

辅助函数整合
时间序列规则设定
结果预测并保存

时间序列建模分析（R)¶

数据准备
评估方式
平稳性分析
ARIMA建模&检验
prophet模型预测

1. 数据准备¶

import pandas as  pd
import numpy as np
import warnings 
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import datetime 
from scipy import stats
from dateutil.relativedelta import relativedelta
import sklearn as skr

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

import warnings
warnings.filterwarnings('ignore')

# 设置数据集路径

dataset_path = 'Dataset/'

# 读取数据

data_balance = pd.read_csv(dataset_path+'user_balance_table.csv')

# 为数据集添加时间戳

data_balance['date'] = pd.to_datetime(data_balance['report_date'], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday

data_balance.head()

2. 时间序列分析¶

# 聚合时间数据
total_balance = data_balance.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum().reset_index()

# 生成测试集区段数据(时间区段为2014/9/1之后)

start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,1):
    temp = [start, np.nan, np.nan]
    testdata.append(temp)
    start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns

# 拼接数据集

total_balance = pd.concat([total_balance, testdata], axis = 0)

# 为数据集添加时间戳

total_balance['day'] = total_balance['date'].dt.day
total_balance['month'] = total_balance['date'].dt.month
total_balance['year'] = total_balance['date'].dt.year
total_balance['week'] = total_balance['date'].dt.week
total_balance['weekday'] = total_balance['date'].dt.weekday

import matplotlib.pylab as plt

# 画出每日总购买与赎回量的时间序列图

fig = plt.figure(figsize=(20,6))
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],label='purchase')
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],label='redeem')

plt.legend(loc='best')
plt.title("The lineplot of total amount of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 画出4月份以后的时间序列图

total_balance_1 = total_balance[total_balance['date'] >= np.datetime64(datetime.date(2014,4,1))]
fig = plt.figure(figsize=(20,6))
plt.plot(total_balance_1['date'], total_balance_1['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_1['date'], total_balance_1['total_redeem_amt'], label = 'redeem')
plt.legend(loc = 'best')
plt.title("The lineplot of total amount of Purchase and Redeem from April.14 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 分别画出每个月中每天购买赎回量的时间序列图

fig = plt.figure(figsize=(15,15))

plt.subplot(4,1,1)
plt.title("The time series of total amount of Purchase and Redeem for August, July, June, May respectively")

total_balance_2 = total_balance[total_balance['date'] >= np.datetime64(datetime.date(2014,8,1))]
plt.plot(total_balance_2['date'], total_balance_2['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_2['date'], total_balance_2['total_redeem_amt'], label = 'redeem')
plt.legend()


total_balance_3 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,7,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,8,1)))]
plt.subplot(4,1,2)
plt.plot(total_balance_3['date'], total_balance_3['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_3['date'], total_balance_3['total_redeem_amt'], label = 'redeem')
plt.legend()


total_balance_4 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,6,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,7,1)))]
plt.subplot(4,1,3)
plt.plot(total_balance_4['date'], total_balance_4['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_4['date'], total_balance_4['total_redeem_amt'], label = 'redeem')
plt.legend()


total_balance_5 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,1)))]
plt.subplot(4,1,4)
plt.plot(total_balance_5['date'], total_balance_5['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_5['date'], total_balance_5['total_redeem_amt'], label = 'redeem')
plt.legend()

plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 分别画出13年8月与9月每日购买赎回量的时序图

fig = plt.figure(figsize=(15,9))

total_balance_last8 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,8,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,1)))]
plt.subplot(2,1,1)
plt.plot(total_balance_last8['date'], total_balance_last8['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_last8['date'], total_balance_last8['total_redeem_amt'], label = 'redeem')
plt.legend()

total_balance_last9 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,10,1)))]
plt.subplot(2,1,2)
plt.plot(total_balance_last9['date'], total_balance_last9['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_last9['date'], total_balance_last9['total_redeem_amt'], label = 'redeem')
plt.legend()

plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

3. 翌日特征分析¶

# 画出每个翌日的数据分布于整体数据的分布图

plt.figure(figsize=(15,10))
scatter_para = {'marker':'.', 's':3, 'alpha':0.3}
line_kws = {'color':'k'}
plt.subplot(2,2,1)
plt.title('The distrubution of total purchase')
sns.violinplot(x='weekday', y='total_purchase_amt', data = total_balance_1, scatter_kws=scatter_para, line_kws=line_kws)
plt.subplot(2,2,2)
plt.title('The distrubution of total purchase')
sns.distplot(total_balance_1['total_purchase_amt'].dropna())
plt.subplot(2,2,3)
plt.title('The distrubution of total redeem')
sns.violinplot(x='weekday', y='total_redeem_amt', data = total_balance_1, scatter_kws=scatter_para, line_kws=line_kws)
plt.subplot(2,2,4)
plt.title('The distrubution of total redeem')
sns.distplot(total_balance_1['total_redeem_amt'].dropna())

<matplotlib.axes._subplots.AxesSubplot at 0x1f713054a48>

# 按翌日对数据聚合后取均值

week_sta = total_balance_1[['total_purchase_amt', 'total_redeem_amt', 'weekday']].groupby('weekday', as_index=False).mean()

# 分析翌日的中位数特征

plt.figure(figsize=(12, 5))
ax = plt.subplot(1,2,1)
plt.title('The barplot of average total purchase with each weekday')
ax = sns.barplot(x="weekday", y="total_purchase_amt", data=week_sta, label='Purchase')
ax.legend()
ax = plt.subplot(1,2,2)
plt.title('The barplot of average total redeem with each weekday')
ax = sns.barplot(x="weekday", y="total_redeem_amt", data=week_sta, label='Redeem')
ax.legend()

<matplotlib.legend.Legend at 0x1f712f600c8>

# 画出翌日的箱型图 （箱形图最大的优点就是不受异常值的影响，可以以一种相对稳定的方式描述数据的离散分布情况。）

plt.figure(figsize=(12, 5))
ax = plt.subplot(1,2,1)
plt.title('The boxplot of total purchase with each weekday')
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=total_balance_1,palette = 'RdBu')
ax = plt.subplot(1,2,2)
plt.title('The boxplot of total redeem with each weekday')
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=total_balance_1, palette = 'RdBu')

# 使用OneHot方法将翌日特征划分，获取划分后特征

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
total_balance = total_balance.reset_index()
week_feature = encoder.fit_transform(total_balance['weekday'].ravel().reshape(-1, 1)).toarray() 
# list to array --> https://zhuanlan.zhihu.com/p/261636002
# reshape是在不改变数据内容的情况下，改变一个数组的格式，参数及返回值，reshape(-1,1)令计算机计算出展开所有的行数，然后转换。
week_feature = pd.DataFrame(week_feature,columns=['weekday_onehot']*len(week_feature[0])) # 字符串复制
feature = pd.concat([total_balance, week_feature], axis = 1)[['total_purchase_amt', 'total_redeem_amt','weekday_onehot','date']]
feature.columns = list(feature.columns[0:2]) + [x+str(i) for i,x in enumerate(feature.columns[2:-1])] + ['date']

feature.head()

# 画出划分后翌日特征与标签的斯皮尔曼相关性

f, ax = plt.subplots(figsize = (20, 6))
plt.subplot(1,2,1)
plt.title('The spearman coleration between total purchase and each weekday')
sns.heatmap(feature[[x for x in feature.columns if x not in ['total_redeem_amt', 'date'] ]].corr('spearman'),linewidths = 0.1, vmax = 0.2, vmin=-0.2)
plt.subplot(1,2,2)
plt.title('The spearman coleration between total redeem and each weekday')
sns.heatmap(feature[[x for x in feature.columns if x not in ['total_purchase_amt', 'date'] ]].corr('spearman'),linewidths = 0.1,  vmax = 0.2, vmin=-0.2)

<matplotlib.axes._subplots.AxesSubplot at 0x1f712e34b08>

# 测试翌日特征与标签的独立性 Ref: https://github.com/ChuanyuXue/MVTest

# pip install mvtest
from mvtpy.mvtest import mvtest
mv = mvtest()
mv.test(total_balance_1['total_purchase_amt'], total_balance_1['weekday'])

{'Tn': 6.75, 'p-value': [0, 0.01]}

4. 月份特征分析¶

# 画出每个月的购买总量分布估计图(kdeplot)

plt.figure(figsize=(15,10))
plt.title('The Probability Density of total purchase amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
for i in range(7, 12):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,i+1,1)))]['total_purchase_amt'],label='13年'+str(i)+'月')
for i in range(1, 9):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14年'+str(i)+'月')

# 画出每个月的赎回总量分布估计图(kdeplot)

plt.figure(figsize=(15,10))
plt.title('The Probability Density of total redeem amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
for i in range(7, 12):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,i+1,1)))]['total_redeem_amt'],label='13年'+str(i)+'月')
for i in range(1, 9):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14年'+str(i)+'月')

# 画出14年五六七八月份的分布估计图

plt.figure(figsize=(12,10))

ax = plt.subplot(2,1,1)
plt.title('The Probability Density of total purchase and redeem amount from May.14 to August.14')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_2['total_purchase_amt'],label='八月')
ax = sns.kdeplot(total_balance_3['total_purchase_amt'],label='七月')
ax = sns.kdeplot(total_balance_4['total_purchase_amt'],label='六月')
ax = sns.kdeplot(total_balance_5['total_purchase_amt'],color='Black',label='五月')
ax = plt.subplot(2,1,2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_2['total_redeem_amt'],label='八月')
ax = sns.kdeplot(total_balance_3['total_redeem_amt'],label='七月')
ax = sns.kdeplot(total_balance_4['total_redeem_amt'],label='六月')
ax = sns.kdeplot(total_balance_5['total_redeem_amt'],color='Black',label='五月')

# 画出13年八月到九月份的分布估计图

total_balance_last_7 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,7,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,8,1)))]
total_balance_last_8 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,8,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,1)))]
total_balance_last_9 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,10,1)))]
total_balance_last_10 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,10,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,11,1)))]
plt.figure(figsize=(12,10))
ax = plt.subplot(2,1,1)
plt.title('The Probability Density of total purchase and redeem amount from Aug.13 to Sep.13')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_purchase_amt'],label='August')
ax = sns.kdeplot(total_balance_last_7['total_purchase_amt'],label='July')
ax = sns.kdeplot(total_balance_last_9['total_purchase_amt'],color='Red',label='September')

ax = plt.subplot(2,1,2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_redeem_amt'],label='August')
ax = sns.kdeplot(total_balance_last_7['total_redeem_amt'],label='July')
ax = sns.kdeplot(total_balance_last_9['total_redeem_amt'],color='Red',label='September')
ax = sns.kdeplot(total_balance_last_10['total_redeem_amt'],color='Black',label='Novermber')

5. 日期特征分析¶

# 按照每天聚合数据集

day_sta = total_balance_2[['total_purchase_amt', 'total_redeem_amt', 'day']].groupby('day', as_index=False).mean()

# 获取聚合后每月购买分布的柱状图
plt.figure(figsize = (12,5))
ax = sns.barplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase',palette='cividis')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', color = 'red')
ax.legend()
plt.title("The total Purchase in Aug.14")

Text(0.5, 1.0, 'The total Purchase in Aug.14')

# 获取聚合后每月赎回分布的柱状图
plt.figure(figsize = (12,5))
ax = sns.barplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', palette='cividis')
ax = sns.lineplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', color = 'red')
ax.legend()
plt.title("The total Redeem in Aug.14")

Text(0.5, 1.0, 'The total Redeem in Aug.14')

# 画出13年九月份的分布图

plt.figure(figsize=(12,10))
day_sta = total_balance_last_9[['total_purchase_amt', 'total_redeem_amt', 'day']].groupby('day', as_index=False).mean()
plt.subplot(2,1,1)
plt.title("The total Purchase in Sep.13")
ax = sns.barplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', palette= 'cividis')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', color = 'red')
plt.subplot(2,1,2)
plt.title("The total Redeem in Sep.13")
bx = sns.barplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', palette= 'cividis')
bx = sns.lineplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', color = 'red')
bx.legend()

<matplotlib.legend.Legend at 0x1f712b320c8>

We find that the data from last year in Sep has very limited week feature

There are some strange day in Sep:

1st day
2nd day
16th day(Purchase a lot)---Monday & 3days before MidAutumn Festirval
11th day and 25th day(Redeem a lot)---Both of Wednesday
18 19 20(Both Purchase and Redeem is very low)

# 画出历史所有天的热力图

test = np.zeros((max(total_balance_1['week']) - min(total_balance_1['week']) + 1, 7))
test[total_balance_1['week'] - min(total_balance_1['week']), total_balance_1['weekday']] = total_balance_1['total_purchase_amt']

f, ax = plt.subplots(figsize = (10, 4))
sns.heatmap(test,linewidths = 0.1, ax=ax)
ax.set_title("Purchase")
ax.set_xlabel('weekday')
ax.set_ylabel('week')

test = np.zeros((max(total_balance_1['week']) - min(total_balance_1['week']) + 1, 7))
test[total_balance_1['week'] - min(total_balance_1['week']), total_balance_1['weekday']] = total_balance_1['total_redeem_amt']

f, ax = plt.subplots(figsize = (10, 4))
sns.heatmap(test,linewidths = 0.1, ax=ax)
ax.set_title("Redeem")
ax.set_xlabel('weekday')
ax.set_ylabel('week')

Text(71.0, 0.5, 'week')

From the heat map we find that the data of week 4 and weekday 6 is very strange, and week 12 weekday 2 either

# 对于热力图中异常点的数据分析.1

total_balance_1[(total_balance_1['week'] == 4 + min(total_balance_1['week'])) & (total_balance_1['weekday'] == 6)]

2014-5-4 is a special day in China, It is the first workday after the Labour day!

# 对于热力图中异常点的数据分析.2

total_balance_1[(total_balance_1['week'] == 12 + min(total_balance_1['week'])) & (total_balance_1['weekday'] == 2)]

In 2016-06-25 They Redeemed a lot but Purchase a little:

6. 节假日分析¶

The QingMing festerval (April.5 - April.7)
The Labour day (May.1 - May.5)
The DuanWu festeval (May.31 - June.2)
The MidAutumn festeval (Sep.6 - Sep.8)

Others

Mother day(May.13)
Father day(June. 17)
TianMao 618 sales(June 10 - June 20)
Teachers' day(Sep 9)

# 获取节假日的数据

qingming = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,5))) & (total_balance['date'] < np.datetime64(datetime.date(2014,4,8)))]
labour = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,5,4)))]
duanwu = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,31))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,3)))]
data618 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,6,10))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,20)))]

# 画出节假日与平时的均值

fig = plt.figure(figsize = (12,5))
index_list = ['清明','劳动节','端午','618','平均']
label_list = [np.mean(qingming['total_purchase_amt']), np.mean(labour['total_purchase_amt']),np.mean(duanwu['total_purchase_amt']),np.mean(data618['total_purchase_amt']),np.mean(total_balance_1['total_purchase_amt'])]
plt.bar(index_list, label_list, label="Purchase")

index_list = ['清明.','劳动.','端午.','618.','平均.']
label_list = [np.mean(qingming['total_redeem_amt']), np.mean(labour['total_redeem_amt']),np.mean(duanwu['total_redeem_amt']),np.mean(data618['total_redeem_amt']),np.mean(total_balance_1['total_redeem_amt'])]
plt.bar(index_list, label_list, label="Redeem")
plt.title("The average of different holiday")
plt.ylabel("Amount")
plt.legend()
plt.show()

# 画出节假日购买量与其所处翌日的对比

plt.figure(figsize = (12,5))
import numpy as np
import matplotlib.pyplot as plt
size = 4
x = np.arange(size)

total_width, n = 0.8, 2    
width = total_width / n
x = x - (total_width - width) / 2

a = [176250006, 167825284, 162844282,321591063]
b = [225337516, 241859315, 225337516,307635449]

plt.bar(x, a,  width=width, label='Holiday_Purchase')
plt.bar(x + width, b, width=width, label='Normal_Purchase')
plt.xticks(x + width / 2, ('清明', '劳动节', '端午', '618'))
plt.title("节假日购买量与其所处翌日的对比")
plt.legend()
plt.show()

# 画出节假日赎回量与其所处翌日的对比
plt.figure(figsize = (12,5))
import numpy as np
import matplotlib.pyplot as plt
size = 4
x = np.arange(size)

total_width, n = 0.8, 2     
width = total_width / n
x = x - (total_width - width) / 2

a = [159914308, 154717620, 154366940,291016763]
b = [235439685, 240364238, 235439685,313310347]

plt.bar(x, a,  width=width, label='Holiday_Redeem')
plt.bar(x + width, b, width=width, label='Normal_Redeem')
plt.xticks(x + width / 2, ('清明', '劳动节', '端午', '618'))
plt.title("节假日赎回量与其所处翌日的对比")
plt.legend()
plt.show()

7. 节假日周围日期分析¶

# 画出清明节与周边日期的时序图

plt.figure(figsize = (12,5))
qingming_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,4,13)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=qingming_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=qingming_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=qingming, ax = ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=qingming, ax=ax)
plt.title("清明节与周边日期的时序图")
ax.legend()

<matplotlib.legend.Legend at 0x1f71400a548>

# 画出劳动节与周边日期的时序图

plt.figure(figsize = (12,5))
labour_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,25))) & (total_balance['date'] < np.datetime64(datetime.date(2014,5,10)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=labour_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=labour_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=labour, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=labour, ax=ax)
plt.title("劳动节与周边日期的时序图")
ax.legend()

<matplotlib.legend.Legend at 0x1f7140d8d48>

# 画出端午节与周边日期的时序图
plt.figure(figsize = (12,5))
duanwu_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,25))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,7)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=duanwu_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=duanwu_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=duanwu, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=duanwu, ax=ax)
plt.title("端午节与周边日期的时序图")
ax.legend()

<matplotlib.legend.Legend at 0x1f712cf9bc8>

# 画出中秋与周边日期的时序图
plt.figure(figsize = (12,5))
zhongqiu = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,19))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,22)))]
zhongqiu_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,14))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,28)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=zhongqiu_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=zhongqiu_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=zhongqiu, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=zhongqiu, ax=ax)
plt.title("中秋与周边日期的时序图(in 2013)")
ax.legend()

<matplotlib.legend.Legend at 0x1f713f4ec48>

8. 异常值分析¶

# 画出用户交易纪录的箱型图

sns.boxplot(data_balance['total_purchase_amt'])
plt.title("The abnormal value of total purchase")

Text(0.5, 1.0, 'The abnormal value of total purchase')

# 对于购买2e8的用户的交易行为分析

data_balance[data_balance['user_id'] == 14592].sort_values(by = 'total_redeem_amt',axis = 0,ascending = False).head()

1311 Bought 2E Seal 1E 1312 Seal 1E 1405 Bought 0.9E 1406 Seal 1E

# 画出单笔交易为2e8的那天的总交易量及附近几天的交易量

plt.figure(figsize = (12,5))
e2 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,11,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,11,10)))]
ax = sns.barplot(x="day", y="total_purchase_amt", data=e2, label='2E')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=e2, label='2E')
plt.title("The influence of the big deal with 200 million purchasing(Red Bar)")
ax.legend()

<matplotlib.legend.Legend at 0x1f713fee808>

# 画出每日单笔最大交易的时序图

plt.figure(figsize=(20, 6))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max(), label='最大购买')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max(), label='最大赎回')
plt.title("The Biggest deal happend in each day")

Text(0.5, 1.0, 'The Biggest deal happend in each day')

# 画出每日单笔最大交易以及总交易额的时序图

plt.figure(figsize=(25, 8))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max(), label='MAX_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max(), label='MAX_REDEEM')
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).sum(), label='TOTAL_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).sum(), label='TOTAL_REDEEM')

# 画出每个月大额交易的频次直方图

plt.figure(figsize=(12, 5))
big_frequancy = data_balance[(data_balance['total_purchase_amt'] > 10000000) | (data_balance['total_redeem_amt'] > 10000000)][['month','year','user_id']].groupby(['year','month'], as_index=False).count()
big_frequancy['i'] = big_frequancy['year']  + big_frequancy['month'] / 100
ax = sns.barplot(x="i", y="user_id", data=big_frequancy)
plt.title("The frequency of super big deal(larger than 100 million) in each month")

Text(0.5, 1.0, 'The frequency of super big deal(larger than 100 million) in each month')

# 获取大额交易的数据集

data_balance['big_purchase'] = 0
data_balance.loc[data_balance['total_purchase_amt'] > 1000000, 'big_purchase'] = 1
data_balance['big_redeem'] = 0
data_balance.loc[data_balance['total_redeem_amt'] > 1000000, 'big_redeem'] = 1

# 对大额交易按每天做聚合操作

big_purchase = data_balance[data_balance['big_purchase'] == 1].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
small_purchase = data_balance[data_balance['big_purchase'] == 0].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
big_redeem = data_balance[data_balance['big_redeem'] == 1].groupby(['date'], as_index=False)['total_redeem_amt'].sum()
small_redeem = data_balance[data_balance['big_redeem'] == 0].groupby(['date'], as_index=False)['total_redeem_amt'].sum()

# 画出大额交易与小额交易的时序分布图

fig = plt.figure(figsize=(20,6))
plt.plot(big_purchase['date'], big_purchase['total_purchase_amt'],label='big_purchase', color = 'red')
plt.plot(big_redeem['date'], big_redeem['total_redeem_amt'],label='big_redeem', color = 'orange')

plt.plot(small_purchase['date'], small_purchase['total_purchase_amt'],label='small_purchase', color = 'blue')
plt.plot(small_redeem['date'], small_redeem['total_redeem_amt'],label='small_redeem', color = 'dodgerblue')
plt.legend(loc='best')
plt.title("The time series of big deal of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 画出大额交易与小额交易的分布估计图

plt.figure(figsize=(15,10))

plt.subplot(2,2,1)
for i in range(4, 9):
    sns.kdeplot(big_purchase[(big_purchase['date'] >= np.datetime64(datetime.date(2014,i,1))) & (big_purchase['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14Y,'+str(i)+'M')
plt.title('BIG PURCHASE')

plt.subplot(2,2,2)
for i in range(4, 9):
    sns.kdeplot(small_purchase[(small_purchase['date'] >= np.datetime64(datetime.date(2014,i,1))) & (small_purchase['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14Y,'+str(i)+'M')
plt.title('SMALL PURCHASE')

plt.subplot(2,2,3)
for i in range(4, 9):
    sns.kdeplot(big_redeem[(big_redeem['date'] >= np.datetime64(datetime.date(2014,i,1))) & (big_redeem['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14Y,'+str(i)+'M')
plt.title('BIG REDEEM')

plt.subplot(2,2,4)
for i in range(4, 9):
    sns.kdeplot(small_redeem[(small_redeem['date'] >= np.datetime64(datetime.date(2014,i,1))) & (small_redeem['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14Y,'+str(i)+'M')
plt.title('SMALL REDEEM')

Text(0.5, 1.0, 'SMALL REDEEM')

# 添加时间戳

big_purchase['weekday'] = big_purchase['date'].dt.weekday
small_purchase['weekday'] = small_purchase['date'].dt.weekday
big_redeem['weekday'] = big_redeem['date'].dt.weekday
small_redeem['weekday'] = small_redeem['date'].dt.weekday

# 分析大额小额的翌日分布

plt.figure(figsize=(15, 10))

ax = plt.subplot(2,2,1)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=big_purchase[big_purchase['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('BIG PURCHASE')

ax = plt.subplot(2,2,2)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=big_redeem[big_redeem['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('BIG REDEEM')

ax = plt.subplot(2,2,3)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=small_purchase[small_purchase['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('SMALL PURCHASE')

ax = plt.subplot(2,2,4)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=small_redeem[small_redeem['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('SMALL REDEEM')

Text(0.5, 1.0, 'SMALL REDEEM')

9. 用户交易纪录表中其他变量相关分析¶

# 截断数据集

data_balance_1 = data_balance[data_balance['date'] > datetime.datetime(2014,4,1)]

# 画出用户交易纪录表中其他变量与标签的相关性图


plt.figure(figsize = (12,5))
feature = ['total_purchase_amt','total_redeem_amt', 'report_date', 'tBalance', 'yBalance', 
       'direct_purchase_amt', 'purchase_bal_amt', 'purchase_bank_amt',
        'consume_amt', 'transfer_amt', 'tftobal_amt',
       'tftocard_amt', 'share_amt']

sns.heatmap(data_balance_1[feature].corr(), linewidths = 0.1)   
plt.title("The coleration between each feature in User_Balance_Table")

Text(0.5, 1, 'The coleration between each feature in User_Balance_Table')

10. 银行、支付宝利率分析¶

# 读取银行利率并添加时间戳

bank = pd.read_csv(dataset_path + "mfd_bank_shibor.csv")
bank = bank.rename(columns = {'mfd_date': 'date'})
bank_features = [x for x in bank.columns if x not in ['date']]
bank['date'] = pd.to_datetime(bank['date'], format= "%Y%m%d")
bank['day'] = bank['date'].dt.day
bank['month'] = bank['date'].dt.month
bank['year'] = bank['date'].dt.year
bank['week'] = bank['date'].dt.week
bank['weekday'] = bank['date'].dt.weekday

# 读取支付宝利率并添加时间戳

share = pd.read_csv(dataset_path + 'mfd_day_share_interest.csv')
share = share.rename(columns = {'mfd_date': 'date'})
share_features = [x for x in share.columns if x not in ['date']]
share['date'] = pd.to_datetime(share['date'], format= "%Y%m%d")
share['day'] = share['date'].dt.day
share['month'] = share['date'].dt.month
share['year'] = share['date'].dt.year
share['week'] = share['date'].dt.week
share['weekday'] = share['date'].dt.weekday

# 画出上一天银行及支付宝利率与标签的相关性图

bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration between each lastday bank rate and total purchase")
temp = pd.merge(bank[['last_date']+bank_features], total_balance, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration between each lastday bank rate and total redeem")
temp = pd.merge(bank[['last_date']+bank_features], total_balance, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)

<matplotlib.axes._subplots.AxesSubplot at 0x1f7165be8c8>

# 画出上一星期银行及支付宝利率与标签的相关性图

bank['last_week'] = bank['week'] + 1
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration between each last week bank rate and total purchase")
temp = pd.merge(bank[['last_week','weekday']+bank_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration between each last week bank rate and total redeem")
temp = pd.merge(bank[['last_week','weekday']+bank_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)

<matplotlib.axes._subplots.AxesSubplot at 0x1f716677608>

# 分别画出上一星期银行及支付宝利率与大额小额数据的相关性图

bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration of Small Rate purchase")
temp = pd.merge(bank[['last_date']+bank_features], small_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration of Small Rate redeem")
temp = pd.merge(bank[['last_date']+bank_features], small_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  

bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration of Big Rate purchase")
temp = pd.merge(bank[['last_date']+bank_features], big_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration of Big Rate redeem")
temp = pd.merge(bank[['last_date']+bank_features], big_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)

<matplotlib.axes._subplots.AxesSubplot at 0x1f718487608>

# 画出银行利率的时序图

plt.figure(figsize=(15,5))
for i in bank_features:
    plt.plot(bank['date'], bank[[i]] ,label=i)
plt.legend()
plt.title("The time series of bank rate")
plt.xlabel("Time")
plt.ylabel("Rate")

Text(0, 0.5, 'Rate')

# 画出部分银行利率与购买量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],'b',label="Interest_3_M")
plt.plot(bank['date'], bank['Interest_6_M'],'cyan',label="Interest_6_M")
plt.plot(bank['date'], bank['Interest_9_M'],'skyblue',label="Interest_9_M")

plt.legend()

ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],'g',label="Total purchase")

plt.legend(loc=2)
plt.title("The time series of bank rate and purchase")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")
plt.show()

# 画出部分银行利率与赎回量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],'b',label="Interest_3_M")
plt.plot(bank['date'], bank['Interest_6_M'],'cyan',label="Interest_6_M")
plt.plot(bank['date'], bank['Interest_9_M'],'skyblue',label="Interest_9_M")

plt.legend()

ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],'g',label="Total redeem")

plt.legend(loc=2)
plt.title("The time series of bank rate and redeem")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")

plt.show()

The information for Share rate¶

# 画出支付宝利率与标签的相关性图

share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_date']+share_features], total_balance, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)  
plt.subplot(1,3,3)
temp = pd.merge(share[['last_date']+share_features], total_balance, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)

<matplotlib.axes._subplots.AxesSubplot at 0x1f718600848>

# 画出银行利率与标签的相关性图

share['last_week'] = share['week'] + 1
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_week','weekday']+share_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)  
plt.subplot(1,3,3)
temp = pd.merge(share[['last_week','weekday']+share_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)

<matplotlib.axes._subplots.AxesSubplot at 0x1f718a04208>

# 画出支付宝利率与购买量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
for i in share_features:
    plt.plot(share['date'], share[i],'b',label=i)
    break
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],'g',label="Total purchase")
plt.legend(loc=2)
plt.show()

# 画出支付宝利率与赎回量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
for i in share_features:
    plt.plot(share['date'], share[i],'b',label=i)
    break
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],'g',label="Total redeem")
plt.legend(loc=2)
plt.show()

# 画出大额小额数据与支付宝利率的相关性图

share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_date']+share_features], small_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)  
plt.title("SMALL PURCHASE")
plt.subplot(1,3,3)
plt.title("SMALL REDEEM")
temp = pd.merge(share[['last_date']+share_features], small_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)  

share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("BIG PURCHASE")
temp = pd.merge(share[['last_date']+share_features], big_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)  
plt.subplot(1,3,3)
plt.title("BIG REDEEM")
temp = pd.merge(share[['last_date']+share_features], big_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)

<matplotlib.axes._subplots.AxesSubplot at 0x1f71e018088>

# 画出银行利率与支付宝利率的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],c='g',label= 'BANK')

plt.legend()
ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],label='SHARE')
plt.legend(loc=2)
plt.show()

11. 小结¶

It seems that:

The influence of share is more likely to act on Purchase
The influence of bank rate is more likely to act on Redeem
The influence of share rate is for short
The influence of bank rate is for long

based on above analysis, we can simply find these features:¶

the weekday
is it weekend
is it holidy
the distance from the start of week(monday)
the distance from the end of week(sunday)
the distance from the holiday centre(centre of QingMing DuanWu Labour ZhongQiu)
the distance from the start of month
the distance from the end of month
the mean/max/min value of the same week in last month
the value in last day of last month

1. 大小额数据分析¶

前面的分析可以看出，某些用户的交易额很大，对于日交易量很明显，这里统计四月份后依旧活跃的大额用户(大于100万)

# 获得大额用户的集合

temp = data_balance[(data_balance['total_purchase_amt'] >= 1000000) | (data_balance['total_redeem_amt'] >= 1000000)]
big_users_set = set(temp[temp['date'] >= datetime.datetime(2014,4,1)]['user_id'])

len(big_users_set)

3903

max(data_balance['user_id'])

28041

len(big_users_set)/max(data_balance['user_id'])*100

13.918904461324487

大额用户占3903个，占总数的13.92%

# 标记大额用户

data_balance['big_user'] = 0
data_balance.loc[data_balance['user_id'].isin(big_users_set), 'big_user'] = 1

# 统计大额用户与小额用户的日总交易额的区别

total_balance_bigNsmall = data_balance.groupby(['date','big_user'], 
                                               as_index=False)['total_purchase_amt','total_redeem_amt'].sum()

# 画出大额用户与小额用户交易的日总交易量图

fig = plt.figure(figsize=(20,8))
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_purchase_amt'],label='big_purchase')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_redeem_amt'],label='big_redeem')

plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_purchase_amt'],label='small_purchase')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_redeem_amt'],label='small_redeem')
plt.legend(loc='best')
plt.title("The time series of big and small user of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 统计大额小额用户购买量占比

np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_purchase_amt']) / np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_purchase_amt'])

5.1456178397775805

# 统计大额小额用户赎回量占比

np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_redeem_amt']) / np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_redeem_amt'])

5.422141272341089

2. 分析用户的交易频次¶

# 画出非0交易的分布图

frequency = data_balance[(data_balance['direct_purchase_amt'] != 0) | (data_balance['total_redeem_amt'] != 0)][['user_id','tBalance']].groupby('user_id', as_index=False).count()
sns.distplot(frequency['tBalance'])

<matplotlib.axes._subplots.AxesSubplot at 0x1f71e8ca508>

# 获取频繁交易用户集合

hot_users_set = set(frequency[frequency['tBalance'] > 30]['user_id'])

# 获取频繁用户的交易纪录

data_balance['is_hot_users'] = 0
data_balance.loc[data_balance['user_id'].isin(hot_users_set) , 'is_hot_users'] = 1

# 统计频繁用户与非频繁用户的日总交易额的区别

total_balance_hotNcold = data_balance.groupby(['date','is_hot_users'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()

# 绘制频繁用户与非频繁用户总购买赎回量的时序图

fig = plt.figure(figsize=(20,8))
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['total_purchase_amt'],label='hot_purchase')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['total_redeem_amt'],label='hot_redeem')

plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['total_purchase_amt'],label='cold_purchase')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['total_redeem_amt'],label='cold_redeem')
plt.legend(loc='best')
plt.title("The time series of big and small user of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 画出冷启动用户与老用户交易的箱型图

temp = data_balance[['year','month','user_id','total_purchase_amt','total_redeem_amt']].groupby(['year','month','user_id'], as_index=False).sum()
user_old_set = set()
plt.figure(figsize=(10,30))
for i in range(1, 9):
    newset = set(temp[(temp['year'] == 2014) & (temp['month'] == i)]['user_id'])
    this_month = data_balance[(data_balance['year'] == 2014) & (data_balance['month'] == i)]
    this_month['cold'] = 0
    this_month.loc[this_month['user_id'].isin(newset - user_old_set), 'cold'] = 1
    plt.subplot(4,2,i)
    plt.title('This month : ' + str(i))
    sns.boxplot(x="cold", y="total_purchase_amt" , data=this_month[(this_month['direct_purchase_amt'] != 0) | (this_month['total_redeem_amt'] != 0)])
    user_old_set = user_old_set | newset
plt.show()

3. 用户其他属性分析¶

# 用户的其他属性
users = pd.read_csv('Dataset/user_profile_table.csv')
users.head()

# 添加城市、星座、性别

data_balance = pd.merge(data_balance, users, on='user_id')

# 统计每个城市用户的日总交易额的区别并绘制分布估计图

fig = plt.figure(figsize=(10,5))
for i in np.unique(data_balance['city']):
    temp = data_balance.groupby(['date','city'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['city'] == i]['total_purchase_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of different city of Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

fig = plt.figure(figsize=(10,5))
for i in np.unique(data_balance['city']):
    temp = data_balance.groupby(['date','city'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['city'] == i]['total_redeem_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of different city of Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 统计每个性别用户的日总交易额的区别，并绘制时序图

temp = data_balance.groupby(['date','sex'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()

fig = plt.figure(figsize=(20,6))
plt.plot(temp[temp['sex'] == 1]['date'], temp[temp['sex'] == 1]['total_purchase_amt'],label='Male')
plt.plot(temp[temp['sex'] == 0]['date'], temp[temp['sex'] == 0]['total_purchase_amt'],label='Female')
plt.legend(loc='best')
plt.title("The time series of two gender user of Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

fig = plt.figure(figsize=(20,6))
plt.plot(temp[temp['sex'] == 1]['date'], temp[temp['sex'] == 1]['total_purchase_amt'],label='Male')
plt.plot(temp[temp['sex'] == 0]['date'], temp[temp['sex'] == 0]['total_redeem_amt'],label='Female')
plt.legend(loc='best')
plt.title("The time series of two gender user of Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

data_balance['constellation']

0          白羊座
1          白羊座
2          白羊座
3          白羊座
4          白羊座
          ... 
2840416    处女座
2840417    处女座
2840418    处女座
2840419    处女座
2840420    摩羯座
Name: constellation, Length: 2840421, dtype: object

# 统计每个星座用户的日总交易额的区别 并绘制分布估计图

fig = plt.figure(figsize=(12,5))
for i in np.unique(data_balance['constellation']):
    temp = data_balance.groupby(['date','constellation'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['constellation'] == i]['total_purchase_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of small deal of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

fig = plt.figure(figsize=(12,5))
for i in np.unique(data_balance['constellation']):
    temp = data_balance.groupby(['date','constellation'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['constellation'] == i]['total_redeem_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of small deal of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

4. 其他分析¶

# 统计每个性别用户的日总交易额的区别

temp = data_balance.groupby(['date'], as_index=False)['direct_purchase_amt','share_amt'].sum()

# 画出每日利息的增长/直接购买量的时序图

fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'],  temp['share_amt'] / temp['direct_purchase_amt'] ,label='Rate')
plt.legend(loc='best')
plt.title("The rate of Share / Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 加载支付宝利率数据

share = pd.read_csv('Dataset/mfd_day_share_interest.csv')
share = share.rename(columns = {'mfd_date': 'date'})
share_features = [x for x in share.columns if x not in ['date']]
share['date'] = pd.to_datetime(share['date'], format= "%Y%m%d")
share['day'] = share['date'].dt.day
share['month'] = share['date'].dt.month
share['year'] = share['date'].dt.year
share['week'] = share['date'].dt.week
share['weekday'] = share['date'].dt.weekday

# 绘制支付宝利率与交易额的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(temp['date'], temp['share_amt'],'b',label="Share_amt")
plt.legend()

ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],'g',label="Share rate")

plt.legend(loc=2)
plt.title("The correlation between share rate and share amount")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")

plt.show()

# 支付宝利率与每日利息的增长/直接购买量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(temp['date'], temp['share_amt'] / temp['direct_purchase_amt'],'b',label="Share_amt / Direct_amt")
plt.legend()

ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],'g',label="Share rate")

plt.legend(loc=2)
plt.title("The correlation between share rate and Share/Purchase")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")

plt.show()

直接购买里面有两种

# 聚合两种不同购买方式

temp = data_balance.groupby(['date'], as_index=False)['purchase_bal_amt','purchase_bank_amt'].sum()

# 画出不同购买方式日购买量的时序图

fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['purchase_bal_amt'],label='Bal')
plt.plot(temp['date'], temp['purchase_bank_amt'],label='Bank')
plt.legend(loc='best')
plt.title("The purchase of bal and bank")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

# 画出不同赎回方式日赎回量的时序图

temp = data_balance.groupby(['date'], as_index=False)['tftobal_amt','tftocard_amt'].sum()
fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['tftobal_amt'],label='Bal')
plt.plot(temp['date'], temp['tftocard_amt'],label='Bank')
plt.legend(loc='best')
plt.title("The redeem of bal and bank")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

5. 小结¶

It seems that:

The influence of share is more likely to act on Purchase
The influence of bank rate is more likely to act on Redeem
The influence of share rate is for short
The influence of bank rate is for long

based on above analysis, we can simply find these features:¶

the weekday
is it weekend
is it holidy
the distance from the start of week(monday)
the distance from the end of week(sunday)
the distance from the holiday centre(centre of QingMing DuanWu Labour ZhongQiu)
the distance from the start of month
the distance from the end of month
the mean/max/min value of the same week in last month
the value in last day of last month
Although Alipay or bank card buying is not very different, but the difference on the redeem is quite obvious
3903 large users account for 13.92% of the total

1. 辅助函数整合¶

# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
    data_balance = pd.read_csv(path)
    data_balance = add_timestamp(data_balance)
    return data_balance.reset_index(drop=True)
    

# add tiemstamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
    data_balance = data.copy()
    data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
    data_balance['day'] = data_balance['date'].dt.day
    data_balance['month'] = data_balance['date'].dt.month
    data_balance['year'] = data_balance['date'].dt.year
    data_balance['week'] = data_balance['date'].dt.week
    data_balance['weekday'] = data_balance['date'].dt.weekday
    return data_balance.reset_index(drop=True)

# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
    df_tmp = data.copy()
    df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()
    df_tmp.reset_index(inplace=True)
    return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)

# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
    total_balance = data.copy()
    start = datetime.datetime(2014,9,1)
    testdata = []
    while start != datetime.datetime(2014,10,15):
        temp = [start, np.nan, np.nan]
        testdata.append(temp)
        start += datetime.timedelta(days = 1)
    testdata = pd.DataFrame(testdata)
    testdata.columns = total_balance.columns

    total_balance = pd.concat([total_balance, testdata], axis = 0)
    total_balance = total_balance.reset_index(drop=True)
    return total_balance.reset_index(drop=True)

# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
    return pd.read_csv(path)

# 载入数据

balance_data = load_data('Dataset/user_balance_table.csv')
balance_data = add_timestamp(balance_data)
total_balance = get_total_balance(balance_data, date = '2014-03-01')
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')

# 创建数据的深层拷贝

data = total_balance.copy()

2. 时间序列规则设定¶

# 定义生成时间序列规则预测结果的方法

def generate_base(df: pd.DataFrame, month_index: int)->pd.DataFrame:
    # 选中固定时间段的数据集
    total_balance = df.copy()
    total_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]
    total_balance = total_balance[(total_balance['date'] >= datetime.datetime(2014,3,1)) & (total_balance['date'] < datetime.datetime(2014, month_index, 1))]

    # 加入时间戳
    total_balance['weekday'] = total_balance['date'].dt.weekday
    total_balance['day'] = total_balance['date'].dt.day
    total_balance['week'] = total_balance['date'].dt.week
    total_balance['month'] = total_balance['date'].dt.month
    
    # 统计翌日因子
    mean_of_each_weekday = total_balance[['weekday']+['total_purchase_amt','total_redeem_amt']].groupby('weekday',as_index=False).mean()
    for name in ['total_purchase_amt','total_redeem_amt']:
        mean_of_each_weekday = mean_of_each_weekday.rename(columns={name: name+'_weekdaymean'})
    mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])
    mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])

    # 合并统计结果到原数据集
    total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')

    # 分别统计翌日在(1~31)号出现的频次
    weekday_count = total_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()
    weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')

    # 依据频次对翌日因子进行加权，获得日期因子
    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date']   / len(np.unique(total_balance['month']))
    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date']  / len(np.unique(total_balance['month']))
    day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()

    # 将训练集中所有日期的均值剔除日期残差得到base
    day_mean = total_balance[['day'] + ['total_purchase_amt','total_redeem_amt']].groupby('day',as_index=False).mean()
    day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
    day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']

    # 生成测试集数据
    for index, row in day_pre.iterrows():
        if month_index in (2,4,6,9) and row['day'] == 31:
            break
        day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))

    # 基于base与翌日因子获得最后的预测结果
    day_pre['weekday'] = day_pre.date.dt.weekday
    day_pre = day_pre[['date','weekday']+['total_purchase_amt','total_redeem_amt']]
    day_pre = pd.merge(day_pre, mean_of_each_weekday,on='weekday')
    day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']

    day_pre = day_pre.sort_values('date')[['date']+['total_purchase_amt','total_redeem_amt']]
    return day_pre

3. 结果预测并保存¶

# 生成预测结果（以及残差）

base_list = []
for i in range(4, 10):
    base_list.append(generate_base(data, i).reset_index(drop=True))

base = pd.concat(base_list).reset_index(drop=True)
for i in ['total_purchase_amt','total_redeem_amt']:
    base = base.rename(columns={i: i+'_base'})

data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)

data['purchase_residual'] = data['total_purchase_amt'] / data['total_purchase_amt_base']

data['redeem_residual'] = data['total_redeem_amt'] / data['total_redeem_amt_base']

# 对结果表重命名

data = data[['date','purchase_residual','redeem_residual','total_purchase_amt_base', 'total_redeem_amt_base']]
for i in data.columns:
    if i == 'date':
        data[i] = data[i].astype(str)
        data[i] = data[i].str.replace('-','')
data.columns = [['date'] + ['total_purchase_amt','total_redeem_amt'] + ['total_purchase_predicted_by_cycle','total_redeem_predicted_by_cycle'] ]

# 保存预测结果到本地

data.to_csv('Dataset/base.csv',index=False)

1. 数据准备¶

library(plyr)
library(dplyr)
library(forecast)
library(data.table)
library(tseries)
options(warn=-1)

Attaching package: 'dplyr'


The following objects are masked from 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Registered S3 methods overwritten by 'forecast':
  method             from    
  fitted.fracdiff    fracdiff
  residuals.fracdiff fracdiff


Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last

user_balance=read.csv("Dataset/user_balance_table.csv")
user_balance$report_date=as.Date(as.character(user_balance$report_date),format="%Y%m%d")

temp=order(user_balance$report_date)
user_balance=user_balance[temp,]

data=ddply(user_balance,.(report_date),function(D){colwise(sum)(D[,c(-1,-2)])})

train_data = data[274:396,]
test_data = data[397:427,]

head(train_data)

head(test_data)

2. 评估方式¶

get_score<- function(purchase_pred,redeem_pred,purchase_true,redeem_true,h=0.3){
  out = data.frame(purchase_pred ,redeem_pred,purchase_true,redeem_true )
  out$purchase_mape = abs(out$purchase_pred-out$purchase_true)/
    out$purchase_true
  out$redeem_mape = abs(out$redeem_pred-out$redeem_true)/
    out$redeem_true
  
  score = sum(exp(-out$purchase_mape/h)*10)* 0.45 + 
    sum(exp(-out$redeem_mape/h)*10)* 0.55 
  return(score)
}

3. 平稳性分析¶

# purchase
purchase_ts=ts(train_data$total_purchase_amt,
               frequency=7,start=c(1,1))       ## 生成时间序列
purchase_stl_res = stl(purchase_ts,s.window="periodic",robust=TRUE)  ## stl分解
purchase_pred_test = forecast(purchase_stl_res,h = 31)$mean  ## 得到测试集预测结果



# redeem
redeem_ts = ts(train_data$total_redeem_amt,
               frequency=7,start=c(1,1))
redeem_stl_res = stl(redeem_ts,s.window="periodic",robust=TRUE)
redeem_pred_test = forecast(redeem_stl_res,h = 31)$mean


get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)

purchase_del_ts=ts(purchase_ts-purchase_stl_res$time.series[,1],
                   frequency=7,start=c(1,1))

plot(purchase_del_ts)

acf(purchase_del_ts)

adf.test(purchase_del_ts)

	Augmented Dickey-Fuller Test

data:  purchase_del_ts
Dickey-Fuller = -4.3948, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary

for(i in 1:2){print(Box.test(purchase_del_ts,type = "Box-Pierce",
                             lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(purchase_del_ts,type = "Ljung-Box",
                             lag = i*6)$p.value)}

[1] 0.007097898
[1] 0.04290397
[1] 0.005825161
[1] 0.03343909

pacf(purchase_del_ts)

acf(purchase_del_ts)

# redeem
redeem_del_ts=ts(redeem_ts-redeem_stl_res$time.series[,1],
                   frequency=7,start=c(1,1))

plot(redeem_del_ts)

acf(redeem_del_ts)

adf.test(redeem_del_ts)

	Augmented Dickey-Fuller Test

data:  redeem_del_ts
Dickey-Fuller = -3.0508, Lag order = 4, p-value = 0.14
alternative hypothesis: stationary

plot(diff(redeem_del_ts)) # 差分后的时序图

acf(diff(redeem_del_ts))

adf.test(diff(redeem_del_ts))

	Augmented Dickey-Fuller Test

data:  diff(redeem_del_ts)
Dickey-Fuller = -7.1293, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary

for(i in 1:2){print(Box.test(diff(redeem_del_ts),type = "Box-Pierce",
                             lag = i*6)$p.value)}

for(i in 1:2){print(Box.test(diff(redeem_del_ts),type = "Ljung-Box",
                             lag = i*6)$p.value)}

[1] 0.004738446
[1] 0.0605687
[1] 0.003849948
[1] 0.04976641

pacf(diff(redeem_del_ts))#拖尾

acf(diff(redeem_del_ts)) #截尾

4. ARIMA建模¶

# purchase
purchase_del_fit = arima(purchase_del_ts/1000,order = c(1,0,1))

purchase_del_fit

Call:
arima(x = purchase_del_ts/1000, order = c(1, 0, 1))

Coefficients:
         ar1      ma1   intercept
      0.4187  -0.0593  265797.596
s.e.  0.2409   0.2584    7658.852

sigma^2 estimated as 2.784e+09:  log likelihood = -1512.04,  aic = 3032.09

# 显著性检验
for(i in 1:2){print(Box.test(purchase_del_fit$residuals,type = "Ljung-Box",
                             lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(purchase_del_fit$residuals,type = 'Box-Pierce',
                             lag = i*6)$p.value)}

[1] 0.9691384
[1] 0.9876937
[1] 0.9734135
[1] 0.9914217

purchase_pred_test = forecast(purchase_stl_res$time.series[,1],31)$mean+
  predict(purchase_del_fit,n.ahead = 31)$pred*1000

# redeem
redeem_del_fit = arima(redeem_del_ts,order = c(0,1,1))

# 显著性检验
for(i in 1:2){print(Box.test(redeem_del_fit$residuals,
                             type = "Ljung-Box",lag = i*6)$p.value)}


for(i in 1:2){print(Box.test(redeem_del_fit$residuals,
                             type = 'Box-Pierce',lag = i*6)$p.value)}

[1] 0.7397571
[1] 0.9739962
[1] 0.7547025
[1] 0.9783781

redeem_pred_test = predict(redeem_del_fit,n.ahead = 31)$pred+
  forecast(redeem_stl_res$time.series[,1],h=31)$mean

get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)

#####去除周期后 auto.arima
#####purchase======
autoFit = auto.arima(purchase_del_ts/1000,d=0,seasonal = F,trace = T )

 ARIMA(2,0,2)           with non-zero mean : 3036.781
 ARIMA(0,0,0)           with non-zero mean : 3044.866
 ARIMA(1,0,0)           with non-zero mean : 3030.344
 ARIMA(0,0,1)           with non-zero mean : 3032.692
 ARIMA(0,0,0)           with zero mean     : 3429.418
 ARIMA(2,0,0)           with non-zero mean : 3032.426
 ARIMA(1,0,1)           with non-zero mean : 3032.428
 ARIMA(2,0,1)           with non-zero mean : 3034.588
 ARIMA(1,0,0)           with zero mean     : 3074.01

 Best model: ARIMA(1,0,0)           with non-zero mean

for(i in 1:2){print(Box.test(autoFit$residuals,type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(autoFit$residuals,type = 'Box-Pierce',lag = i*6)$p.value)}

[1] 0.9697502
[1] 0.9881997
[1] 0.9739763
[1] 0.9917919

purchase_pred_test = forecast(purchase_stl_res$time.series[,1],31)$mean+
  forecast(autoFit,h=31)$mean*1000

autoFit = auto.arima(redeem_del_ts,d=1,seasonal = F,trace = TRUE)

 ARIMA(2,1,2)           with drift         : 4746.025
 ARIMA(0,1,0)           with drift         : 4781.675
 ARIMA(1,1,0)           with drift         : 4766.45
 ARIMA(0,1,1)           with drift         : 4742.536
 ARIMA(0,1,0)                              : 4779.628
 ARIMA(1,1,1)           with drift         : 4741.796
 ARIMA(2,1,1)           with drift         : 4743.756
 ARIMA(1,1,2)           with drift         : 4743.779
 ARIMA(0,1,2)           with drift         : 4741.605
 ARIMA(0,1,3)           with drift         : 4743.777
 ARIMA(1,1,3)           with drift         : 4745.991
 ARIMA(0,1,2)                              : 4739.482
 ARIMA(0,1,1)                              : 4740.435
 ARIMA(1,1,2)                              : 4741.617
 ARIMA(0,1,3)                              : 4741.616
 ARIMA(1,1,1)                              : 4739.681
 ARIMA(1,1,3)                              : 4743.793

 Best model: ARIMA(0,1,2)

###模型显著性检验====
for(i in 1:2){print(Box.test(autoFit$residuals,type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(autoFit$residuals,type = 'Box-Pierce',lag = i*6)$p.value)}

[1] 0.9896333
[1] 0.9995391
[1] 0.9910407
[1] 0.9996895

redeem_pred_test = forecast(autoFit,h=31)$mean+
  forecast(redeem_stl_res$time.series[,1],h=31)$mean

get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)

5. prophet模型预测¶

# prophet预测

holidays_and_effect_days <- data_frame(
  holiday = 'holidays_and_effect_days',
  ds = as.Date(c('2014-04-03', '2014-04-04', 
                 '2014-04-05', '2014-04-06', '2014-04-07', 
                 '2014-04-08', '2014-04-09',
                 
                 '2014-04-29', '2014-04-30', 
                 '2014-05-01', '2014-05-02', '2014-05-03',
                 '2014-05-04', '2014-05-05',
                 
                 '2014-05-29', '2014-05-30',
                 '2014-05-31', '2014-06-01', '2014-06-02', 
                 '2014-06-03', '2014-06-04')),
  lower_window = 0,
  upper_window = 0
)

holidays <- data_frame(
  holiday = 'true_holidays',
  ds = as.Date(c('2014-04-05', '2014-04-06', '2014-04-07',
                 '2014-05-01', '2014-05-02', '2014-05-03',
                 '2014-05-31', '2014-06-01', '2014-06-02')),
  lower_window = 0,
  upper_window = 0
)
holidays <- bind_rows(holidays_and_effect_days, holidays)



###purchase=========

Myprophet_purchase = prophet::prophet(changepoint_prior_scale=0.5,weekly_seasonality=T,
                                      holidays= holidays)
df_purchase = data.frame(train_data$report_date ,train_data$total_purchase_amt )
colnames(df_purchase) <-c('ds','y')
Myprophet_purchase = prophet::fit.prophet(Myprophet_purchase,df_purchase)

future_purchase = prophet::make_future_dataframe(Myprophet_purchase,periods=31,
                                                 include_history=F)
forecast_purchase = predict(Myprophet_purchase, future_purchase)
purchase_pred_test = forecast_purchase$yhat



###redeem=====
Myprophet_redeem = prophet::prophet(changepoint_prior_scale=0.5,weekly_seasonality=True,
                                    holidays= holidays)
df_redeem = data.frame(train_data$report_date ,train_data$total_redeem_amt )
colnames(df_redeem) <-c('ds','y')
df_redeem
Myprophet_redeem = prophet::fit.prophet(Myprophet_redeem,df_redeem)

future_redeem = prophet::make_future_dataframe(Myprophet_redeem,periods=31,
                                               include_history=F)
forecast_redeem = predict(Myprophet_redeem, future_redeem)
redeem_pred_test = forecast_redeem$yhat

get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)

Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.

Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.

Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

	user_id	report_date	tBalance	yBalance	total_purchase_amt	...	category1	category2	category3	category4	date	day	month	year	week	weekday
0	1	20140805	20385	20383	2	...	NaN	NaN	NaN	NaN	2014-08-05	5	8	2014	32	1
1	1	20140808	20391	20389	2	...	NaN	NaN	NaN	NaN	2014-08-08	8	8	2014	32	4
2	1	20140811	20397	20395	2	...	NaN	NaN	NaN	NaN	2014-08-11	11	8	2014	33	0
3	1	20140814	20403	20401	2	...	NaN	NaN	NaN	NaN	2014-08-14	14	8	2014	33	3
4	1	20140817	20409	20407	2	...	NaN	NaN	NaN	NaN	2014-08-17	17	8	2014	33	6

	user_id	report_date	tBalance	yBalance	total_purchase_amt	direct_purchase_amt	purchase_bal_amt	purchase_bank_amt	total_redeem_amt	...	category1	category2	category3	category4	date	day	month	year	week	weekday
1453311	14592	20131104	99457728	0	201768328	201768328	201275171	493157	102310600	...	NaN	NaN	NaN	NaN	2013-11-04	4	11	2013	45	0
1453388	14592	20140616	0	98964529	1966014	1953569	0	1953569	100930543	...	NaN	NaN	NaN	NaN	2014-06-16	16	6	2014	25	0
1453227	14592	20131226	367063	98296082	17369	0	0	0	97946388	...	NaN	NaN	NaN	NaN	2013-12-26	26	12	2013	52	3
1453313	14592	20131105	97458675	99457728	4899446	4899446	4899446	0	6898499	...	NaN	NaN	NaN	NaN	2013-11-05	5	11	2013	45	1
1453355	14592	20140617	0	0	339679	339679	0	339679	339679	...	NaN	NaN	NaN	NaN	2014-06-17	17	6	2014	25	1

	report_date	tBalance	yBalance	total_purchase_amt	direct_purchase_amt	purchase_bal_amt	purchase_bank_amt	total_redeem_amt	consume_amt	transfer_amt	tftobal_amt	tftocard_amt	share_amt	category1	category2	category3	category4
	<date>	<dbl>	<dbl>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
274	2014-03-31	20417009416	20441977145	398884905	396026448	154073551	241952897	423852634	48345100	375507534	56545010	318962524	2858457	NA	NA	NA	NA
275	2014-04-01	20592900643	20417009416	453320585	450478122	146539591	303938531	277429358	49540539	227888819	37116613	190772206	2842463	NA	NA	NA	NA
276	2014-04-02	20675635695	20592900643	355347118	352449616	141024158	211425458	272612066	51040952	221571114	28301147	193269967	2897502	NA	NA	NA	NA
277	2014-04-03	20772907358	20675635695	363877120	360958034	165866758	195091276	266605457	45762040	220843417	46915541	173927876	2919086	NA	NA	NA	NA
278	2014-04-04	20824610615	20772907358	251895894	248968008	62838487	186129521	200192637	37871189	162321448	35154471	127166977	2927886	NA	NA	NA	NA
279	2014-04-05	20863747475	20824610615	202336542	199391149	68763348	130627801	163199682	43889726	119309956	23061774	96248182	2945393	NA	NA	NA	NA

	report_date	tBalance	yBalance	total_purchase_amt	direct_purchase_amt	purchase_bal_amt	purchase_bank_amt	total_redeem_amt	consume_amt	transfer_amt	tftobal_amt	tftocard_amt	share_amt	category1	category2	category3	category4
	<date>	<dbl>	<dbl>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
397	2014-08-01	20069760015	19947416138	374884735	372678958	135715324	236963634	252540858	84227601	168313257	31013339	137299918	2205777	NA	NA	NA	NA
398	2014-08-02	20086601920	20069760015	189092130	186889349	78478704	108410645	172250225	41930977	130319248	34359098	95960150	2202781	NA	NA	NA	NA
399	2014-08-03	20133302100	20086601920	173825397	171634865	62120575	109514290	127125217	40828571	86296646	23794226	62502420	2190532	NA	NA	NA	NA
400	2014-08-04	20141035460	20133302100	330640884	328404472	120904646	207499826	322907524	78477516	244430008	43969481	200460527	2236412	NA	NA	NA	NA
401	2014-08-05	20314109791	20141035460	394780870	392589715	164705556	227884159	221706539	74040082	147666457	35974791	111691666	2191155	NA	NA	NA	NA
402	2014-08-06	20320584213	20314109791	288821016	286598298	127641624	158956674	282346594	56127546	226219048	26815416	199403632	2222718	NA	NA	NA	NA

ds	y
<date>	<int>
2014-03-31	423852634
2014-04-01	277429358
2014-04-02	272612066
2014-04-03	266605457
2014-04-04	200192637
2014-04-05	163199682
2014-04-06	139576683
2014-04-07	176966561
2014-04-08	250015131
2014-04-09	289330278
2014-04-10	286914864
2014-04-11	277077434
2014-04-12	123295320
2014-04-13	178934722
2014-04-14	415986984
2014-04-15	285293076
2014-04-16	255914640
2014-04-17	265341592
2014-04-18	225952909
2014-04-19	146374940
2014-04-20	161057781
2014-04-21	295635256
2014-04-22	268810141
2014-04-23	278470936
2014-04-24	224536754
2014-04-25	227764292
2014-04-26	158122962
2014-04-27	191915377
2014-04-28	327724735
2014-04-29	307578349
...	...
2014-07-02	328950951
2014-07-03	289009780
2014-07-04	264494550
2014-07-05	272535138
2014-07-06	195530758
2014-07-07	317612569
2014-07-08	340453063
2014-07-09	269642881
2014-07-10	326009240
2014-07-11	240050748
2014-07-12	149081488
2014-07-13	199459990
2014-07-14	284753279
2014-07-15	261722182
2014-07-16	234775948
2014-07-17	298279385
2014-07-18	208671287
2014-07-19	155464283
2014-07-20	174462836
2014-07-21	434191479
2014-07-22	369043423
2014-07-23	308353077
2014-07-24	347622431
2014-07-25	262874791
2014-07-26	282653341
2014-07-27	166610652
2014-07-28	345986909
2014-07-29	303480103
2014-07-30	250117716
2014-07-31	277194379

	total_purchase_amt	total_redeem_amt	weekday_onehot0	weekday_onehot1	weekday_onehot2	weekday_onehot3	weekday_onehot4	date
0	32488348.0	5525022.0	1.0	0.0	0.0	0.0	0.0	2013-07-01
1	29037390.0	2554548.0	0.0	1.0	0.0	0.0	0.0	2013-07-02
2	27270770.0	5953867.0	0.0	0.0	1.0	0.0	0.0	2013-07-03
3	18321185.0	6410729.0	0.0	0.0	0.0	1.0	0.0	2013-07-04
4	11648749.0	2763587.0	0.0	0.0	0.0	0.0	1.0	2013-07-05

	user_id	sex	city	constellation
0	2	1	6411949	狮子座
1	12	1	6412149	摩羯座
2	22	1	6411949	双子座
3	23	1	6411949	双鱼座
4	25	1	6481949	双鱼座