资金流入流出预测-挑战Baseline (上)

1. 数据准备

In [1]:
import pandas as  pd
import numpy as np
import warnings 
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import datetime 
from scipy import stats
from dateutil.relativedelta import relativedelta
import sklearn as skr

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
# 设置数据集路径

dataset_path = 'Dataset/'
In [4]:
# 读取数据

data_balance = pd.read_csv(dataset_path+'user_balance_table.csv')
In [5]:
# 为数据集添加时间戳

data_balance['date'] = pd.to_datetime(data_balance['report_date'], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday
In [6]:
data_balance.head()
Out[6]:
user_id report_date tBalance yBalance total_purchase_amt direct_purchase_amt purchase_bal_amt purchase_bank_amt total_redeem_amt consume_amt ... category1 category2 category3 category4 date day month year week weekday
0 1 20140805 20385 20383 2 0 0 0 0 0 ... NaN NaN NaN NaN 2014-08-05 5 8 2014 32 1
1 1 20140808 20391 20389 2 0 0 0 0 0 ... NaN NaN NaN NaN 2014-08-08 8 8 2014 32 4
2 1 20140811 20397 20395 2 0 0 0 0 0 ... NaN NaN NaN NaN 2014-08-11 11 8 2014 33 0
3 1 20140814 20403 20401 2 0 0 0 0 0 ... NaN NaN NaN NaN 2014-08-14 14 8 2014 33 3
4 1 20140817 20409 20407 2 0 0 0 0 0 ... NaN NaN NaN NaN 2014-08-17 17 8 2014 33 6

5 rows × 24 columns

2. 时间序列分析

In [7]:
# 聚合时间数据
total_balance = data_balance.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum().reset_index()

In [8]:
# 生成测试集区段数据(时间区段为2014/9/1之后)

start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,1):
    temp = [start, np.nan, np.nan]
    testdata.append(temp)
    start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns
In [9]:
# 拼接数据集

total_balance = pd.concat([total_balance, testdata], axis = 0)
In [10]:
# 为数据集添加时间戳

total_balance['day'] = total_balance['date'].dt.day
total_balance['month'] = total_balance['date'].dt.month
total_balance['year'] = total_balance['date'].dt.year
total_balance['week'] = total_balance['date'].dt.week
total_balance['weekday'] = total_balance['date'].dt.weekday
In [11]:
import matplotlib.pylab as plt

In [12]:
# 画出每日总购买与赎回量的时间序列图

fig = plt.figure(figsize=(20,6))
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],label='purchase')
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],label='redeem')

plt.legend(loc='best')
plt.title("The lineplot of total amount of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [13]:
# 画出4月份以后的时间序列图

total_balance_1 = total_balance[total_balance['date'] >= np.datetime64(datetime.date(2014,4,1))]
fig = plt.figure(figsize=(20,6))
plt.plot(total_balance_1['date'], total_balance_1['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_1['date'], total_balance_1['total_redeem_amt'], label = 'redeem')
plt.legend(loc = 'best')
plt.title("The lineplot of total amount of Purchase and Redeem from April.14 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [14]:
# 分别画出每个月中每天购买赎回量的时间序列图

fig = plt.figure(figsize=(15,15))

plt.subplot(4,1,1)
plt.title("The time series of total amount of Purchase and Redeem for August, July, June, May respectively")

total_balance_2 = total_balance[total_balance['date'] >= np.datetime64(datetime.date(2014,8,1))]
plt.plot(total_balance_2['date'], total_balance_2['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_2['date'], total_balance_2['total_redeem_amt'], label = 'redeem')
plt.legend()


total_balance_3 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,7,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,8,1)))]
plt.subplot(4,1,2)
plt.plot(total_balance_3['date'], total_balance_3['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_3['date'], total_balance_3['total_redeem_amt'], label = 'redeem')
plt.legend()


total_balance_4 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,6,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,7,1)))]
plt.subplot(4,1,3)
plt.plot(total_balance_4['date'], total_balance_4['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_4['date'], total_balance_4['total_redeem_amt'], label = 'redeem')
plt.legend()


total_balance_5 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,1)))]
plt.subplot(4,1,4)
plt.plot(total_balance_5['date'], total_balance_5['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_5['date'], total_balance_5['total_redeem_amt'], label = 'redeem')
plt.legend()

plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [15]:
# 分别画出13年8月与9月每日购买赎回量的时序图

fig = plt.figure(figsize=(15,9))

total_balance_last8 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,8,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,1)))]
plt.subplot(2,1,1)
plt.plot(total_balance_last8['date'], total_balance_last8['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_last8['date'], total_balance_last8['total_redeem_amt'], label = 'redeem')
plt.legend()

total_balance_last9 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,10,1)))]
plt.subplot(2,1,2)
plt.plot(total_balance_last9['date'], total_balance_last9['total_purchase_amt'], label = 'purchase')
plt.plot(total_balance_last9['date'], total_balance_last9['total_redeem_amt'], label = 'redeem')
plt.legend()

plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

3. 翌日特征分析

In [16]:
# 画出每个翌日的数据分布于整体数据的分布图

plt.figure(figsize=(15,10))
scatter_para = {'marker':'.', 's':3, 'alpha':0.3}
line_kws = {'color':'k'}
plt.subplot(2,2,1)
plt.title('The distrubution of total purchase')
sns.violinplot(x='weekday', y='total_purchase_amt', data = total_balance_1, scatter_kws=scatter_para, line_kws=line_kws)
plt.subplot(2,2,2)
plt.title('The distrubution of total purchase')
sns.distplot(total_balance_1['total_purchase_amt'].dropna())
plt.subplot(2,2,3)
plt.title('The distrubution of total redeem')
sns.violinplot(x='weekday', y='total_redeem_amt', data = total_balance_1, scatter_kws=scatter_para, line_kws=line_kws)
plt.subplot(2,2,4)
plt.title('The distrubution of total redeem')
sns.distplot(total_balance_1['total_redeem_amt'].dropna())
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f713054a48>
In [17]:
# 按翌日对数据聚合后取均值

week_sta = total_balance_1[['total_purchase_amt', 'total_redeem_amt', 'weekday']].groupby('weekday', as_index=False).mean()

In [18]:
# 分析翌日的中位数特征

plt.figure(figsize=(12, 5))
ax = plt.subplot(1,2,1)
plt.title('The barplot of average total purchase with each weekday')
ax = sns.barplot(x="weekday", y="total_purchase_amt", data=week_sta, label='Purchase')
ax.legend()
ax = plt.subplot(1,2,2)
plt.title('The barplot of average total redeem with each weekday')
ax = sns.barplot(x="weekday", y="total_redeem_amt", data=week_sta, label='Redeem')
ax.legend()
Out[18]:
<matplotlib.legend.Legend at 0x1f712f600c8>

In [19]:
# 画出翌日的箱型图 (箱形图最大的优点就是不受异常值的影响,可以以一种相对稳定的方式描述数据的离散分布情况。)

plt.figure(figsize=(12, 5))
ax = plt.subplot(1,2,1)
plt.title('The boxplot of total purchase with each weekday')
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=total_balance_1,palette = 'RdBu')
ax = plt.subplot(1,2,2)
plt.title('The boxplot of total redeem with each weekday')
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=total_balance_1, palette = 'RdBu')
In [20]:
# 使用OneHot方法将翌日特征划分,获取划分后特征

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
total_balance = total_balance.reset_index()
week_feature = encoder.fit_transform(total_balance['weekday'].ravel().reshape(-1, 1)).toarray() 
# list to array --> https://zhuanlan.zhihu.com/p/261636002
# reshape是在不改变数据内容的情况下,改变一个数组的格式,参数及返回值,reshape(-1,1)令计算机计算出展开所有的行数,然后转换。
week_feature = pd.DataFrame(week_feature,columns=['weekday_onehot']*len(week_feature[0])) # 字符串复制
feature = pd.concat([total_balance, week_feature], axis = 1)[['total_purchase_amt', 'total_redeem_amt','weekday_onehot','date']]
feature.columns = list(feature.columns[0:2]) + [x+str(i) for i,x in enumerate(feature.columns[2:-1])] + ['date']
In [21]:
feature.head()
Out[21]:
total_purchase_amt total_redeem_amt weekday_onehot0 weekday_onehot1 weekday_onehot2 weekday_onehot3 weekday_onehot4 weekday_onehot5 weekday_onehot6 date
0 32488348.0 5525022.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2013-07-01
1 29037390.0 2554548.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 2013-07-02
2 27270770.0 5953867.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2013-07-03
3 18321185.0 6410729.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2013-07-04
4 11648749.0 2763587.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2013-07-05
In [22]:
# 画出划分后翌日特征与标签的斯皮尔曼相关性

f, ax = plt.subplots(figsize = (20, 6))
plt.subplot(1,2,1)
plt.title('The spearman coleration between total purchase and each weekday')
sns.heatmap(feature[[x for x in feature.columns if x not in ['total_redeem_amt', 'date'] ]].corr('spearman'),linewidths = 0.1, vmax = 0.2, vmin=-0.2)
plt.subplot(1,2,2)
plt.title('The spearman coleration between total redeem and each weekday')
sns.heatmap(feature[[x for x in feature.columns if x not in ['total_purchase_amt', 'date'] ]].corr('spearman'),linewidths = 0.1,  vmax = 0.2, vmin=-0.2)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f712e34b08>
In [23]:
# 测试翌日特征与标签的独立性 Ref: https://github.com/ChuanyuXue/MVTest

# pip install mvtest
from mvtpy.mvtest import mvtest
mv = mvtest()
mv.test(total_balance_1['total_purchase_amt'], total_balance_1['weekday'])
Out[23]:
{'Tn': 6.75, 'p-value': [0, 0.01]}

4. 月份特征分析

In [24]:
# 画出每个月的购买总量分布估计图(kdeplot)

plt.figure(figsize=(15,10))
plt.title('The Probability Density of total purchase amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
for i in range(7, 12):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,i+1,1)))]['total_purchase_amt'],label='13年'+str(i)+'月')
for i in range(1, 9):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14年'+str(i)+'月')
In [25]:
# 画出每个月的赎回总量分布估计图(kdeplot)

plt.figure(figsize=(15,10))
plt.title('The Probability Density of total redeem amount in Each Month')
plt.ylabel('Probability')
plt.xlabel('Amount')
for i in range(7, 12):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,i+1,1)))]['total_redeem_amt'],label='13年'+str(i)+'月')
for i in range(1, 9):
    sns.kdeplot(total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,i,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14年'+str(i)+'月')
In [26]:
# 画出14年五六七八月份的分布估计图

plt.figure(figsize=(12,10))

ax = plt.subplot(2,1,1)
plt.title('The Probability Density of total purchase and redeem amount from May.14 to August.14')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_2['total_purchase_amt'],label='八月')
ax = sns.kdeplot(total_balance_3['total_purchase_amt'],label='七月')
ax = sns.kdeplot(total_balance_4['total_purchase_amt'],label='六月')
ax = sns.kdeplot(total_balance_5['total_purchase_amt'],color='Black',label='五月')
ax = plt.subplot(2,1,2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_2['total_redeem_amt'],label='八月')
ax = sns.kdeplot(total_balance_3['total_redeem_amt'],label='七月')
ax = sns.kdeplot(total_balance_4['total_redeem_amt'],label='六月')
ax = sns.kdeplot(total_balance_5['total_redeem_amt'],color='Black',label='五月')
In [27]:
# 画出13年八月到九月份的分布估计图

total_balance_last_7 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,7,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,8,1)))]
total_balance_last_8 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,8,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,1)))]
total_balance_last_9 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,10,1)))]
total_balance_last_10 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,10,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,11,1)))]
plt.figure(figsize=(12,10))
ax = plt.subplot(2,1,1)
plt.title('The Probability Density of total purchase and redeem amount from Aug.13 to Sep.13')
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_purchase_amt'],label='August')
ax = sns.kdeplot(total_balance_last_7['total_purchase_amt'],label='July')
ax = sns.kdeplot(total_balance_last_9['total_purchase_amt'],color='Red',label='September')

ax = plt.subplot(2,1,2)
plt.ylabel('Probability')
plt.xlabel('Amount')
ax = sns.kdeplot(total_balance_last_8['total_redeem_amt'],label='August')
ax = sns.kdeplot(total_balance_last_7['total_redeem_amt'],label='July')
ax = sns.kdeplot(total_balance_last_9['total_redeem_amt'],color='Red',label='September')
ax = sns.kdeplot(total_balance_last_10['total_redeem_amt'],color='Black',label='Novermber')

5. 日期特征分析

In [28]:
# 按照每天聚合数据集

day_sta = total_balance_2[['total_purchase_amt', 'total_redeem_amt', 'day']].groupby('day', as_index=False).mean()
In [29]:
# 获取聚合后每月购买分布的柱状图
plt.figure(figsize = (12,5))
ax = sns.barplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase',palette='cividis')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', color = 'red')
ax.legend()
plt.title("The total Purchase in Aug.14")
Out[29]:
Text(0.5, 1.0, 'The total Purchase in Aug.14')
In [30]:
# 获取聚合后每月赎回分布的柱状图
plt.figure(figsize = (12,5))
ax = sns.barplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', palette='cividis')
ax = sns.lineplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', color = 'red')
ax.legend()
plt.title("The total Redeem in Aug.14")
Out[30]:
Text(0.5, 1.0, 'The total Redeem in Aug.14')
In [31]:
# 画出13年九月份的分布图

plt.figure(figsize=(12,10))
day_sta = total_balance_last_9[['total_purchase_amt', 'total_redeem_amt', 'day']].groupby('day', as_index=False).mean()
plt.subplot(2,1,1)
plt.title("The total Purchase in Sep.13")
ax = sns.barplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', palette= 'cividis')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=day_sta, label='Purchase', color = 'red')
plt.subplot(2,1,2)
plt.title("The total Redeem in Sep.13")
bx = sns.barplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', palette= 'cividis')
bx = sns.lineplot(x="day", y="total_redeem_amt", data=day_sta, label='Redeem', color = 'red')
bx.legend()
Out[31]:
<matplotlib.legend.Legend at 0x1f712b320c8>

We find that the data from last year in Sep has very limited week feature

There are some strange day in Sep:

  1. 1st day
  2. 2nd day
  3. 16th day(Purchase a lot)---Monday & 3days before MidAutumn Festirval
  4. 11th day and 25th day(Redeem a lot)---Both of Wednesday
  5. 18 19 20(Both Purchase and Redeem is very low)

image.png

In [32]:
# 画出历史所有天的热力图

test = np.zeros((max(total_balance_1['week']) - min(total_balance_1['week']) + 1, 7))
test[total_balance_1['week'] - min(total_balance_1['week']), total_balance_1['weekday']] = total_balance_1['total_purchase_amt']

f, ax = plt.subplots(figsize = (10, 4))
sns.heatmap(test,linewidths = 0.1, ax=ax)
ax.set_title("Purchase")
ax.set_xlabel('weekday')
ax.set_ylabel('week')

test = np.zeros((max(total_balance_1['week']) - min(total_balance_1['week']) + 1, 7))
test[total_balance_1['week'] - min(total_balance_1['week']), total_balance_1['weekday']] = total_balance_1['total_redeem_amt']

f, ax = plt.subplots(figsize = (10, 4))
sns.heatmap(test,linewidths = 0.1, ax=ax)
ax.set_title("Redeem")
ax.set_xlabel('weekday')
ax.set_ylabel('week')
Out[32]:
Text(71.0, 0.5, 'week')

From the heat map we find that the data of week 4 and weekday 6 is very strange, and week 12 weekday 2 either image.png

In [33]:
# 对于热力图中异常点的数据分析.1

total_balance_1[(total_balance_1['week'] == 4 + min(total_balance_1['week'])) & (total_balance_1['weekday'] == 6)]
Out[33]:
date total_purchase_amt total_redeem_amt day month year week weekday
307 2014-05-04 303087562.0 413222034.0 4 5 2014 18 6

2014-5-4 is a special day in China, It is the first workday after the Labour day! image.png

In [34]:
# 对于热力图中异常点的数据分析.2

total_balance_1[(total_balance_1['week'] == 12 + min(total_balance_1['week'])) & (total_balance_1['weekday'] == 2)]
Out[34]:
date total_purchase_amt total_redeem_amt day month year week weekday
359 2014-06-25 264663201.0 547295931.0 25 6 2014 26 2

In 2016-06-25 They Redeemed a lot but Purchase a little: image.png

image.png

6. 节假日分析

  1. The QingMing festerval (April.5 - April.7)
  2. The Labour day (May.1 - May.5)
  3. The DuanWu festeval (May.31 - June.2)
  4. The MidAutumn festeval (Sep.6 - Sep.8)

Others

  1. Mother day(May.13)
  2. Father day(June. 17)
  3. TianMao 618 sales(June 10 - June 20)
  4. Teachers' day(Sep 9)
In [35]:
# 获取节假日的数据

qingming = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,5))) & (total_balance['date'] < np.datetime64(datetime.date(2014,4,8)))]
labour = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,5,4)))]
duanwu = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,31))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,3)))]
data618 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,6,10))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,20)))]
In [36]:
# 画出节假日与平时的均值

fig = plt.figure(figsize = (12,5))
index_list = ['清明','劳动节','端午','618','平均']
label_list = [np.mean(qingming['total_purchase_amt']), np.mean(labour['total_purchase_amt']),np.mean(duanwu['total_purchase_amt']),np.mean(data618['total_purchase_amt']),np.mean(total_balance_1['total_purchase_amt'])]
plt.bar(index_list, label_list, label="Purchase")

index_list = ['清明.','劳动.','端午.','618.','平均.']
label_list = [np.mean(qingming['total_redeem_amt']), np.mean(labour['total_redeem_amt']),np.mean(duanwu['total_redeem_amt']),np.mean(data618['total_redeem_amt']),np.mean(total_balance_1['total_redeem_amt'])]
plt.bar(index_list, label_list, label="Redeem")
plt.title("The average of different holiday")
plt.ylabel("Amount")
plt.legend()
plt.show()
In [37]:
# 画出节假日购买量与其所处翌日的对比

plt.figure(figsize = (12,5))
import numpy as np
import matplotlib.pyplot as plt
size = 4
x = np.arange(size)

total_width, n = 0.8, 2    
width = total_width / n
x = x - (total_width - width) / 2

a = [176250006, 167825284, 162844282,321591063]
b = [225337516, 241859315, 225337516,307635449]

plt.bar(x, a,  width=width, label='Holiday_Purchase')
plt.bar(x + width, b, width=width, label='Normal_Purchase')
plt.xticks(x + width / 2, ('清明', '劳动节', '端午', '618'))
plt.title("节假日购买量与其所处翌日的对比")
plt.legend()
plt.show()
In [38]:
# 画出节假日赎回量与其所处翌日的对比
plt.figure(figsize = (12,5))
import numpy as np
import matplotlib.pyplot as plt
size = 4
x = np.arange(size)

total_width, n = 0.8, 2     
width = total_width / n
x = x - (total_width - width) / 2

a = [159914308, 154717620, 154366940,291016763]
b = [235439685, 240364238, 235439685,313310347]

plt.bar(x, a,  width=width, label='Holiday_Redeem')
plt.bar(x + width, b, width=width, label='Normal_Redeem')
plt.xticks(x + width / 2, ('清明', '劳动节', '端午', '618'))
plt.title("节假日赎回量与其所处翌日的对比")
plt.legend()
plt.show()

7. 节假日周围日期分析

In [39]:
# 画出清明节与周边日期的时序图

plt.figure(figsize = (12,5))
qingming_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,1))) & (total_balance['date'] < np.datetime64(datetime.date(2014,4,13)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=qingming_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=qingming_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=qingming, ax = ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=qingming, ax=ax)
plt.title("清明节与周边日期的时序图")
ax.legend()
Out[39]:
<matplotlib.legend.Legend at 0x1f71400a548>
In [40]:
# 画出劳动节与周边日期的时序图

plt.figure(figsize = (12,5))
labour_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,4,25))) & (total_balance['date'] < np.datetime64(datetime.date(2014,5,10)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=labour_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=labour_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=labour, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=labour, ax=ax)
plt.title("劳动节与周边日期的时序图")
ax.legend()
Out[40]:
<matplotlib.legend.Legend at 0x1f7140d8d48>
In [41]:
# 画出端午节与周边日期的时序图
plt.figure(figsize = (12,5))
duanwu_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2014,5,25))) & (total_balance['date'] < np.datetime64(datetime.date(2014,6,7)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=duanwu_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=duanwu_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=duanwu, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=duanwu, ax=ax)
plt.title("端午节与周边日期的时序图")
ax.legend()
Out[41]:
<matplotlib.legend.Legend at 0x1f712cf9bc8>
In [42]:
# 画出中秋与周边日期的时序图
plt.figure(figsize = (12,5))
zhongqiu = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,19))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,22)))]
zhongqiu_around = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,9,14))) & (total_balance['date'] < np.datetime64(datetime.date(2013,9,28)))]
ax = sns.lineplot(x="date", y="total_purchase_amt", data=zhongqiu_around, label='Purchase')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=zhongqiu_around, label='Redeem', ax=ax)
ax = sns.scatterplot(x="date", y="total_purchase_amt", data=zhongqiu, ax=ax)
ax = sns.scatterplot(x="date", y="total_redeem_amt", data=zhongqiu, ax=ax)
plt.title("中秋与周边日期的时序图(in 2013)")
ax.legend()
Out[42]:
<matplotlib.legend.Legend at 0x1f713f4ec48>

8. 异常值分析

In [43]:
# 画出用户交易纪录的箱型图

sns.boxplot(data_balance['total_purchase_amt'])
plt.title("The abnormal value of total purchase")
Out[43]:
Text(0.5, 1.0, 'The abnormal value of total purchase')
In [44]:
# 对于购买2e8的用户的交易行为分析

data_balance[data_balance['user_id'] == 14592].sort_values(by = 'total_redeem_amt',axis = 0,ascending = False).head()
Out[44]:
user_id report_date tBalance yBalance total_purchase_amt direct_purchase_amt purchase_bal_amt purchase_bank_amt total_redeem_amt consume_amt ... category1 category2 category3 category4 date day month year week weekday
1453311 14592 20131104 99457728 0 201768328 201768328 201275171 493157 102310600 0 ... NaN NaN NaN NaN 2013-11-04 4 11 2013 45 0
1453388 14592 20140616 0 98964529 1966014 1953569 0 1953569 100930543 0 ... NaN NaN NaN NaN 2014-06-16 16 6 2014 25 0
1453227 14592 20131226 367063 98296082 17369 0 0 0 97946388 0 ... NaN NaN NaN NaN 2013-12-26 26 12 2013 52 3
1453313 14592 20131105 97458675 99457728 4899446 4899446 4899446 0 6898499 0 ... NaN NaN NaN NaN 2013-11-05 5 11 2013 45 1
1453355 14592 20140617 0 0 339679 339679 0 339679 339679 0 ... NaN NaN NaN NaN 2014-06-17 17 6 2014 25 1

5 rows × 24 columns

1311 Bought 2E Seal 1E 1312 Seal 1E 1405 Bought 0.9E 1406 Seal 1E

In [45]:
# 画出单笔交易为2e8的那天的总交易量及附近几天的交易量

plt.figure(figsize = (12,5))
e2 = total_balance[(total_balance['date'] >= np.datetime64(datetime.date(2013,11,1))) & (total_balance['date'] < np.datetime64(datetime.date(2013,11,10)))]
ax = sns.barplot(x="day", y="total_purchase_amt", data=e2, label='2E')
ax = sns.lineplot(x="day", y="total_purchase_amt", data=e2, label='2E')
plt.title("The influence of the big deal with 200 million purchasing(Red Bar)")
ax.legend()
Out[45]:
<matplotlib.legend.Legend at 0x1f713fee808>
In [46]:
# 画出每日单笔最大交易的时序图

plt.figure(figsize=(20, 6))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max(), label='最大购买')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max(), label='最大赎回')
plt.title("The Biggest deal happend in each day")
Out[46]:
Text(0.5, 1.0, 'The Biggest deal happend in each day')
In [47]:
# 画出每日单笔最大交易以及总交易额的时序图

plt.figure(figsize=(25, 8))
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).max(), label='MAX_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).max(), label='MAX_REDEEM')
ax = sns.lineplot(x="date", y="total_purchase_amt", data=data_balance[['total_purchase_amt', 'date']].groupby('date', as_index=False).sum(), label='TOTAL_PURCHASE')
ax = sns.lineplot(x="date", y="total_redeem_amt", data=data_balance[['total_redeem_amt', 'date']].groupby('date', as_index=False).sum(), label='TOTAL_REDEEM')
In [48]:
# 画出每个月大额交易的频次直方图

plt.figure(figsize=(12, 5))
big_frequancy = data_balance[(data_balance['total_purchase_amt'] > 10000000) | (data_balance['total_redeem_amt'] > 10000000)][['month','year','user_id']].groupby(['year','month'], as_index=False).count()
big_frequancy['i'] = big_frequancy['year']  + big_frequancy['month'] / 100
ax = sns.barplot(x="i", y="user_id", data=big_frequancy)
plt.title("The frequency of super big deal(larger than 100 million) in each month")
Out[48]:
Text(0.5, 1.0, 'The frequency of super big deal(larger than 100 million) in each month')
In [49]:
# 获取大额交易的数据集

data_balance['big_purchase'] = 0
data_balance.loc[data_balance['total_purchase_amt'] > 1000000, 'big_purchase'] = 1
data_balance['big_redeem'] = 0
data_balance.loc[data_balance['total_redeem_amt'] > 1000000, 'big_redeem'] = 1
In [50]:
# 对大额交易按每天做聚合操作

big_purchase = data_balance[data_balance['big_purchase'] == 1].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
small_purchase = data_balance[data_balance['big_purchase'] == 0].groupby(['date'], as_index=False)['total_purchase_amt'].sum()
big_redeem = data_balance[data_balance['big_redeem'] == 1].groupby(['date'], as_index=False)['total_redeem_amt'].sum()
small_redeem = data_balance[data_balance['big_redeem'] == 0].groupby(['date'], as_index=False)['total_redeem_amt'].sum()
In [51]:
# 画出大额交易与小额交易的时序分布图

fig = plt.figure(figsize=(20,6))
plt.plot(big_purchase['date'], big_purchase['total_purchase_amt'],label='big_purchase', color = 'red')
plt.plot(big_redeem['date'], big_redeem['total_redeem_amt'],label='big_redeem', color = 'orange')

plt.plot(small_purchase['date'], small_purchase['total_purchase_amt'],label='small_purchase', color = 'blue')
plt.plot(small_redeem['date'], small_redeem['total_redeem_amt'],label='small_redeem', color = 'dodgerblue')
plt.legend(loc='best')
plt.title("The time series of big deal of Purchase and Redeem from July.13 to Sep.14")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [52]:
# 画出大额交易与小额交易的分布估计图

plt.figure(figsize=(15,10))

plt.subplot(2,2,1)
for i in range(4, 9):
    sns.kdeplot(big_purchase[(big_purchase['date'] >= np.datetime64(datetime.date(2014,i,1))) & (big_purchase['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14Y,'+str(i)+'M')
plt.title('BIG PURCHASE')

plt.subplot(2,2,2)
for i in range(4, 9):
    sns.kdeplot(small_purchase[(small_purchase['date'] >= np.datetime64(datetime.date(2014,i,1))) & (small_purchase['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_purchase_amt'],label='14Y,'+str(i)+'M')
plt.title('SMALL PURCHASE')

plt.subplot(2,2,3)
for i in range(4, 9):
    sns.kdeplot(big_redeem[(big_redeem['date'] >= np.datetime64(datetime.date(2014,i,1))) & (big_redeem['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14Y,'+str(i)+'M')
plt.title('BIG REDEEM')

plt.subplot(2,2,4)
for i in range(4, 9):
    sns.kdeplot(small_redeem[(small_redeem['date'] >= np.datetime64(datetime.date(2014,i,1))) & (small_redeem['date'] < np.datetime64(datetime.date(2014,i+1,1)))]['total_redeem_amt'],label='14Y,'+str(i)+'M')
plt.title('SMALL REDEEM')
Out[52]:
Text(0.5, 1.0, 'SMALL REDEEM')
In [53]:
# 添加时间戳

big_purchase['weekday'] = big_purchase['date'].dt.weekday
small_purchase['weekday'] = small_purchase['date'].dt.weekday
big_redeem['weekday'] = big_redeem['date'].dt.weekday
small_redeem['weekday'] = small_redeem['date'].dt.weekday
In [54]:
# 分析大额小额的翌日分布

plt.figure(figsize=(15, 10))

ax = plt.subplot(2,2,1)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=big_purchase[big_purchase['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('BIG PURCHASE')

ax = plt.subplot(2,2,2)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=big_redeem[big_redeem['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('BIG REDEEM')

ax = plt.subplot(2,2,3)
ax = sns.boxplot(x="weekday", y="total_purchase_amt", data=small_purchase[small_purchase['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('SMALL PURCHASE')

ax = plt.subplot(2,2,4)
ax = sns.boxplot(x="weekday", y="total_redeem_amt", data=small_redeem[small_redeem['date'] >= np.datetime64(datetime.date(2014,4,1))])
plt.title('SMALL REDEEM')
Out[54]:
Text(0.5, 1.0, 'SMALL REDEEM')

9. 用户交易纪录表中其他变量相关分析

In [55]:
# 截断数据集

data_balance_1 = data_balance[data_balance['date'] > datetime.datetime(2014,4,1)]
In [56]:
# 画出用户交易纪录表中其他变量与标签的相关性图


plt.figure(figsize = (12,5))
feature = ['total_purchase_amt','total_redeem_amt', 'report_date', 'tBalance', 'yBalance', 
       'direct_purchase_amt', 'purchase_bal_amt', 'purchase_bank_amt',
        'consume_amt', 'transfer_amt', 'tftobal_amt',
       'tftocard_amt', 'share_amt']

sns.heatmap(data_balance_1[feature].corr(), linewidths = 0.1)   
plt.title("The coleration between each feature in User_Balance_Table")
Out[56]:
Text(0.5, 1, 'The coleration between each feature in User_Balance_Table')

10. 银行、支付宝利率分析

In [57]:
# 读取银行利率并添加时间戳

bank = pd.read_csv(dataset_path + "mfd_bank_shibor.csv")
bank = bank.rename(columns = {'mfd_date': 'date'})
bank_features = [x for x in bank.columns if x not in ['date']]
bank['date'] = pd.to_datetime(bank['date'], format= "%Y%m%d")
bank['day'] = bank['date'].dt.day
bank['month'] = bank['date'].dt.month
bank['year'] = bank['date'].dt.year
bank['week'] = bank['date'].dt.week
bank['weekday'] = bank['date'].dt.weekday
In [58]:
# 读取支付宝利率并添加时间戳

share = pd.read_csv(dataset_path + 'mfd_day_share_interest.csv')
share = share.rename(columns = {'mfd_date': 'date'})
share_features = [x for x in share.columns if x not in ['date']]
share['date'] = pd.to_datetime(share['date'], format= "%Y%m%d")
share['day'] = share['date'].dt.day
share['month'] = share['date'].dt.month
share['year'] = share['date'].dt.year
share['week'] = share['date'].dt.week
share['weekday'] = share['date'].dt.weekday
In [59]:
# 画出上一天银行及支付宝利率与标签的相关性图

bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration between each lastday bank rate and total purchase")
temp = pd.merge(bank[['last_date']+bank_features], total_balance, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration between each lastday bank rate and total redeem")
temp = pd.merge(bank[['last_date']+bank_features], total_balance, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f7165be8c8>
In [60]:
# 画出上一星期银行及支付宝利率与标签的相关性图

bank['last_week'] = bank['week'] + 1
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration between each last week bank rate and total purchase")
temp = pd.merge(bank[['last_week','weekday']+bank_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration between each last week bank rate and total redeem")
temp = pd.merge(bank[['last_week','weekday']+bank_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f716677608>
In [61]:
# 分别画出上一星期银行及支付宝利率与大额小额数据的相关性图

bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration of Small Rate purchase")
temp = pd.merge(bank[['last_date']+bank_features], small_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration of Small Rate redeem")
temp = pd.merge(bank[['last_date']+bank_features], small_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  

bank['last_date'] = bank['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("The coleration of Big Rate purchase")
temp = pd.merge(bank[['last_date']+bank_features], big_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)  
plt.subplot(1,3,3)
plt.title("The coleration of Big Rate redeem")
temp = pd.merge(bank[['last_date']+bank_features], big_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+bank_features]
sns.heatmap(temp.corr(), linewidths = 0.05)
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f718487608>
In [62]:
# 画出银行利率的时序图

plt.figure(figsize=(15,5))
for i in bank_features:
    plt.plot(bank['date'], bank[[i]] ,label=i)
plt.legend()
plt.title("The time series of bank rate")
plt.xlabel("Time")
plt.ylabel("Rate")
Out[62]:
Text(0, 0.5, 'Rate')
In [63]:
# 画出部分银行利率与购买量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],'b',label="Interest_3_M")
plt.plot(bank['date'], bank['Interest_6_M'],'cyan',label="Interest_6_M")
plt.plot(bank['date'], bank['Interest_9_M'],'skyblue',label="Interest_9_M")

plt.legend()

ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],'g',label="Total purchase")

plt.legend(loc=2)
plt.title("The time series of bank rate and purchase")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")
plt.show()
In [64]:
# 画出部分银行利率与赎回量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],'b',label="Interest_3_M")
plt.plot(bank['date'], bank['Interest_6_M'],'cyan',label="Interest_6_M")
plt.plot(bank['date'], bank['Interest_9_M'],'skyblue',label="Interest_9_M")

plt.legend()

ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],'g',label="Total redeem")

plt.legend(loc=2)
plt.title("The time series of bank rate and redeem")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")

plt.show()

The information for Share rate

In [65]:
# 画出支付宝利率与标签的相关性图

share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_date']+share_features], total_balance, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)  
plt.subplot(1,3,3)
temp = pd.merge(share[['last_date']+share_features], total_balance, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f718600848>
In [66]:
# 画出银行利率与标签的相关性图

share['last_week'] = share['week'] + 1
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_week','weekday']+share_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)  
plt.subplot(1,3,3)
temp = pd.merge(share[['last_week','weekday']+share_features], total_balance, left_on=['last_week','weekday'], right_on=['week','weekday'])[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin = 0)
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f718a04208>
In [67]:
# 画出支付宝利率与购买量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
for i in share_features:
    plt.plot(share['date'], share[i],'b',label=i)
    break
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_purchase_amt'],'g',label="Total purchase")
plt.legend(loc=2)
plt.show()
In [68]:
# 画出支付宝利率与赎回量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
for i in share_features:
    plt.plot(share['date'], share[i],'b',label=i)
    break
plt.legend()
ax2=ax1.twinx()
plt.plot(total_balance['date'], total_balance['total_redeem_amt'],'g',label="Total redeem")
plt.legend(loc=2)
plt.show()
In [69]:
# 画出大额小额数据与支付宝利率的相关性图

share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
temp = pd.merge(share[['last_date']+share_features], small_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)  
plt.title("SMALL PURCHASE")
plt.subplot(1,3,3)
plt.title("SMALL REDEEM")
temp = pd.merge(share[['last_date']+share_features], small_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)  

share['last_date'] = share['date'] + datetime.timedelta(days=1)
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.title("BIG PURCHASE")
temp = pd.merge(share[['last_date']+share_features], big_purchase, left_on='last_date', right_on='date')[['total_purchase_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)  
plt.subplot(1,3,3)
plt.title("BIG REDEEM")
temp = pd.merge(share[['last_date']+share_features], big_redeem, left_on='last_date', right_on='date')[['total_redeem_amt']+share_features]
sns.heatmap(temp.corr(), linewidths = 0.05, vmin=0)
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f71e018088>
In [70]:
# 画出银行利率与支付宝利率的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(bank['date'], bank['Interest_3_M'],c='g',label= 'BANK')

plt.legend()
ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],label='SHARE')
plt.legend(loc=2)
plt.show()

11. 小结

It seems that:

  1. The influence of share is more likely to act on Purchase
  2. The influence of bank rate is more likely to act on Redeem
  3. The influence of share rate is for short
  4. The influence of bank rate is for long

based on above analysis, we can simply find these features:

  1. the weekday
  2. is it weekend
  3. is it holidy
  4. the distance from the start of week(monday)
  5. the distance from the end of week(sunday)
  6. the distance from the holiday centre(centre of QingMing DuanWu Labour ZhongQiu)
  7. the distance from the start of month
  8. the distance from the end of month
  9. the mean/max/min value of the same week in last month
  10. the value in last day of last month

1. 大小额数据分析

前面的分析可以看出,某些用户的交易额很大,对于日交易量很明显,这里统计四月份后依旧活跃的大额用户(大于100万)

In [71]:
# 获得大额用户的集合

temp = data_balance[(data_balance['total_purchase_amt'] >= 1000000) | (data_balance['total_redeem_amt'] >= 1000000)]
big_users_set = set(temp[temp['date'] >= datetime.datetime(2014,4,1)]['user_id'])
In [72]:
len(big_users_set)
Out[72]:
3903
In [73]:
max(data_balance['user_id'])
Out[73]:
28041
In [74]:
len(big_users_set)/max(data_balance['user_id'])*100
Out[74]:
13.918904461324487

大额用户占3903个,占总数的13.92%

In [75]:
# 标记大额用户

data_balance['big_user'] = 0
data_balance.loc[data_balance['user_id'].isin(big_users_set), 'big_user'] = 1
In [76]:
# 统计大额用户与小额用户的日总交易额的区别

total_balance_bigNsmall = data_balance.groupby(['date','big_user'], 
                                               as_index=False)['total_purchase_amt','total_redeem_amt'].sum()

In [77]:
# 画出大额用户与小额用户交易的日总交易量图

fig = plt.figure(figsize=(20,8))
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_purchase_amt'],label='big_purchase')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_redeem_amt'],label='big_redeem')

plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_purchase_amt'],label='small_purchase')
plt.plot(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['date'], total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_redeem_amt'],label='small_redeem')
plt.legend(loc='best')
plt.title("The time series of big and small user of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [78]:
# 统计大额小额用户购买量占比

np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_purchase_amt']) / np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_purchase_amt'])
Out[78]:
5.1456178397775805
In [79]:
# 统计大额小额用户赎回量占比

np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 1]['total_redeem_amt']) / np.sum(total_balance_bigNsmall[total_balance_bigNsmall['big_user'] == 0]['total_redeem_amt'])
Out[79]:
5.422141272341089

2. 分析用户的交易频次

In [80]:
# 画出非0交易的分布图

frequency = data_balance[(data_balance['direct_purchase_amt'] != 0) | (data_balance['total_redeem_amt'] != 0)][['user_id','tBalance']].groupby('user_id', as_index=False).count()
sns.distplot(frequency['tBalance'])
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f71e8ca508>
In [81]:
# 获取频繁交易用户集合

hot_users_set = set(frequency[frequency['tBalance'] > 30]['user_id'])
In [82]:
# 获取频繁用户的交易纪录

data_balance['is_hot_users'] = 0
data_balance.loc[data_balance['user_id'].isin(hot_users_set) , 'is_hot_users'] = 1
In [83]:
# 统计频繁用户与非频繁用户的日总交易额的区别

total_balance_hotNcold = data_balance.groupby(['date','is_hot_users'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
In [84]:
# 绘制频繁用户与非频繁用户总购买赎回量的时序图

fig = plt.figure(figsize=(20,8))
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['total_purchase_amt'],label='hot_purchase')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 1]['total_redeem_amt'],label='hot_redeem')

plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['total_purchase_amt'],label='cold_purchase')
plt.plot(total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['date'], total_balance_hotNcold[total_balance_hotNcold['is_hot_users'] == 0]['total_redeem_amt'],label='cold_redeem')
plt.legend(loc='best')
plt.title("The time series of big and small user of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [85]:
# 画出冷启动用户与老用户交易的箱型图

temp = data_balance[['year','month','user_id','total_purchase_amt','total_redeem_amt']].groupby(['year','month','user_id'], as_index=False).sum()
user_old_set = set()
plt.figure(figsize=(10,30))
for i in range(1, 9):
    newset = set(temp[(temp['year'] == 2014) & (temp['month'] == i)]['user_id'])
    this_month = data_balance[(data_balance['year'] == 2014) & (data_balance['month'] == i)]
    this_month['cold'] = 0
    this_month.loc[this_month['user_id'].isin(newset - user_old_set), 'cold'] = 1
    plt.subplot(4,2,i)
    plt.title('This month : ' + str(i))
    sns.boxplot(x="cold", y="total_purchase_amt" , data=this_month[(this_month['direct_purchase_amt'] != 0) | (this_month['total_redeem_amt'] != 0)])
    user_old_set = user_old_set | newset
plt.show()

3. 用户其他属性分析

In [86]:
# 用户的其他属性
users = pd.read_csv('Dataset/user_profile_table.csv')
users.head()
Out[86]:
user_id sex city constellation
0 2 1 6411949 狮子座
1 12 1 6412149 摩羯座
2 22 1 6411949 双子座
3 23 1 6411949 双鱼座
4 25 1 6481949 双鱼座
In [87]:
# 添加城市、星座、性别

data_balance = pd.merge(data_balance, users, on='user_id')
In [88]:
# 统计每个城市用户的日总交易额的区别并绘制分布估计图

fig = plt.figure(figsize=(10,5))
for i in np.unique(data_balance['city']):
    temp = data_balance.groupby(['date','city'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['city'] == i]['total_purchase_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of different city of Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

fig = plt.figure(figsize=(10,5))
for i in np.unique(data_balance['city']):
    temp = data_balance.groupby(['date','city'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['city'] == i]['total_redeem_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of different city of Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [89]:
# 统计每个性别用户的日总交易额的区别,并绘制时序图

temp = data_balance.groupby(['date','sex'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()

fig = plt.figure(figsize=(20,6))
plt.plot(temp[temp['sex'] == 1]['date'], temp[temp['sex'] == 1]['total_purchase_amt'],label='Male')
plt.plot(temp[temp['sex'] == 0]['date'], temp[temp['sex'] == 0]['total_purchase_amt'],label='Female')
plt.legend(loc='best')
plt.title("The time series of two gender user of Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

fig = plt.figure(figsize=(20,6))
plt.plot(temp[temp['sex'] == 1]['date'], temp[temp['sex'] == 1]['total_purchase_amt'],label='Male')
plt.plot(temp[temp['sex'] == 0]['date'], temp[temp['sex'] == 0]['total_redeem_amt'],label='Female')
plt.legend(loc='best')
plt.title("The time series of two gender user of Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [90]:
data_balance['constellation']
Out[90]:
0          白羊座
1          白羊座
2          白羊座
3          白羊座
4          白羊座
          ... 
2840416    处女座
2840417    处女座
2840418    处女座
2840419    处女座
2840420    摩羯座
Name: constellation, Length: 2840421, dtype: object
In [ ]:
# 统计每个星座用户的日总交易额的区别 并绘制分布估计图

fig = plt.figure(figsize=(12,5))
for i in np.unique(data_balance['constellation']):
    temp = data_balance.groupby(['date','constellation'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['constellation'] == i]['total_purchase_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of small deal of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

fig = plt.figure(figsize=(12,5))
for i in np.unique(data_balance['constellation']):
    temp = data_balance.groupby(['date','constellation'], as_index=False)['total_purchase_amt','total_redeem_amt'].sum()
    ax = sns.kdeplot( temp[temp['constellation'] == i]['total_redeem_amt'],label=i)
plt.legend(loc='best')
plt.title("The time series of small deal of Purchase and Redeem")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

4. 其他分析

In [ ]:
# 统计每个性别用户的日总交易额的区别

temp = data_balance.groupby(['date'], as_index=False)['direct_purchase_amt','share_amt'].sum()

In [ ]:
# 画出每日利息的增长/直接购买量的时序图

fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'],  temp['share_amt'] / temp['direct_purchase_amt'] ,label='Rate')
plt.legend(loc='best')
plt.title("The rate of Share / Purchase")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [ ]:
# 加载支付宝利率数据

share = pd.read_csv('Dataset/mfd_day_share_interest.csv')
share = share.rename(columns = {'mfd_date': 'date'})
share_features = [x for x in share.columns if x not in ['date']]
share['date'] = pd.to_datetime(share['date'], format= "%Y%m%d")
share['day'] = share['date'].dt.day
share['month'] = share['date'].dt.month
share['year'] = share['date'].dt.year
share['week'] = share['date'].dt.week
share['weekday'] = share['date'].dt.weekday
In [ ]:
# 绘制支付宝利率与交易额的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(temp['date'], temp['share_amt'],'b',label="Share_amt")
plt.legend()

ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],'g',label="Share rate")

plt.legend(loc=2)
plt.title("The correlation between share rate and share amount")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")

plt.show()
In [ ]:
# 支付宝利率与每日利息的增长/直接购买量的时序图

fig,ax1 = plt.subplots(figsize=(15,5))
plt.plot(temp['date'], temp['share_amt'] / temp['direct_purchase_amt'],'b',label="Share_amt / Direct_amt")
plt.legend()

ax2=ax1.twinx()
plt.plot(share['date'], share['mfd_daily_yield'],'g',label="Share rate")

plt.legend(loc=2)
plt.title("The correlation between share rate and Share/Purchase")
plt.xlabel("Time")
plt.ylabel("Rate & Amount")

plt.show()

直接购买里面有两种

In [ ]:
# 聚合两种不同购买方式

temp = data_balance.groupby(['date'], as_index=False)['purchase_bal_amt','purchase_bank_amt'].sum()
In [ ]:
# 画出不同购买方式日购买量的时序图

fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['purchase_bal_amt'],label='Bal')
plt.plot(temp['date'], temp['purchase_bank_amt'],label='Bank')
plt.legend(loc='best')
plt.title("The purchase of bal and bank")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()
In [ ]:
# 画出不同赎回方式日赎回量的时序图

temp = data_balance.groupby(['date'], as_index=False)['tftobal_amt','tftocard_amt'].sum()
fig = plt.figure(figsize=(20,6))
plt.plot(temp['date'], temp['tftobal_amt'],label='Bal')
plt.plot(temp['date'], temp['tftocard_amt'],label='Bank')
plt.legend(loc='best')
plt.title("The redeem of bal and bank")
plt.xlabel("Time")
plt.ylabel("Amount")
plt.show()

5. 小结

It seems that:

  1. The influence of share is more likely to act on Purchase
  2. The influence of bank rate is more likely to act on Redeem
  3. The influence of share rate is for short
  4. The influence of bank rate is for long

based on above analysis, we can simply find these features:

  1. the weekday
  2. is it weekend
  3. is it holidy
  4. the distance from the start of week(monday)
  5. the distance from the end of week(sunday)
  6. the distance from the holiday centre(centre of QingMing DuanWu Labour ZhongQiu)
  7. the distance from the start of month
  8. the distance from the end of month
  9. the mean/max/min value of the same week in last month
  10. the value in last day of last month
  11. Although Alipay or bank card buying is not very different, but the difference on the redeem is quite obvious
  12. 3903 large users account for 13.92% of the total

1. 辅助函数整合

In [106]:
# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
    data_balance = pd.read_csv(path)
    data_balance = add_timestamp(data_balance)
    return data_balance.reset_index(drop=True)
    

# add tiemstamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
    data_balance = data.copy()
    data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
    data_balance['day'] = data_balance['date'].dt.day
    data_balance['month'] = data_balance['date'].dt.month
    data_balance['year'] = data_balance['date'].dt.year
    data_balance['week'] = data_balance['date'].dt.week
    data_balance['weekday'] = data_balance['date'].dt.weekday
    return data_balance.reset_index(drop=True)

# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
    df_tmp = data.copy()
    df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()
    df_tmp.reset_index(inplace=True)
    return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)

# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
    total_balance = data.copy()
    start = datetime.datetime(2014,9,1)
    testdata = []
    while start != datetime.datetime(2014,10,15):
        temp = [start, np.nan, np.nan]
        testdata.append(temp)
        start += datetime.timedelta(days = 1)
    testdata = pd.DataFrame(testdata)
    testdata.columns = total_balance.columns

    total_balance = pd.concat([total_balance, testdata], axis = 0)
    total_balance = total_balance.reset_index(drop=True)
    return total_balance.reset_index(drop=True)

# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
    return pd.read_csv(path)
In [107]:
# 载入数据

balance_data = load_data('Dataset/user_balance_table.csv')
balance_data = add_timestamp(balance_data)
total_balance = get_total_balance(balance_data, date = '2014-03-01')
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')
In [108]:
# 创建数据的深层拷贝

data = total_balance.copy()

2. 时间序列规则设定

In [109]:
# 定义生成时间序列规则预测结果的方法

def generate_base(df: pd.DataFrame, month_index: int)->pd.DataFrame:
    # 选中固定时间段的数据集
    total_balance = df.copy()
    total_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]
    total_balance = total_balance[(total_balance['date'] >= datetime.datetime(2014,3,1)) & (total_balance['date'] < datetime.datetime(2014, month_index, 1))]

    # 加入时间戳
    total_balance['weekday'] = total_balance['date'].dt.weekday
    total_balance['day'] = total_balance['date'].dt.day
    total_balance['week'] = total_balance['date'].dt.week
    total_balance['month'] = total_balance['date'].dt.month
    
    # 统计翌日因子
    mean_of_each_weekday = total_balance[['weekday']+['total_purchase_amt','total_redeem_amt']].groupby('weekday',as_index=False).mean()
    for name in ['total_purchase_amt','total_redeem_amt']:
        mean_of_each_weekday = mean_of_each_weekday.rename(columns={name: name+'_weekdaymean'})
    mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])
    mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])

    # 合并统计结果到原数据集
    total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')

    # 分别统计翌日在(1~31)号出现的频次
    weekday_count = total_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()
    weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')

    # 依据频次对翌日因子进行加权,获得日期因子
    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date']   / len(np.unique(total_balance['month']))
    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date']  / len(np.unique(total_balance['month']))
    day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()

    # 将训练集中所有日期的均值剔除日期残差得到base
    day_mean = total_balance[['day'] + ['total_purchase_amt','total_redeem_amt']].groupby('day',as_index=False).mean()
    day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
    day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']

    # 生成测试集数据
    for index, row in day_pre.iterrows():
        if month_index in (2,4,6,9) and row['day'] == 31:
            break
        day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))

    # 基于base与翌日因子获得最后的预测结果
    day_pre['weekday'] = day_pre.date.dt.weekday
    day_pre = day_pre[['date','weekday']+['total_purchase_amt','total_redeem_amt']]
    day_pre = pd.merge(day_pre, mean_of_each_weekday,on='weekday')
    day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']

    day_pre = day_pre.sort_values('date')[['date']+['total_purchase_amt','total_redeem_amt']]
    return day_pre

3. 结果预测并保存

In [110]:
# 生成预测结果(以及残差)

base_list = []
for i in range(4, 10):
    base_list.append(generate_base(data, i).reset_index(drop=True))

base = pd.concat(base_list).reset_index(drop=True)
for i in ['total_purchase_amt','total_redeem_amt']:
    base = base.rename(columns={i: i+'_base'})

data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)

data['purchase_residual'] = data['total_purchase_amt'] / data['total_purchase_amt_base']

data['redeem_residual'] = data['total_redeem_amt'] / data['total_redeem_amt_base']
In [111]:
# 对结果表重命名

data = data[['date','purchase_residual','redeem_residual','total_purchase_amt_base', 'total_redeem_amt_base']]
for i in data.columns:
    if i == 'date':
        data[i] = data[i].astype(str)
        data[i] = data[i].str.replace('-','')
data.columns = [['date'] + ['total_purchase_amt','total_redeem_amt'] + ['total_purchase_predicted_by_cycle','total_redeem_predicted_by_cycle'] ]
In [112]:
# 保存预测结果到本地

data.to_csv('Dataset/base.csv',index=False)

1. 数据准备

In [1]:
library(plyr)
library(dplyr)
library(forecast)
library(data.table)
library(tseries)
options(warn=-1)
Attaching package: 'dplyr'


The following objects are masked from 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Registered S3 methods overwritten by 'forecast':
  method             from    
  fitted.fracdiff    fracdiff
  residuals.fracdiff fracdiff


Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last


In [2]:
user_balance=read.csv("Dataset/user_balance_table.csv")
user_balance$report_date=as.Date(as.character(user_balance$report_date),format="%Y%m%d")

temp=order(user_balance$report_date)
user_balance=user_balance[temp,]

data=ddply(user_balance,.(report_date),function(D){colwise(sum)(D[,c(-1,-2)])})

train_data = data[274:396,]
test_data = data[397:427,]
In [3]:
head(train_data)
A data.frame: 6 × 17
report_datetBalanceyBalancetotal_purchase_amtdirect_purchase_amtpurchase_bal_amtpurchase_bank_amttotal_redeem_amtconsume_amttransfer_amttftobal_amttftocard_amtshare_amtcategory1category2category3category4
<date><dbl><dbl><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
2742014-03-31204170094162044197714539888490539602644815407355124195289742385263448345100375507534565450103189625242858457NANANANA
2752014-04-01205929006432041700941645332058545047812214653959130393853127742935849540539227888819371166131907722062842463NANANANA
2762014-04-02206756356952059290064335534711835244961614102415821142545827261206651040952221571114283011471932699672897502NANANANA
2772014-04-03207729073582067563569536387712036095803416586675819509127626660545745762040220843417469155411739278762919086NANANANA
2782014-04-042082461061520772907358251895894248968008 6283848718612952120019263737871189162321448351544711271669772927886NANANANA
2792014-04-052086374747520824610615202336542199391149 687633481306278011631996824388972611930995623061774 962481822945393NANANANA
In [4]:
head(test_data)
A data.frame: 6 × 17
report_datetBalanceyBalancetotal_purchase_amtdirect_purchase_amtpurchase_bal_amtpurchase_bank_amttotal_redeem_amtconsume_amttransfer_amttftobal_amttftocard_amtshare_amtcategory1category2category3category4
<date><dbl><dbl><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
3972014-08-01200697600151994741613837488473537267895813571532423696363425254085884227601168313257310133391372999182205777NANANANA
3982014-08-022008660192020069760015189092130186889349 784787041084106451722502254193097713031924834359098 959601502202781NANANANA
3992014-08-032013330210020086601920173825397171634865 6212057510951429012712521740828571 8629664623794226 625024202190532NANANANA
4002014-08-04201410354602013330210033064088432840447212090464620749982632290752478477516244430008439694812004605272236412NANANANA
4012014-08-05203141097912014103546039478087039258971516470555622788415922170653974040082147666457359747911116916662191155NANANANA
4022014-08-06203205842132031410979128882101628659829812764162415895667428234659456127546226219048268154161994036322222718NANANANA

2. 评估方式

In [5]:
get_score<- function(purchase_pred,redeem_pred,purchase_true,redeem_true,h=0.3){
  out = data.frame(purchase_pred ,redeem_pred,purchase_true,redeem_true )
  out$purchase_mape = abs(out$purchase_pred-out$purchase_true)/
    out$purchase_true
  out$redeem_mape = abs(out$redeem_pred-out$redeem_true)/
    out$redeem_true
  
  score = sum(exp(-out$purchase_mape/h)*10)* 0.45 + 
    sum(exp(-out$redeem_mape/h)*10)* 0.55 
  return(score)
}

3. 平稳性分析

In [6]:
# purchase
purchase_ts=ts(train_data$total_purchase_amt,
               frequency=7,start=c(1,1))       ## 生成时间序列
purchase_stl_res = stl(purchase_ts,s.window="periodic",robust=TRUE)  ## stl分解
purchase_pred_test = forecast(purchase_stl_res,h = 31)$mean  ## 得到测试集预测结果



# redeem
redeem_ts = ts(train_data$total_redeem_amt,
               frequency=7,start=c(1,1))
redeem_stl_res = stl(redeem_ts,s.window="periodic",robust=TRUE)
redeem_pred_test = forecast(redeem_stl_res,h = 31)$mean


get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)
172.722232296235
In [7]:
purchase_del_ts=ts(purchase_ts-purchase_stl_res$time.series[,1],
                   frequency=7,start=c(1,1))
In [8]:
plot(purchase_del_ts)
In [9]:
acf(purchase_del_ts)
In [10]:
adf.test(purchase_del_ts)
	Augmented Dickey-Fuller Test

data:  purchase_del_ts
Dickey-Fuller = -4.3948, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
In [11]:
for(i in 1:2){print(Box.test(purchase_del_ts,type = "Box-Pierce",
                             lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(purchase_del_ts,type = "Ljung-Box",
                             lag = i*6)$p.value)}
[1] 0.007097898
[1] 0.04290397
[1] 0.005825161
[1] 0.03343909
In [12]:
pacf(purchase_del_ts)
In [13]:
acf(purchase_del_ts)
In [15]:
# redeem
redeem_del_ts=ts(redeem_ts-redeem_stl_res$time.series[,1],
                   frequency=7,start=c(1,1))

plot(redeem_del_ts)
In [16]:
acf(redeem_del_ts)
In [17]:
adf.test(redeem_del_ts)
	Augmented Dickey-Fuller Test

data:  redeem_del_ts
Dickey-Fuller = -3.0508, Lag order = 4, p-value = 0.14
alternative hypothesis: stationary
In [18]:
plot(diff(redeem_del_ts)) # 差分后的时序图
In [19]:
acf(diff(redeem_del_ts))
In [20]:
adf.test(diff(redeem_del_ts))
	Augmented Dickey-Fuller Test

data:  diff(redeem_del_ts)
Dickey-Fuller = -7.1293, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
In [21]:
for(i in 1:2){print(Box.test(diff(redeem_del_ts),type = "Box-Pierce",
                             lag = i*6)$p.value)}

for(i in 1:2){print(Box.test(diff(redeem_del_ts),type = "Ljung-Box",
                             lag = i*6)$p.value)}
[1] 0.004738446
[1] 0.0605687
[1] 0.003849948
[1] 0.04976641
In [22]:
pacf(diff(redeem_del_ts))#拖尾
In [23]:
acf(diff(redeem_del_ts)) #截尾

4. ARIMA建模

In [24]:
# purchase
purchase_del_fit = arima(purchase_del_ts/1000,order = c(1,0,1))
In [25]:
purchase_del_fit
Call:
arima(x = purchase_del_ts/1000, order = c(1, 0, 1))

Coefficients:
         ar1      ma1   intercept
      0.4187  -0.0593  265797.596
s.e.  0.2409   0.2584    7658.852

sigma^2 estimated as 2.784e+09:  log likelihood = -1512.04,  aic = 3032.09
In [26]:
# 显著性检验
for(i in 1:2){print(Box.test(purchase_del_fit$residuals,type = "Ljung-Box",
                             lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(purchase_del_fit$residuals,type = 'Box-Pierce',
                             lag = i*6)$p.value)}
[1] 0.9691384
[1] 0.9876937
[1] 0.9734135
[1] 0.9914217
In [27]:
purchase_pred_test = forecast(purchase_stl_res$time.series[,1],31)$mean+
  predict(purchase_del_fit,n.ahead = 31)$pred*1000
In [28]:
# redeem
redeem_del_fit = arima(redeem_del_ts,order = c(0,1,1))
In [29]:
# 显著性检验
for(i in 1:2){print(Box.test(redeem_del_fit$residuals,
                             type = "Ljung-Box",lag = i*6)$p.value)}


for(i in 1:2){print(Box.test(redeem_del_fit$residuals,
                             type = 'Box-Pierce',lag = i*6)$p.value)}
[1] 0.7397571
[1] 0.9739962
[1] 0.7547025
[1] 0.9783781
In [30]:
redeem_pred_test = predict(redeem_del_fit,n.ahead = 31)$pred+
  forecast(redeem_stl_res$time.series[,1],h=31)$mean
In [31]:
get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)
191.230702400082
In [32]:
#####去除周期后 auto.arima
#####purchase======
autoFit = auto.arima(purchase_del_ts/1000,d=0,seasonal = F,trace = T )
 ARIMA(2,0,2)           with non-zero mean : 3036.781
 ARIMA(0,0,0)           with non-zero mean : 3044.866
 ARIMA(1,0,0)           with non-zero mean : 3030.344
 ARIMA(0,0,1)           with non-zero mean : 3032.692
 ARIMA(0,0,0)           with zero mean     : 3429.418
 ARIMA(2,0,0)           with non-zero mean : 3032.426
 ARIMA(1,0,1)           with non-zero mean : 3032.428
 ARIMA(2,0,1)           with non-zero mean : 3034.588
 ARIMA(1,0,0)           with zero mean     : 3074.01

 Best model: ARIMA(1,0,0)           with non-zero mean 

In [33]:
for(i in 1:2){print(Box.test(autoFit$residuals,type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(autoFit$residuals,type = 'Box-Pierce',lag = i*6)$p.value)}
[1] 0.9697502
[1] 0.9881997
[1] 0.9739763
[1] 0.9917919
In [34]:
purchase_pred_test = forecast(purchase_stl_res$time.series[,1],31)$mean+
  forecast(autoFit,h=31)$mean*1000
In [35]:
autoFit = auto.arima(redeem_del_ts,d=1,seasonal = F,trace = TRUE)
 ARIMA(2,1,2)           with drift         : 4746.025
 ARIMA(0,1,0)           with drift         : 4781.675
 ARIMA(1,1,0)           with drift         : 4766.45
 ARIMA(0,1,1)           with drift         : 4742.536
 ARIMA(0,1,0)                              : 4779.628
 ARIMA(1,1,1)           with drift         : 4741.796
 ARIMA(2,1,1)           with drift         : 4743.756
 ARIMA(1,1,2)           with drift         : 4743.779
 ARIMA(0,1,2)           with drift         : 4741.605
 ARIMA(0,1,3)           with drift         : 4743.777
 ARIMA(1,1,3)           with drift         : 4745.991
 ARIMA(0,1,2)                              : 4739.482
 ARIMA(0,1,1)                              : 4740.435
 ARIMA(1,1,2)                              : 4741.617
 ARIMA(0,1,3)                              : 4741.616
 ARIMA(1,1,1)                              : 4739.681
 ARIMA(1,1,3)                              : 4743.793

 Best model: ARIMA(0,1,2)                              

In [36]:
###模型显著性检验====
for(i in 1:2){print(Box.test(autoFit$residuals,type = "Ljung-Box",lag = i*6)$p.value)}
for(i in 1:2){print(Box.test(autoFit$residuals,type = 'Box-Pierce',lag = i*6)$p.value)}
[1] 0.9896333
[1] 0.9995391
[1] 0.9910407
[1] 0.9996895
In [37]:
redeem_pred_test = forecast(autoFit,h=31)$mean+
  forecast(redeem_stl_res$time.series[,1],h=31)$mean
In [38]:
get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)
190.495719132032

5. prophet模型预测

In [39]:
# prophet预测

holidays_and_effect_days <- data_frame(
  holiday = 'holidays_and_effect_days',
  ds = as.Date(c('2014-04-03', '2014-04-04', 
                 '2014-04-05', '2014-04-06', '2014-04-07', 
                 '2014-04-08', '2014-04-09',
                 
                 '2014-04-29', '2014-04-30', 
                 '2014-05-01', '2014-05-02', '2014-05-03',
                 '2014-05-04', '2014-05-05',
                 
                 '2014-05-29', '2014-05-30',
                 '2014-05-31', '2014-06-01', '2014-06-02', 
                 '2014-06-03', '2014-06-04')),
  lower_window = 0,
  upper_window = 0
)

holidays <- data_frame(
  holiday = 'true_holidays',
  ds = as.Date(c('2014-04-05', '2014-04-06', '2014-04-07',
                 '2014-05-01', '2014-05-02', '2014-05-03',
                 '2014-05-31', '2014-06-01', '2014-06-02')),
  lower_window = 0,
  upper_window = 0
)
holidays <- bind_rows(holidays_and_effect_days, holidays)



###purchase=========

Myprophet_purchase = prophet::prophet(changepoint_prior_scale=0.5,weekly_seasonality=T,
                                      holidays= holidays)
df_purchase = data.frame(train_data$report_date ,train_data$total_purchase_amt )
colnames(df_purchase) <-c('ds','y')
Myprophet_purchase = prophet::fit.prophet(Myprophet_purchase,df_purchase)

future_purchase = prophet::make_future_dataframe(Myprophet_purchase,periods=31,
                                                 include_history=F)
forecast_purchase = predict(Myprophet_purchase, future_purchase)
purchase_pred_test = forecast_purchase$yhat



###redeem=====
Myprophet_redeem = prophet::prophet(changepoint_prior_scale=0.5,weekly_seasonality=True,
                                    holidays= holidays)
df_redeem = data.frame(train_data$report_date ,train_data$total_redeem_amt )
colnames(df_redeem) <-c('ds','y')
df_redeem
Myprophet_redeem = prophet::fit.prophet(Myprophet_redeem,df_redeem)

future_redeem = prophet::make_future_dataframe(Myprophet_redeem,periods=31,
                                               include_history=F)
forecast_redeem = predict(Myprophet_redeem, future_redeem)
redeem_pred_test = forecast_redeem$yhat

get_score(purchase_pred_test,redeem_pred_test,test_data$total_purchase_amt,
          test_data$total_redeem_amt)
Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.

Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

A data.frame: 123 × 2
dsy
<date><int>
2014-03-31423852634
2014-04-01277429358
2014-04-02272612066
2014-04-03266605457
2014-04-04200192637
2014-04-05163199682
2014-04-06139576683
2014-04-07176966561
2014-04-08250015131
2014-04-09289330278
2014-04-10286914864
2014-04-11277077434
2014-04-12123295320
2014-04-13178934722
2014-04-14415986984
2014-04-15285293076
2014-04-16255914640
2014-04-17265341592
2014-04-18225952909
2014-04-19146374940
2014-04-20161057781
2014-04-21295635256
2014-04-22268810141
2014-04-23278470936
2014-04-24224536754
2014-04-25227764292
2014-04-26158122962
2014-04-27191915377
2014-04-28327724735
2014-04-29307578349
......
2014-07-02328950951
2014-07-03289009780
2014-07-04264494550
2014-07-05272535138
2014-07-06195530758
2014-07-07317612569
2014-07-08340453063
2014-07-09269642881
2014-07-10326009240
2014-07-11240050748
2014-07-12149081488
2014-07-13199459990
2014-07-14284753279
2014-07-15261722182
2014-07-16234775948
2014-07-17298279385
2014-07-18208671287
2014-07-19155464283
2014-07-20174462836
2014-07-21434191479
2014-07-22369043423
2014-07-23308353077
2014-07-24347622431
2014-07-25262874791
2014-07-26282653341
2014-07-27166610652
2014-07-28345986909
2014-07-29303480103
2014-07-30250117716
2014-07-31277194379
Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.

Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

161.851032682629