Fund Inflow and Outflow Prediction - Challenge Baseline (Part 2)

Feature Extraction

Data Preparation

In [1]:
import pandas as pd
import numpy as np

import datetime
import shap
import eli5
import seaborn as sns
import matplotlib.pyplot as plt

from mvtpy import mvtest
from wordcloud import WordCloud
from scipy import stats
from eli5.sklearn import PermutationImportance
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

from typing import *
import warnings 
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei'] # display CJK (Chinese) labels correctly
plt.rcParams['axes.unicode_minus']=False # display the minus sign correctly
The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
The sklearn.feature_selection.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.
Using TensorFlow backend.
In [2]:
labels = ['total_purchase_amt','total_redeem_amt']
date_indexs = ['week','year','month','weekday','day']
In [3]:
# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
    data_balance = pd.read_csv(path)
    return data_balance.reset_index(drop=True)
    

# add timestamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
    data_balance = data.copy()
    data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
    data_balance['day'] = data_balance['date'].dt.day
    data_balance['month'] = data_balance['date'].dt.month
    data_balance['year'] = data_balance['date'].dt.year
    data_balance['week'] = data_balance['date'].dt.week
    data_balance['weekday'] = data_balance['date'].dt.weekday
    return data_balance.reset_index(drop=True)

# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
    df_tmp = data.copy()
    df_tmp = df_tmp.groupby(['date'])[['total_purchase_amt','total_redeem_amt']].sum()
    df_tmp.reset_index(inplace=True)
    return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)

# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
    total_balance = data.copy()
    start = datetime.datetime(2014,9,1)
    testdata = []
    while start != datetime.datetime(2014,10,15):
        temp = [start, np.nan, np.nan]
        testdata.append(temp)
        start += datetime.timedelta(days = 1)
    testdata = pd.DataFrame(testdata)
    testdata.columns = total_balance.columns

    total_balance = pd.concat([total_balance, testdata], axis = 0)
    total_balance = total_balance.reset_index(drop=True)
    return total_balance.reset_index(drop=True)

# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
    return pd.read_csv(path)
In [4]:
# Load the datasets

balance_data = load_data('Dataset/user_balance_table.csv')
balance_data = add_timestamp(balance_data, time_index='report_date')
total_balance = get_total_balance(balance_data)
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')
user_information = load_user_information('Dataset/user_profile_table.csv')

1. Static date-based features

1.1 Extracting the 'is' features

In [5]:
# Build the set of holiday dates

def get_holiday_set()->Set[datetime.date]:
    holiday_set = set()
    # Qingming Festival (Tomb-Sweeping Day)
    holiday_set = holiday_set | {datetime.date(2014,4,5), datetime.date(2014,4,6), datetime.date(2014,4,7)}
    # Labor Day
    holiday_set = holiday_set | {datetime.date(2014,5,1), datetime.date(2014,5,2), datetime.date(2014,5,3)}
    # Dragon Boat Festival
    holiday_set = holiday_set | {datetime.date(2014,5,31), datetime.date(2014,6,1), datetime.date(2014,6,2)}
    # Mid-Autumn Festival
    holiday_set = holiday_set | {datetime.date(2014,9,6), datetime.date(2014,9,7), datetime.date(2014,9,8)}
    # National Day
    holiday_set = holiday_set | {datetime.date(2014,10,1), datetime.date(2014,10,2), datetime.date(2014,10,3),\
                                 datetime.date(2014,10,4), datetime.date(2014,10,5), datetime.date(2014,10,6),\
                                datetime.date(2014,10,7)}
    # Mid-Autumn Festival (2013)
    holiday_set = holiday_set | {datetime.date(2013,9,19), datetime.date(2013,9,20), datetime.date(2013,9,21)}
    # National Day (2013)
    holiday_set = holiday_set | {datetime.date(2013,10,1), datetime.date(2013,10,2), datetime.date(2013,10,3),\
                                 datetime.date(2013,10,4), datetime.date(2013,10,5), datetime.date(2013,10,6),\
                                datetime.date(2013,10,7)}
    return holiday_set
In [6]:
# Extract all of the 'is' features

def extract_is_feature(data: pd.DataFrame)->pd.DataFrame:
    total_balance = data.copy().reset_index(drop=True)
    
    # Is it a weekend
    total_balance['is_weekend'] = 0
    total_balance.loc[total_balance['weekday'].isin((5,6)), 'is_weekend'] = 1
    # Is it a holiday
    total_balance['is_holiday'] = 0
    total_balance.loc[total_balance['date'].isin(get_holiday_set()), 'is_holiday'] = 1
    
    # Is it the first day of a holiday
    last_day_flag = 0
    total_balance['is_firstday_of_holiday'] = 0
    for index, row in total_balance.iterrows():
        if last_day_flag == 0 and row['is_holiday'] == 1:
            total_balance.loc[index, 'is_firstday_of_holiday'] = 1
        last_day_flag = row['is_holiday']

    # Is it the last day of a holiday
    total_balance['is_lastday_of_holiday'] = 0
    for index, row in total_balance.iterrows():
        if row['is_holiday'] == 1 and total_balance.loc[index+1, 'is_holiday'] == 0:
             total_balance.loc[index, 'is_lastday_of_holiday'] = 1

    # Is it the first working day after a holiday
    total_balance['is_firstday_of_work'] = 0
    last_day_flag = 0
    for index, row in total_balance.iterrows():
        if last_day_flag == 1 and row['is_holiday'] == 0:
            total_balance.loc[index, 'is_firstday_of_work'] = 1
        last_day_flag = row['is_lastday_of_holiday']

    # Is it a working day (is_work = 0 on weekends and holidays)
    total_balance['is_work'] = 1
    total_balance.loc[(total_balance['is_holiday'] == 1) | (total_balance['is_weekend'] == 1), 'is_work'] = 0
    special_work_day_set = {datetime.date(2014,5,4), datetime.date(2014,9,28)}
    total_balance.loc[total_balance['date'].isin(special_work_day_set), 'is_work'] = 1

    # Does work resume tomorrow
    total_balance['is_gonna_work_tomorrow'] = 0
    for index, row in total_balance.iterrows():
        if index == len(total_balance)-1:
            break
        if row['is_work'] == 0 and total_balance.loc[index+1, 'is_work'] == 1:
             total_balance.loc[index, 'is_gonna_work_tomorrow'] = 1

    # Was yesterday a working day
    total_balance['is_worked_yestday'] = 0
    for index, row in total_balance.iterrows():
        if index <= 1:
            continue
        if total_balance.loc[index-1, 'is_work'] == 1:
             total_balance.loc[index, 'is_worked_yestday'] = 1

    # Is it the last working day before a holiday
    total_balance['is_lastday_of_workday'] = 0
    for index, row in total_balance.iterrows():
        if index == len(total_balance)-1:
            break
        if row['is_holiday'] == 0 and total_balance.loc[index+1, 'is_holiday'] == 1:
             total_balance.loc[index, 'is_lastday_of_workday'] = 1

    # Is it a Sunday that is a working day
    total_balance['is_work_on_sunday'] = 0
    for index, row in total_balance.iterrows():
        if index == len(total_balance)-1:
            break
        if row['weekday'] == 6 and row['is_work'] == 1:
             total_balance.loc[index, 'is_work_on_sunday'] = 1
                
    # Is it the first day of the month
    total_balance['is_firstday_of_month'] = 0
    total_balance.loc[total_balance['day'] == 1, 'is_firstday_of_month'] = 1

    # Is it the second day of the month
    total_balance['is_secday_of_month'] = 0
    total_balance.loc[total_balance['day'] == 2, 'is_secday_of_month'] = 1

    # Is it early in the month (day <= 10)
    total_balance['is_premonth'] = 0
    total_balance.loc[total_balance['day'] <= 10, 'is_premonth'] = 1

    # Is it mid-month (10 < day <= 20)
    total_balance['is_midmonth'] = 0
    total_balance.loc[(10 < total_balance['day']) & (total_balance['day'] <= 20), 'is_midmonth'] = 1

    # Is it late in the month (day > 20)
    total_balance['is_tailmonth'] = 0
    total_balance.loc[20 < total_balance['day'], 'is_tailmonth'] = 1

    # Is it the first week (week number mod 4 == 1)
    total_balance['is_first_week'] = 0
    total_balance.loc[total_balance['week'] % 4 == 1, 'is_first_week'] = 1

    # Is it the second week (week number mod 4 == 2)
    total_balance['is_second_week'] = 0
    total_balance.loc[total_balance['week'] % 4 == 2, 'is_second_week'] = 1

    # Is it the third week (week number mod 4 == 3)
    total_balance['is_third_week'] = 0
    total_balance.loc[total_balance['week'] % 4 == 3, 'is_third_week'] = 1

    # Is it the fourth week (week number mod 4 == 0)
    total_balance['is_fourth_week'] = 0
    total_balance.loc[total_balance['week'] % 4 == 0, 'is_fourth_week'] = 1
    
    return total_balance.reset_index(drop=True)
In [7]:
# Apply the 'is' feature extraction to the dataset

total_balance = extract_is_feature(total_balance)
In [8]:
# One-hot encode the weekday feature

def encode_data(data: pd.DataFrame, feature_name: str = 'weekday', encoder=OneHotEncoder())->pd.DataFrame:
    total_balance = data.copy()
    week_feature = encoder.fit_transform(np.array(total_balance[feature_name]).reshape(-1, 1)).toarray()
    week_feature = pd.DataFrame(week_feature,columns= [feature_name + '_onehot_'+ str(x) for x in range(len(week_feature[0]))])
    #featureWeekday = pd.concat([total_balance, week_feature], axis = 1).drop(feature_name, axis=1)
    featureWeekday = pd.concat([total_balance, week_feature], axis = 1)
    return featureWeekday
In [9]:
# Apply the one-hot weekday encoding to the dataset

total_balance = encode_data(total_balance)
In [10]:
# Build the 'is' feature set

feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]
In [11]:
feature.head()
Out[11]:
date total_purchase_amt total_redeem_amt is_weekend is_holiday is_firstday_of_holiday is_lastday_of_holiday is_firstday_of_work is_work is_gonna_work_tomorrow ... is_second_week is_third_week is_fourth_week weekday_onehot_0 weekday_onehot_1 weekday_onehot_2 weekday_onehot_3 weekday_onehot_4 weekday_onehot_5 weekday_onehot_6
0 2014-03-31 398884905.0 423852634.0 0 0 0 0 0 1 0 ... 1 0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2014-04-01 453320585.0 277429358.0 0 0 0 0 0 1 0 ... 1 0 0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
2 2014-04-02 355347118.0 272612066.0 0 0 0 0 0 1 0 ... 1 0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
3 2014-04-03 363877120.0 266605457.0 0 0 0 0 0 1 0 ... 1 0 0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
4 2014-04-04 251895894.0 200192637.0 0 0 0 0 0 1 0 ... 1 0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

5 rows × 29 columns

1.2 Label distribution under the 'is' features

In [12]:
# Draw boxen plots of the label under each 'is' feature

def draw_boxplot(data: pd.DataFrame)->None:
    f, axes = plt.subplots(7, 4, figsize=(18, 24))
    global date_indexs, labels
    count = 0
    for i in [x for x in data.columns if x not in date_indexs + labels + ['date']]:
        sns.boxenplot(x=i, y='total_purchase_amt', data=data, ax=axes[count // 4][count % 4])
        count += 1
In [13]:
draw_boxplot(feature)
In [14]:
## Drop features that look weak

purchase_feature_seems_useless = [
      # too few samples to be useful for modeling; if this rule proves reliable it can instead be applied as a post-hoc correction
      'is_work_on_sunday',
      # the median difference is not obvious
      'is_first_week'
]

1.3 Correlation analysis of the 'is' features

In [15]:
# Draw a correlation heatmap

def draw_correlation_heatmap(data: pd.DataFrame, way:str = 'pearson')->None:
    feature = data.copy()
    plt.figure(figsize=(20,10))
    plt.title('The ' + way + ' correlation between total purchase and each feature')
    sns.heatmap(feature[[x for x in feature.columns if x not in ['total_redeem_amt', 'date'] ]].corr(way),linecolor='white',
        linewidths=0.1,
        cmap="RdBu")
In [16]:
draw_correlation_heatmap(feature, 'spearman')
In [17]:
# Drop features with low correlation to the target

temp = np.abs(feature[[x for x in feature.columns 
                       if x not in ['total_redeem_amt', 'date'] ]].corr('spearman')['total_purchase_amt'])
feature_low_correlation = list(set(temp[temp < 0.1].index))
In [18]:
feature_low_correlation
Out[18]:
['is_fourth_week',
 'is_secday_of_month',
 'is_firstday_of_work',
 'is_firstday_of_month',
 'is_lastday_of_workday',
 'is_work_on_sunday',
 'is_midmonth',
 'is_first_week']

2. Distance-based features

2.1 Extracting distance features

In [19]:
# Extract distance features

def extract_distance_feature(data: pd.DataFrame)->pd.DataFrame:
    total_balance = data.copy()
    
    # How many days until the next non-working day
    total_balance['dis_to_nowork'] = 0
    for index, row in total_balance.iterrows():
        if row['is_work'] == 0:
            step = 1
            flag = 1
            while flag:
                if index - step >= 0 and total_balance.loc[index - step, 'is_work'] == 1:
                    total_balance.loc[index - step, 'dis_to_nowork'] = step
                    step += 1
                else:
                    flag = 0

    total_balance['dis_from_nowork'] = 0
    step = 0
    for index, row in total_balance.iterrows():
        step += 1
        if row['is_work'] == 1:
            total_balance.loc[index, 'dis_from_nowork'] = step
        else:
            step = 0

    # How many days until the next working day
    total_balance['dis_to_work'] = 0
    for index, row in total_balance.iterrows():
        if row['is_work'] == 1:
            step = 1
            flag = 1
            while flag:
                if index - step >= 0 and total_balance.loc[index - step, 'is_work'] == 0:
                    total_balance.loc[index - step, 'dis_to_work'] = step
                    step += 1
                else:
                    flag = 0

    total_balance['dis_from_work'] = 0
    step = 0
    for index, row in total_balance.iterrows():
        step += 1
        if row['is_work'] == 0:
            total_balance.loc[index, 'dis_from_work'] = step
        else:
            step = 0


    # How many days until the next holiday
    total_balance['dis_to_holiday'] = 0
    for index, row in total_balance.iterrows():
        if row['is_holiday'] == 1:
            step = 1
            flag = 1
            while flag:
                if index - step >= 0 and total_balance.loc[index - step, 'is_holiday'] == 0:
                    total_balance.loc[index - step, 'dis_to_holiday'] = step
                    step += 1
                else:
                    flag = 0

    total_balance['dis_from_holiday'] = 0
    step = 0
    for index, row in total_balance.iterrows():
        step += 1
        if row['is_holiday'] == 0:
            total_balance.loc[index, 'dis_from_holiday'] = step
        else:
            step = 0

    # How many days until the last day of the holiday
    total_balance['dis_to_holiendday'] = 0
    for index, row in total_balance.iterrows():
        if row['is_lastday_of_holiday'] == 1:
            step = 1
            flag = 1
            while flag:
                if index - step >= 0 and total_balance.loc[index - step, 'is_lastday_of_holiday'] == 0:
                    total_balance.loc[index - step, 'dis_to_holiendday'] = step
                    step += 1
                else:
                    flag = 0

    total_balance['dis_from_holiendday'] = 0
    step = 0
    for index, row in total_balance.iterrows():
        step += 1
        if row['is_lastday_of_holiday'] == 0:
            total_balance.loc[index, 'dis_from_holiendday'] = step
        else:
            step = 0

    # Distance from the start of the month
    total_balance['dis_from_startofmonth'] = np.abs(total_balance['day'])

    # Distance from the middle of the month
    total_balance['dis_from_middleofmonth'] = np.abs(total_balance['day'] - 15)

    # Distance from the middle of the week
    total_balance['dis_from_middleofweek'] = np.abs(total_balance['weekday'] - 3)

    # Distance from Sunday (end of the week)
    total_balance['dis_from_endofweek'] = np.abs(total_balance['weekday'] - 6)

    return total_balance
In [20]:
# Append the distance features to the dataset

total_balance = extract_distance_feature(total_balance)
In [21]:
total_balance.shape
Out[21]:
(198, 46)

2.2 Analyzing the distance features

In [22]:
# Collect the distance feature column names

feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]
dis_feature_indexs = [x for x in feature.columns if (x not in date_indexs + labels + ['date']) & ('dis' in x)]
In [23]:
# Draw point plots

def draw_point_feature(data: pd.DataFrame)->None:
    feature = data.copy()
    f, axes = plt.subplots(data.shape[1] // 3, 3, figsize=(30, data.shape[1] // 3 * 4))
    count = 0
    for i in [x for x in feature.columns if (x not in date_indexs + labels + ['date'])]:
        sns.pointplot(x=i, y="total_purchase_amt",
                markers=["^", "o"], linestyles=["-", "--"],
                kind="point", data=feature, ax=axes[count // 3][count % 3] if data.shape[1] > 3 else axes[count])
        count += 1
In [24]:
draw_point_feature(feature[['total_purchase_amt'] + dis_feature_indexs])
In [25]:
# Cap distances that are too large

def dis_change(x):
    if x > 5:
        x = 10
    return x
In [26]:
# Apply the cap to the holiday and month distance features

dis_holiday_feature = [x for x in total_balance.columns if 'dis' in x and 'holi' in x]
dis_month_feature = [x for x in total_balance.columns if 'dis' in x and 'month' in x]
total_balance[dis_holiday_feature] = total_balance[dis_holiday_feature].applymap(dis_change)
total_balance[dis_month_feature] = total_balance[dis_month_feature].applymap(dis_change)
In [27]:
feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]
In [28]:
# Draw the point plots after capping

draw_point_feature(feature[['total_purchase_amt'] + dis_feature_indexs])
In [29]:
## Drop features that look less useful
purchase_feature_seems_useless += [
                                  # even after capping, the variance is too large to be reliable and the pattern is unclear
                                  'dis_to_holiday',
                                  # variance too large to be reliable
                                  'dis_from_startofmonth',
                                  # variance too large to be reliable
                                  'dis_from_middleofmonth'
]
In [30]:
# Draw the correlation heatmap

draw_correlation_heatmap(feature[['total_purchase_amt'] + dis_feature_indexs])
In [31]:
# Drop features with poor correlation to the target

temp = np.abs(feature[[x for x in feature.columns 
                       if ('dis' in x) | (x in ['total_purchase_amt']) ]].corr()['total_purchase_amt'])
feature_low_correlation += list(set(temp[temp < 0.1].index) )
In [32]:
feature_low_correlation
Out[32]:
['is_fourth_week',
 'is_secday_of_month',
 'is_firstday_of_work',
 'is_firstday_of_month',
 'is_lastday_of_workday',
 'is_work_on_sunday',
 'is_midmonth',
 'is_first_week',
 'dis_from_holiday',
 'dis_from_startofmonth',
 'dis_from_middleofmonth']

3. Peak and trough features

3.1 Extracting peak features

In [33]:
# Inspect the weekly shape of the purchase peaks, month by month (April through August)

fig = plt.figure(figsize=(15,12))
for i in range(4, 9):
    plt.subplot(5,1,i - 3)
    total_balance_2 = total_balance[(total_balance['date'] >= datetime.datetime(2014,i,1)) & (total_balance['date'] < datetime.datetime(2014,i+1,1))]
    sns.pointplot(x=total_balance_2['day'],y=total_balance_2['total_purchase_amt'])
    plt.title('Month: ' + str(i))
In [34]:
# Purchase: weekly peaks (left column) vs. troughs (right column)

# 0401 (Tue)                                      0406 (Sun, 2nd day of Qingming holiday)
# 0410 (Thu, close to Tuesday's level)            0412 (Sat, close to Sunday's level)
# 0415 (Tue)                                      0420 (Sun)
# 0424 (Thu, roughly at Tuesday's level)          0427 (Sun)
# 0429 (Tue)                                      0502 (Fri, 2nd day of Labor Day holiday)
# 0507 (Wed, far from Tuesday, likely Labor Day effect)     0511 (Sun)
# 0512 (Mon, some gap from Tuesday)               0518 (Sun)
# 0519 (Tue)                                      0525 (Sun)
# 0526 (Mon, some gap from Tuesday)               0531 (Sat, month end)
# 0605 (Thu, far from Tuesday, likely Dragon Boat effect)   0607 (Sat, likely Dragon Boat effect)
# 0609 (Mon, close to Tuesday)                    0615 (Sun)
# 0616 (Mon, far from Tuesday)                    0622 (Sun)
# 0626 (Thu, not far from Tuesday)                0629 (Sun)
# 0701 (Tue)                                      0705 (Sat, not far from Sunday)
# 0707 (Mon, some gap from Tuesday)               0713 (Sun)
# 0716 (Wed, some gap from Tuesday)               0720 (Sun)
# 0721 (Mon, clear gap from Tuesday)              0726 (Sat, close to Sunday)
# 0728 (Mon, clear gap from Tuesday)              0803 (Sun)
# 0805 (Tue)                                      0809 (Sat, large gap from Sunday)
# 0811 (Mon, large gap from Tuesday)              0817 (Sun)
# 0818 (Mon, not far from Tuesday)                0824 (Sun)
In [35]:
# Define the purchase peak/trough weekday features

def extract_peak_feature(data: pd.DataFrame)->pd.DataFrame:
    total_balance = data.copy()
    # Days from the purchase peak (Tuesday)
    total_balance['dis_from_purchase_peak'] = np.abs(total_balance['weekday'] - 1)

    # Days from the purchase trough (Sunday); identical to dis_from_endofweek
    total_balance['dis_from_purchase_valley'] = np.abs(total_balance['weekday'] - 6)
    
    return total_balance
In [36]:
# Extract the peak features

total_balance = extract_peak_feature(total_balance)
feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]
In [37]:
feature.head()
Out[37]:
date total_purchase_amt total_redeem_amt is_weekend is_holiday is_firstday_of_holiday is_lastday_of_holiday is_firstday_of_work is_work is_gonna_work_tomorrow ... dis_to_holiday dis_from_holiday dis_to_holiendday dis_from_holiendday dis_from_startofmonth dis_from_middleofmonth dis_from_middleofweek dis_from_endofweek dis_from_purchase_peak dis_from_purchase_valley
0 2014-03-31 398884905.0 423852634.0 0 0 0 0 0 1 0 ... 5 1 10 1 10 10 3 6 1 6
1 2014-04-01 453320585.0 277429358.0 0 0 0 0 0 1 0 ... 4 2 10 2 1 10 2 5 0 5
2 2014-04-02 355347118.0 272612066.0 0 0 0 0 0 1 0 ... 3 3 5 3 2 10 1 4 1 4
3 2014-04-03 363877120.0 266605457.0 0 0 0 0 0 1 0 ... 2 4 4 4 3 10 0 3 2 3
4 2014-04-04 251895894.0 200192637.0 0 0 0 0 0 1 0 ... 1 5 3 5 4 10 1 2 3 2

5 rows × 43 columns

3.2 Analyzing the peak features

In [38]:
draw_point_feature(feature[['total_purchase_amt'] + ['dis_from_purchase_peak','dis_from_purchase_valley']])

3.3 Correlation of the peak features

In [39]:
temp = np.abs(feature[[x for x in feature.columns if ('peak' in x) or ('valley' in x) or (x in ['total_purchase_amt']) ]].corr()['total_purchase_amt'])
temp
Out[39]:
total_purchase_amt          1.000000
dis_from_purchase_peak      0.682149
dis_from_purchase_valley    0.653811
Name: total_purchase_amt, dtype: float64

4. Cyclical factor analysis

4.1 Extracting cyclical factors

In [40]:
def generate_rate(df, month_index):
    total_balance = df.copy()
    pure_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]
    pure_balance = pure_balance[(pure_balance['date'] >= datetime.datetime(2014,3,1)) & (pure_balance['date'] < datetime.datetime(2014, month_index, 1))]
    pure_balance['weekday'] = pure_balance['date'].dt.weekday
    pure_balance['day'] = pure_balance['date'].dt.day
    pure_balance['week'] = pure_balance['date'].dt.week
    pure_balance['month'] = pure_balance['date'].dt.month
    weekday_rate = pure_balance[['weekday']+labels].groupby('weekday',as_index=False).mean()
    for name in labels:
        weekday_rate = weekday_rate.rename(columns={name: name+'_weekdaymean'})
    weekday_rate['total_purchase_amt_weekdaymean'] /= np.mean(pure_balance['total_purchase_amt'])
    weekday_rate['total_redeem_amt_weekdaymean'] /= np.mean(pure_balance['total_redeem_amt'])
    pure_balance = pd.merge(pure_balance, weekday_rate, on='weekday', how='left')
    weekday_count = pure_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()
    weekday_count = pd.merge(weekday_count, weekday_rate, on = 'weekday')
    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / (len(set(pure_balance['month'])) - 1)
    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / (len(set(pure_balance['month'])) - 1)
    day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()
    weekday_rate.columns = ['weekday','purchase_weekdayrate','redeem_weekdayrate']
    day_rate.columns = ['day','purchase_dayrate','redeem_dayrate']
    day_rate['date'] = datetime.datetime(2014, month_index, 1)
    for index, row in day_rate.iterrows():
        if month_index in (2,4,6,9) and row['day'] == 31:
            continue
        day_rate.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))
    day_rate['weekday'] = day_rate['date'].dt.weekday
    day_rate = pd.merge(day_rate, weekday_rate, on='weekday')
    day_rate['purchase_dayrate'] = day_rate['purchase_weekdayrate'] / day_rate['purchase_dayrate']
    day_rate['redeem_dayrate'] = day_rate['redeem_weekdayrate'] / day_rate['redeem_dayrate']
    weekday_rate['month'] = month_index
    day_rate['month'] = month_index
    
    return weekday_rate, day_rate[['day','purchase_dayrate','redeem_dayrate','month']].sort_values('day')
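The weekday factor computed above is, at its core, the ratio between the average amount on a given weekday and the overall average, estimated on the history available before the target month; the day factor then redistributes those weekday factors over the days of the month. A minimal sketch of the weekday-factor idea on toy data (hypothetical numbers, unrelated to the competition dataset):

import numpy as np
import pandas as pd

# two weeks of hypothetical daily purchase amounts, starting on a Monday
toy = pd.DataFrame({
    'date': pd.date_range('2014-03-03', periods=14, freq='D'),
    'total_purchase_amt': [320, 360, 300, 310, 280, 200, 180,
                           330, 370, 310, 300, 290, 210, 190],
})
toy['weekday'] = toy['date'].dt.weekday

# weekday factor = mean amount on that weekday / overall mean amount
weekday_factor = (toy.groupby('weekday')['total_purchase_amt'].mean()
                  / toy['total_purchase_amt'].mean())
print(weekday_factor)  # values above 1 mark strong weekdays (Mon/Tue here), values below 1 weak ones (Sat/Sun)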
In [41]:
# Generate the cyclical factors and merge them into the dataset

weekday_rate_list = []
day_rate_list = []
for i in range(3, 10):
    weekday_rate, day_rate = generate_rate(total_balance, i)
    weekday_rate_list.append(weekday_rate.reset_index(drop=True))
    day_rate_list.append(day_rate.reset_index(drop=True))

weekday_rate_list = pd.concat(weekday_rate_list).reset_index(drop=True)
day_rate_list = pd.concat(day_rate_list).reset_index(drop=True)
total_balance = pd.merge(total_balance, weekday_rate_list, on=['weekday','month'], how='left')
total_balance = pd.merge(total_balance, day_rate_list, on=['day','month'], how='left')
In [42]:
# Special handling: fill missing cyclical-factor values with the column median

for i in [x for x in total_balance.columns 
          if 'rate' in x and x not in labels + date_indexs]:
    total_balance[i] = total_balance[i].fillna(np.nanmedian(total_balance[i]))
In [43]:
total_balance.head()
Out[43]:
date total_purchase_amt total_redeem_amt day month year week weekday is_weekend is_holiday ... dis_from_startofmonth dis_from_middleofmonth dis_from_middleofweek dis_from_endofweek dis_from_purchase_peak dis_from_purchase_valley purchase_weekdayrate redeem_weekdayrate purchase_dayrate redeem_dayrate
0 2014-03-31 398884905.0 423852634.0 31 3 2014 14 0 0 0 ... 10 10 3 6 1 6 1.125999 1.014235 1.114807 1.013956
1 2014-04-01 453320585.0 277429358.0 1 4 2014 14 1 0 0 ... 1 10 2 5 0 5 1.125999 1.014235 1.114807 1.013956
2 2014-04-02 355347118.0 272612066.0 2 4 2014 14 2 0 0 ... 2 10 1 4 1 4 1.125999 1.014235 1.114807 1.013956
3 2014-04-03 363877120.0 266605457.0 3 4 2014 14 3 0 0 ... 3 10 0 3 2 3 1.125999 1.014235 1.114807 1.013956
4 2014-04-04 251895894.0 200192637.0 4 4 2014 14 4 0 0 ... 4 10 1 2 3 2 1.125999 1.014235 1.114807 1.013956

5 rows × 52 columns

4.2 Correlation of the cyclical factors

In [44]:
# Draw the correlation heatmap

draw_correlation_heatmap(total_balance[['total_purchase_amt'] 
                                       + [x for x in total_balance.columns 
                                          if 'rate' in x and x not in labels + date_indexs]])
In [45]:
# Drop the raw date-index columns

feature = total_balance.drop(date_indexs, axis=1)
feature
Out[45]:
date total_purchase_amt total_redeem_amt is_weekend is_holiday is_firstday_of_holiday is_lastday_of_holiday is_firstday_of_work is_work is_gonna_work_tomorrow ... dis_from_startofmonth dis_from_middleofmonth dis_from_middleofweek dis_from_endofweek dis_from_purchase_peak dis_from_purchase_valley purchase_weekdayrate redeem_weekdayrate purchase_dayrate redeem_dayrate
0 2014-03-31 398884905.0 423852634.0 0 0 0 0 0 1 0 ... 10 10 3 6 1 6 1.125999 1.014235 1.114807 1.013956
1 2014-04-01 453320585.0 277429358.0 0 0 0 0 0 1 0 ... 1 10 2 5 0 5 1.125999 1.014235 1.114807 1.013956
2 2014-04-02 355347118.0 272612066.0 0 0 0 0 0 1 0 ... 2 10 1 4 1 4 1.125999 1.014235 1.114807 1.013956
3 2014-04-03 363877120.0 266605457.0 0 0 0 0 0 1 0 ... 3 10 0 3 2 3 1.125999 1.014235 1.114807 1.013956
4 2014-04-04 251895894.0 200192637.0 0 0 0 0 0 1 0 ... 4 10 1 2 3 2 1.125999 1.014235 1.114807 1.013956
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
193 2014-10-10 NaN NaN 0 0 0 0 0 1 0 ... 10 5 1 2 3 2 1.125999 1.014235 1.114807 1.013956
194 2014-10-11 NaN NaN 1 0 0 0 0 0 0 ... 10 4 2 1 4 1 1.125999 1.014235 1.114807 1.013956
195 2014-10-12 NaN NaN 1 0 0 0 0 0 1 ... 10 3 3 0 5 0 1.125999 1.014235 1.114807 1.013956
196 2014-10-13 NaN NaN 0 0 0 0 0 1 0 ... 10 2 3 6 1 6 1.125999 1.014235 1.114807 1.013956
197 2014-10-14 NaN NaN 0 0 0 0 0 1 0 ... 10 1 2 5 0 5 1.125999 1.014235 1.114807 1.013956

198 rows × 47 columns

5. Dynamic time-series features

5.1 Extracting dynamic features

In [46]:
## Extract dynamic features (rolling weekday statistics)

def get_amtfeature_with_time(data: pd.DataFrame)->pd.DataFrame:
    df_tmp_ = data[labels + date_indexs + ['date']].copy()
    total_balance = data.copy()
    
    df_tmp_ = df_tmp_[(df_tmp_['date']>=datetime.datetime(2014,3,3))]
    df_tmp_['weekday'] = df_tmp_['date'].dt.weekday + 1
    df_tmp_['week'] = df_tmp_['date'].dt.week - min(df_tmp_['date'].dt.week) + 1
    df_tmp_['day'] = df_tmp_['date'].dt.day
    df_tmp_['month'] = df_tmp_['date'].dt.month
    df_tmp_.reset_index(inplace=True)
    del df_tmp_['index']
    df_purchase = pd.DataFrame(columns = ['weekday1','weekday2','weekday3','weekday4',
                                          'weekday5','weekday6','weekday7'])
    count = 0

    for i in range(len(df_tmp_)):
        df_purchase.loc[count,'weekday'+str(df_tmp_.loc[i,'weekday'])] = df_tmp_.loc[i,'total_purchase_amt']
        if df_tmp_.loc[i,'weekday'] == 7:
            count = count + 1

    df_tmp_['purchase_weekday_median'] = np.nan
    df_tmp_['purchase_weekday_mean'] = np.nan
    df_tmp_['purchase_weekday_min'] = np.nan
    df_tmp_['purchase_weekday_max'] = np.nan
    df_tmp_['purchase_weekday_std'] = np.nan
    df_tmp_['purchase_weekday_skew'] = np.nan

    for i in range(len(df_tmp_)):
        # statistics start from 2014-03-31 (after the first 4 full weeks)
        if i > 4*7-1:
            df_tmp_.loc[i,'purchase_weekday_median'] = df_purchase.loc[:df_tmp_.loc[i,'week']-2,
                                          'weekday'+str(df_tmp_.loc[i,'weekday'])].median()

            df_tmp_.loc[i,'purchase_weekday_mean'] = df_purchase.loc[:df_tmp_.loc[i,'week']-2,
                                          'weekday'+str(df_tmp_.loc[i,'weekday'])].mean()

            df_tmp_.loc[i,'purchase_weekday_min'] = df_purchase.loc[:df_tmp_.loc[i,'week']-2,
                                          'weekday'+str(df_tmp_.loc[i,'weekday'])].min()    

            df_tmp_.loc[i,'purchase_weekday_max'] = df_purchase.loc[:df_tmp_.loc[i,'week']-2,
                                          'weekday'+str(df_tmp_.loc[i,'weekday'])].max()   

            df_tmp_.loc[i,'purchase_weekday_std'] = df_purchase.loc[:df_tmp_.loc[i,'week']-2,
                                          'weekday'+str(df_tmp_.loc[i,'weekday'])].std() 

            df_tmp_.loc[i,'purchase_weekday_skew'] = df_purchase.loc[:df_tmp_.loc[i,'week']-2,
                                          'weekday'+str(df_tmp_.loc[i,'weekday'])].skew() 

    colList = ['purchase_weekday_median','purchase_weekday_mean','purchase_weekday_min',
               'purchase_weekday_max','purchase_weekday_std','purchase_weekday_skew']
    total_balance = pd.merge(total_balance, df_tmp_[colList+['day','month']], on=['day','month'], how='left')
    return total_balance
In [47]:
# Merge the dynamic features into the dataset

total_balance = get_amtfeature_with_time(total_balance)
In [48]:
# Special handling: fill missing dynamic-feature values with the column median

for i in [x for x in total_balance.columns 
          if '_weekday_' in x and x not in labels + date_indexs]:
    total_balance[i] = total_balance[i].fillna(np.nanmedian(total_balance[i]))
In [49]:
total_balance.head()
Out[49]:
date total_purchase_amt total_redeem_amt day month year week weekday is_weekend is_holiday ... purchase_weekdayrate redeem_weekdayrate purchase_dayrate redeem_dayrate purchase_weekday_median purchase_weekday_mean purchase_weekday_min purchase_weekday_max purchase_weekday_std purchase_weekday_skew
0 2014-03-31 398884905.0 423852634.0 31 3 2014 14 0 0 0 ... 1.125999 1.014235 1.114807 1.013956 303834661.5 3.060830e+08 158219402.0 392838756.0 5.412342e+07 0.357456
1 2014-04-01 453320585.0 277429358.0 1 4 2014 14 1 0 0 ... 1.125999 1.014235 1.114807 1.013956 303834661.5 3.060830e+08 158219402.0 392838756.0 5.412342e+07 0.357456
2 2014-04-02 355347118.0 272612066.0 2 4 2014 14 2 0 0 ... 1.125999 1.014235 1.114807 1.013956 303834661.5 3.060830e+08 158219402.0 392838756.0 5.412342e+07 0.357456
3 2014-04-03 363877120.0 266605457.0 3 4 2014 14 3 0 0 ... 1.125999 1.014235 1.114807 1.013956 303834661.5 3.060830e+08 158219402.0 392838756.0 5.412342e+07 0.357456
4 2014-04-04 251895894.0 200192637.0 4 4 2014 14 4 0 0 ... 1.125999 1.014235 1.114807 1.013956 303834661.5 3.060830e+08 158219402.0 392838756.0 5.412342e+07 0.357456

5 rows × 58 columns

5.2 Correlation of the dynamic features

In [50]:
# Draw the correlation heatmap of the dynamic features

draw_correlation_heatmap(total_balance[['total_purchase_amt'] + 
                                      ['purchase_weekday_median',
                                      'purchase_weekday_mean','purchase_weekday_min',
                                       'purchase_weekday_max','purchase_weekday_std',
                                       'purchase_weekday_skew'
                                      ]])
In [51]:
feature[labels + ['dis_to_nowork', 'dis_to_work', 'dis_from_work', 'purchase_weekdayrate',
       'redeem_dayrate', 'weekday_onehot_5', 'weekday_onehot_6',
       'dis_from_nowork', 'is_holiday', 'weekday_onehot_1', 'weekday_onehot_2',
       'weekday_onehot_0', 'dis_from_middleofweek', 'dis_from_holiendday',
       'weekday_onehot_3', 'is_lastday_of_holiday', 'is_firstday_of_holiday',
       'weekday_onehot_4', 'is_worked_yestday', 'is_second_week',
       'is_third_week', 'dis_from_startofmonth', 'dis_from_holiday', 'total_purchase_amt',
       'total_redeem_amt', 'date']].to_csv('Feature/0615_residual_purchase_origined.csv', index=False)

Feature Elimination

2.1 Removing features that cannot split the dataset effectively

In [52]:
# Plot density estimates of the label, split by each binary feature
plt.figure(figsize=(4 * 6, 6 * len(feature.columns) / 6))
count = 0
for i in [x for x in feature.columns 
          if (x not in labels + date_indexs + ['date']) 
          & ('amt' not in x) & ('dis' not in x) & ('rate' not in x)]:
    count += 1
    if feature[feature[i] == 0].empty:
        continue
    plt.subplot(len(feature.columns) / 4, 4, count)
    
    ax = sns.kdeplot(feature[feature[i] == 0]['total_purchase_amt'], label= str(i) + ' = 0, purchase')
    ax = sns.kdeplot(feature[feature[i] == 1]['total_purchase_amt'], label= str(i) + ' = 1, purchase')
    plt.rcParams.update({'font.size': 8})
    plt.legend(loc = 0)
    plt.title(str(i))
In [53]:
# Drop features that do not split the dataset clearly

purchase_feature_seems_useless += ['is_gonna_work_tomorrow','is_fourth_week','weekday_onehot_4']

2.2 Using MVTest to recover features that are dependent on the label but not correlated with it

In [54]:
feature_low_correlation
Out[54]:
['is_fourth_week',
 'is_secday_of_month',
 'is_firstday_of_work',
 'is_firstday_of_month',
 'is_lastday_of_workday',
 'is_work_on_sunday',
 'is_midmonth',
 'is_first_week',
 'dis_from_holiday',
 'dis_from_startofmonth',
 'dis_from_middleofmonth']
In [55]:
# MVtest Ref: https://github.com/ChuanyuXue/MVTest

l = mvtest.mvtest()

name_list = []
Tn_list = []
p_list = []
for i in [i for i in feature_low_correlation if 'is' in i or 'discret' in i]:
    pair = l.test(feature['total_purchase_amt'], feature[i])
    name_list.append(str(i))
    Tn_list.append(pair['Tn'])
    p_list.append(pair['p-value'][0])
temp = pd.DataFrame([name_list,Tn_list]).T.sort_values(1)
temp[1] = np.abs(temp[1])
feature_saved_from_mv_purchase = list(temp.sort_values(1, ascending=False)[temp[1] > 0.5984][0])
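The point of this step: a feature can depend strongly on the label while showing almost no (rank) correlation with it, so a pure correlation filter would wrongly discard it. A toy illustration with synthetic data (not from the competition), reusing the same mvtest call pattern as above:

import numpy as np
import pandas as pd
from mvtpy import mvtest

rng = np.random.RandomState(0)
x = pd.Series(rng.randint(0, 5, 500))                   # a discrete feature, symmetric around 2
y = pd.Series((x - 2) ** 2 + rng.normal(0, 0.1, 500))   # depends on x, but not monotonically

print(np.corrcoef(x, y)[0][1])       # near 0: a correlation filter would drop x

pair = mvtest.mvtest().test(y, x)    # the MV test still flags the dependence
print(pair['Tn'], pair['p-value'][0])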

2.3 Removing multicollinear features

In [56]:
feature = feature[[x for x in feature.columns 
                   if (x not in feature_low_correlation + purchase_feature_seems_useless) or\
                   (x in feature_saved_from_mv_purchase )]]
In [57]:
purchase_cors = feature.corr()
purchase_cors['total_purchase_amt'] = np.abs(purchase_cors['total_purchase_amt'])
feature_lists = list(purchase_cors.sort_values(by='total_purchase_amt',ascending=False).index)[2:]
feature_temp = feature.dropna()
In [58]:
# Note: features are kept in descending order of correlation with the target, and dropped in ascending order of correlation
threshold = 0.8
for i in range(len(feature_lists)):
    for k in range(len(feature_lists)-1, -1, -1):
        if i >= len(feature_lists) or k >= len(feature_lists) or i == k:
            break
        if np.abs(np.corrcoef(feature_temp[feature_lists[i]], feature_temp[feature_lists[k]])[0][1]) > threshold:
            higher_feature_temp = feature_temp[feature_lists[i]] * feature_temp[feature_lists[k]]
            if np.abs(np.corrcoef(feature_temp[feature_lists[i]], higher_feature_temp)[0][1]) <= threshold:
                name = str(feature_lists[i]) + '%%%%' + str(feature_lists[k])
                feature_temp[name] = higher_feature_temp
                feature[name] = feature[feature_lists[i]] * feature[feature_lists[k]]
                feature_lists.append(name)
            feature_temp = feature_temp.drop(feature_lists[k], axis=1)
            feature_lists.remove(feature_lists[k])
In [59]:
feature = feature[[x for x in feature_lists if x not in labels] + labels + ['date']]
In [60]:
feature_lists
Out[60]:
['dis_to_nowork',
 'dis_to_work',
 'dis_from_work',
 'purchase_weekdayrate',
 'total_redeem_amt',
 'redeem_dayrate',
 'weekday_onehot_5',
 'weekday_onehot_6',
 'dis_from_nowork',
 'is_holiday',
 'weekday_onehot_1',
 'weekday_onehot_2',
 'weekday_onehot_0',
 'dis_from_middleofweek',
 'dis_from_holiendday',
 'weekday_onehot_3',
 'is_lastday_of_holiday',
 'is_firstday_of_holiday',
 'is_tailmonth',
 'is_premonth',
 'is_worked_yestday',
 'is_second_week',
 'is_third_week',
 'dis_from_startofmonth',
 'dis_from_holiday',
 'dis_to_nowork%%%%dis_from_purchase_peak']
In [61]:
feature.to_csv('Feature/purchase_feature_droped_0614.csv',index=False)

Selecting the Winning Features

In [62]:
# Split the dataset

def split_data_underline(data):
    trainset = data[(datetime.datetime(2014,4,1) <= data['date']) & (data['date'] < datetime.datetime(2014,8,1))]
    testset = data[(datetime.datetime(2014,8,1) <= data['date']) & (data['date'] < datetime.datetime(2014,9,1))]
    return trainset, testset

3.1 Selecting winning features with the SHAP package

SHAP values assign each feature a fair score based on its contribution to the model's output within the whole feature set.

SHAP can also visualize how the prediction changes when a feature's value is low or high on each sample.

In [63]:
shap.initjs()
from sklearn import tree
model = tree.DecisionTreeRegressor()
train, test = split_data_underline(feature.dropna())
features = [x for x in train.columns if x not in date_indexs]
model.fit(train[features].drop(labels+['date'], axis=1), train['total_purchase_amt'])

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test[features].drop(labels+['date'], axis=1))

shap.summary_plot(shap_values, test[features].drop(labels+['date'], axis=1), plot_type='bar')

shap.summary_plot(shap_values, test[features].drop(labels+['date'], axis=1))

tree_important_purchase = pd.DataFrame(np.mean(np.abs(shap_values), axis=0),[x for x in features if x not in labels + date_indexs + ['date']]).reset_index()
In [64]:
tree_important_purchase = tree_important_purchase.sort_values(0, ascending=False).reset_index(drop=True)
tree_important_purchase = list(tree_important_purchase[:20]['index'])
In [65]:
tree_important_purchase
Out[65]:
['dis_to_nowork',
 'is_tailmonth',
 'redeem_dayrate',
 'is_premonth',
 'purchase_weekdayrate',
 'is_third_week',
 'dis_from_startofmonth',
 'is_worked_yestday',
 'dis_from_holiday',
 'dis_from_nowork',
 'is_second_week',
 'weekday_onehot_1',
 'dis_to_nowork%%%%dis_from_purchase_peak',
 'weekday_onehot_3',
 'weekday_onehot_5',
 'dis_from_middleofweek',
 'dis_from_holiendday',
 'dis_to_work',
 'weekday_onehot_2',
 'weekday_onehot_6']
In [66]:
# Visualize the selected features as a word cloud

def draw_cloud(feature_index: List[str])->None:
    plt.figure(figsize=(20,10))
    plt.subplot(1,2,1)
    ciyun = WordCloud(background_color='white', max_font_size=40)
    ciyun.generate(text=''.join([x+' ' for x in feature_index if x != 'total_purchase_amt']))
    plt.imshow(ciyun, interpolation='bilinear')
    plt.axis("off")
In [67]:
draw_cloud(tree_important_purchase)

3.2 Selecting winning features with the Permutation Importance package

Permutation importance scores a feature by how much the model's validation score degrades when that feature's values are randomly shuffled.

Features whose shuffling barely changes the score contribute little to the model and can be dropped.

In [68]:
model = LinearRegression()
train, test = split_data_underline(feature.dropna())
model.fit(train[features].drop(labels+['date'], axis=1), train['total_purchase_amt'])
perm = PermutationImportance(model, random_state=42).fit(test[features].drop(labels+['date'], axis=1), test['total_purchase_amt'])
liner_important_purchase = pd.DataFrame(np.abs(perm.feature_importances_), [x for x in features if x not in labels + date_indexs + ['date']]).reset_index()
eli5.show_weights(perm, feature_names=list(str(x) for x in features if x not in labels + ['date']))
Out[68]:
Weight Feature
1.5833 ± 0.2491 dis_from_middleofweek
1.0012 ± 0.9894 dis_to_work
0.6924 ± 0.3141 weekday_onehot_2
0.6844 ± 0.4412 weekday_onehot_3
0.5292 ± 0.4108 dis_from_nowork
0.5021 ± 0.4352 weekday_onehot_1
0.4113 ± 0.1239 dis_from_work
0.3898 ± 0.1135 weekday_onehot_5
0.2494 ± 0.1985 weekday_onehot_0
0.1342 ± 0.2148 weekday_onehot_6
0.0626 ± 0.0867 redeem_dayrate
0.0291 ± 0.1535 is_tailmonth
0.0224 ± 0.0126 is_third_week
0.0175 ± 0.0142 dis_from_startofmonth
0.0058 ± 0.0181 is_premonth
0.0030 ± 0.0070 purchase_weekdayrate
0 ± 0.0000 is_lastday_of_holiday
0 ± 0.0000 is_firstday_of_holiday
0 ± 0.0000 dis_from_holiendday
0 ± 0.0000 is_holiday
… 5 more …
In [69]:
liner_important_purchase = liner_important_purchase.sort_values(0, ascending=False).reset_index(drop=True)
liner_important_purchase = list(liner_important_purchase[:20]['index'])
In [70]:
draw_cloud(liner_important_purchase)

3.3 Intersecting the two feature sets to obtain the final winning features

In [71]:
winer_features_purchase = list(set(tree_important_purchase)\
                     & set(liner_important_purchase))
In [72]:
draw_cloud(winer_features_purchase)

Data Preparation & Helper Function Integration

In [73]:
import pandas as pd
import sklearn as skr
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
from typing import *
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
np.random.seed(1024)

labels = ['total_purchase_amt', 'total_redeem_amt']
In [74]:
# Split the dataset (offline and online windows)

def split_data_underline(data: pd.DataFrame)->pd.DataFrame:
    trainset = data[(datetime.datetime(2014,4,1) <= data['date']) & (data['date'] < datetime.datetime(2014,8,1))]
    testset = data[(datetime.datetime(2014,8,1) <= data['date']) & (data['date'] < datetime.datetime(2014,9,1))]
    return trainset, testset

def split_data_online(data: pd.DataFrame)->pd.DataFrame:
    trainset = data[(datetime.datetime(2014,4,1) <= data['date']) & (data['date'] < datetime.datetime(2014,9,1))]
    testset = data[(datetime.datetime(2014,9,1) <= data['date']) & (data['date'] < datetime.datetime(2014,10,1))]
    return trainset, testset

In [75]:
# Define the evaluation functions

def AE(y: Iterable, yhat: Iterable)->Iterable:
    return np.abs(y - yhat) / np.abs(y)

def total_AE(purchasehat: Iterable, redeemhat: Iterable, purchase: Iterable, redeem: Iterable, h: float = 0.3)->float:
    return sum(map(lambda x : np.exp(-x/h)*10, AE(purchase, purchasehat))) * 0.45 + sum(map(lambda x : np.exp(-x/h)*10, AE(redeem, redeemhat))) * 0.55
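Each day therefore contributes at most 10 points per target (10 * exp(-relative_error / 0.3)), with purchases weighted 0.45 and redemptions 0.55. A quick sanity check of total_AE on made-up numbers (three days, purchases off by 10%, 20% and 0%, redemptions predicted exactly):

import pandas as pd

purchase     = pd.Series([100., 200., 300.])
purchase_hat = pd.Series([110., 160., 300.])
redeem       = pd.Series([ 80., 150., 240.])
redeem_hat   = redeem.copy()

# purchase side: 10*exp(-0.1/0.3) + 10*exp(-0.2/0.3) + 10*exp(0) ≈ 22.3
# total score ≈ 22.3 * 0.45 + 30 * 0.55 ≈ 26.5
print(total_AE(purchase_hat, redeem_hat, purchase, redeem))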
In [76]:
# Validate the model over several sliding time windows

def week_evalution_single(data: pd.DataFrame, model: object, types: str)->pd.DataFrame:
    results = []
    a_month = relativedelta(months=1)
    for i in [datetime.datetime(2014, 8, 1), datetime.datetime(2014, 7, 25), datetime.datetime(2014, 7, 18), datetime.datetime(2014, 7, 11), 
          datetime.datetime(2014, 7, 4), datetime.datetime(2014, 6, 27), datetime.datetime(2014, 6,20)]:
        trainset = data[(i - 4 * a_month <= data['date']) & (data['date'] < i)]
        testset = data[(i <= data['date']) & (data['date'] < i + a_month)]
        if len(testset) == 0 or len(trainset) == 0:
            i = datetime.datetime(2014, 4, 20)
            trainset = data[(i - 4 * a_month <= data['date']) & (data['date'] < i)]
            testset = data[(i <= data['date']) & (data['date'] < datetime.datetime(2014, 9, 1))]
        feature = [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']]
        
        model.fit(X=trainset[feature], y=trainset['total_' + types + '_amt'])
        result_lr = model.predict(testset[feature])
        
        h = 0.3
        results.append(sum(AE(testset['total_' + types + '_amt'], result_lr).apply(lambda x : np.exp(-x/h))*10))
    return pd.DataFrame(results)
In [77]:
# Build the evaluation table

def draw_eva_table(df: pd.DataFrame)->pd.DataFrame:
    rest = df.copy()
    rest['interval'] = [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11), 
          datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6,20)]
    return rest
In [78]:
# Visualize the predictions against the ground truth

def visual(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->None:
    fig = plt.figure(figsize=(10,4))
    plt.plot(testset['date'], result_purchase_lr, label='predicted_purchase')
    plt.plot(testset['date'], testset['total_purchase_amt'], label='real_purchase')

    plt.legend(loc='best')
    plt.title("The distribution of real and predict purchase")
    plt.xlabel("Time")
    plt.ylabel("Amount")
    plt.show()
    fig = plt.figure(figsize=(10,4))
    sns.barplot(testset['date'].dt.day ,result_purchase_lr - testset['total_purchase_amt'])

    fig = plt.figure(figsize=(10,4))
    plt.plot(testset['date'], result_redeem_lr, label='predicted_redeem')
    plt.plot(testset['date'], testset['total_redeem_amt'], label='real_redeem')

    plt.legend(loc='best')
    plt.title("The distribution of real and predict redeem")
    plt.xlabel("Time")
    plt.ylabel("Amount")
    plt.show()
    fig = plt.figure(figsize=(10,4))
    sns.barplot(testset['date'].dt.day ,result_redeem_lr - testset['total_redeem_amt'])
In [79]:
# Define functions that greedily select the feature subset with the best offline score

def feature_extract(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], List[float]]:
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    score = -1
    for i in features:
        score_update = np.mean(week_evalution_single(data[results + [i] + labels + ['date']], model, types))
        if score_update > score:
            score = score_update
            results.append(i)
    return results, score
    
def robust_feature_extract(data: pd.DataFrame, model: object, types: str):
    results = []
    score = -1
    for i in range(10):
        results_update, score_update = feature_extract(data, model, types)
        if score_update > score:
            score = score_update
            results = results_update
        print(results_update, score_update)
    return results
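An illustrative call of the greedy selector above (not a recorded run: the result is stochastic because the candidate order is shuffled, and it assumes the feature table and the 'purchase' target prepared earlier in the notebook):

# greedily grow a purchase-feature subset, scored by the sliding-window evaluation above
selected, best_score = feature_extract(feature.dropna(), LinearRegression(), 'purchase')
print(best_score)   # mean sliding-window score of the selected subset
print(selected)     # the greedily accumulated feature names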
In [80]:
# Define the AIC and BIC criteria

def AIC(L: Iterable, delta: float, n_features: int):
    return L * np.log10(delta) + 2 * (n_features + 1)
def BIC(L: Iterable, delta: float, n_features: int):
    return L * np.log10(delta) + (n_features + 1) * np.log10(L)
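A quick illustration of how the AIC above trades training error against the number of features (hypothetical values, not taken from the notebook):

# model A: 10 features, training MSE 4.0e14; model B: 25 features, training MSE 3.5e14; L = 120 training days
print(AIC(120, 4.0e14, 10))   # 120*log10(4.0e14) + 2*11 ≈ 1774
print(AIC(120, 3.5e14, 25))   # 120*log10(3.5e14) + 2*26 ≈ 1797 -> worse despite the lower MSE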
In [81]:
# Blend models using the AIC criterion

def feature_extract_AIC(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    test_score = 1e9
    train_score = 0
    for i in features:
        test_score_update = np.mean(week_evalution_single(data[results + [i] + labels + ['date']], model, types)[0])
        if test_score_update < test_score:
            test_score = test_score_update
            results.append(i)
            
    trainset, testset = split_data_underline(data)
    feature = results
    model.fit(X=trainset[feature], y=trainset['total_' + types + '_amt'])
    train_result_lr = model.predict(trainset[feature])
    delta = mean_squared_error(train_result_lr, trainset['total_' + types + '_amt'])
    #delta = np.sum(AE(trainset['total_' + types + '_amt'], train_result_lr).apply(lambda x : np.exp(-x/0.1))*10)
    return results, AIC(len(trainset), delta, len(feature))

def multi_model(data: pd.DataFrame, model: object, types: str)->Tuple[List[List[str]], float]:
    features = []
    weights = []
    for i in range(100):
        results_update, score_update = feature_extract_AIC(data, model, types)
        features.append(results_update)
        weights.append(score_update)
    avg = np.mean(weights)
    weights = [x - avg for x in weights]
    weights = [np.power((-1 * x / 2), 10) for x in weights]
    summ = np.sum(weights)
    weights = [x / summ for x in weights]
    return features, weights
In [82]:
# Generate the online (submission) predictions

def generate_online_result(df: pd.DataFrame, feature: Iterable, model = LinearRegression(), target:str = 'total_purchase_amt')->Iterable:
    trainset, testset = split_data_online(df)
    model.fit(X=trainset[feature], y=trainset[target])
    result_purchase_lr = model.predict(testset[feature])
    return result_purchase_lr
In [83]:
def generate_under_result(df: pd.DataFrame, feature: Iterable, model = LinearRegression(), target:str = 'total_purchase_amt')->Iterable:
    trainset, testset = split_data_underline(df)
    model.fit(X=trainset[feature], y=trainset[target])
    result_purchase_lr = model.predict(testset[feature])
    return result_purchase_lr
In [84]:
# Format the predictions for the online submission

def normalize_upload_file(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->pd.DataFrame:
    testset['total_purchase_amt'] = result_purchase_lr
    testset['total_redeem_amt'] = result_redeem_lr
    online_upload = testset[['date','total_purchase_amt','total_redeem_amt']]
    online_upload['date'] = online_upload['date'].astype(str)
    online_upload['date'] = online_upload['date'].str.replace('-','')
    return online_upload
In [85]:
# Visualize the online predictions

def draw_result(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame):
    fig = plt.figure(figsize=(12,6))
    plt.plot(testset['date'], result_purchase_lr, label='online_purchase', linewidth = 0.7, linestyle = '--', marker = 'o')
    plt.plot(testset['date'], result_redeem_lr, label='online_redeem', linewidth = 1.2, marker = '*')
    plt.legend(loc='best')
    plt.title("The predict values")
    plt.xlabel("Time")
    plt.ylabel("Amount")
In [86]:
# 'Overloaded' DataFrame addition: weighted element-wise blend of two result tables

def add_two_df(df1, df2, features = None, left_a = 0.45, right_a = 0.55):
    data = df1.copy()
    if not features:
        features = [x for x in data.columns if x != 'interval']
    for i in features:
        data[i] = (data[i] * left_a + df2[i] * right_a)
    return data
In [87]:
# 'Overloaded' DataFrame multiplication: scale the result columns by eta

def scale_df(df1, features = None, eta = 1):
    data = df1.copy()
    if not features:
        features = [x for x in data.columns if x != 'interval']
    for i in features:
        data[i] *= eta
    return data

1. Modeling with only the 'is' features

In [88]:
winer_features_purchase
Out[88]:
['dis_to_nowork',
 'dis_to_nowork%%%%dis_from_purchase_peak',
 'dis_from_startofmonth',
 'weekday_onehot_2',
 'weekday_onehot_5',
 'redeem_dayrate',
 'is_second_week',
 'is_worked_yestday',
 'is_tailmonth',
 'weekday_onehot_6',
 'dis_from_nowork',
 'is_third_week',
 'weekday_onehot_3',
 'purchase_weekdayrate',
 'dis_from_middleofweek',
 'weekday_onehot_1',
 'dis_to_work',
 'is_premonth']
In [89]:
data = pd.read_csv('Feature/purchase_feature_droped_0614.csv')
data['date'] = pd.to_datetime(data['date'])
In [90]:
trainset, testset = split_data_underline(data)
result_purchase_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_purchase_amt')
result_redeem_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_redeem_amt')

1.1 August forecast

In [91]:
total_AE(result_purchase_lr, result_redeem_lr, testset['total_purchase_amt'], testset['total_redeem_amt'])
Out[91]:
174.06417080343283
In [92]:
draw_eva_table(week_evalution_single(data, model=LinearRegression(), types = 'purchase'))
Out[92]:
0 interval
0 183.800267 2014-08-01
1 172.077478 2014-07-25
2 172.624680 2014-07-18
3 173.423005 2014-07-11
4 152.231692 2014-07-04
5 164.373225 2014-06-27
6 167.673897 2014-06-20

1.2 Sliding-window validation

In [93]:
draw_eva_table(week_evalution_single(data, LinearRegression(), 'redeem'))
Out[93]:
0 interval
0 166.098274 2014-08-01
1 152.456713 2014-07-25
2 189.623702 2014-07-18
3 185.380688 2014-07-11
4 187.920327 2014-07-04
5 194.165195 2014-06-27
6 171.540925 2014-06-20

1.3 Comparative analysis

In [94]:
visual(result_purchase_lr, result_redeem_lr, testset)
In [95]:
result_purchase_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')

1.4 September forecast

In [96]:
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
In [97]:
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190612_only_is.csv',index=False,header=None)

2. Multi-model comparison

In [98]:
def multi_model_eva(data, types:str = 'purchase'):
    results = pd.DataFrame()
    for model in [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), GradientBoostingRegressor(), MLPRegressor(solver='lbfgs'), xgb.XGBRegressor(objective='reg:squarederror')]:
        if results.empty:
            results = draw_eva_table(week_evalution_single(data, model, types)).rename(columns={0: repr(model).split('(')[0]})
        else:
            results = pd.merge(results, \
                               draw_eva_table(week_evalution_single(data, model, types)).rename(columns={0: repr(model).split('(')[0]}), on='interval')
    results = results[['interval'] + [x for x in results.columns if x != 'interval']]
    return results
In [99]:
multi_result = add_two_df(multi_model_eva(data, 'purchase'), multi_model_eva(data, 'redeem')).set_index('interval')
multi_result
Out[99]:
LinearRegression DecisionTreeRegressor RandomForestRegressor GradientBoostingRegressor MLPRegressor XGBRegressor
interval
2014-08-01 174.064171 155.565590 174.302533 167.686436 172.779689 160.549147
2014-07-25 161.286057 154.660426 172.429864 163.442536 159.994705 163.178648
2014-07-18 181.974142 170.567107 187.273027 180.137258 167.449174 179.082897
2014-07-11 179.999731 163.943825 174.478238 173.028963 173.007285 176.611508
2014-07-04 171.860441 155.147499 166.438937 165.822702 166.234121 154.838255
2014-06-27 180.758808 168.512987 185.133194 180.965133 160.983171 172.190603
2014-06-20 169.800762 172.183968 177.182528 174.696406 156.033423 163.660270
In [100]:
multi_result.plot(figsize = (12,5),kind = 'line', marker = 'o', linewidth = 0.7, linestyle = '--')
plt.title('Multi-model comparison')
plt.xlabel('Interval')
plt.ylabel('Score')
Out[100]:
Text(0, 0.5, 'Score')

Neural Network Model

In [101]:
import math
import numpy
import pandas
from keras.layers import LSTM, RNN, GRU, SimpleRNN
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
import os

numpy.random.seed(2019)


class RNNModel(object):
    def __init__(self, look_back=1, epochs_purchase=20, epochs_redeem=40, batch_size=1, verbose=2, patience=10, store_result=False):
        self.look_back = look_back
        self.epochs_purchase = epochs_purchase
        self.epochs_redeem = epochs_redeem
        self.batch_size = batch_size
        self.verbose = verbose
        self.store_result = store_result
        self.patience = patience
        self.purchase = pandas.read_csv('Dataset/date_label.csv', usecols=[1], engine='python') 
        self.redeem = pandas.read_csv('Dataset/date_label.csv', usecols=[2], engine='python')
        
    def access_data(self, data_frame):
        # load the data set
        data_set = data_frame.values
        data_set = data_set.astype('float32')

        # LSTMs are sensitive to the scale of the input data, specifically when the sigmoid (default) or tanh activation functions are used. It can be a good practice to rescale the data to the range of 0-to-1, also called normalizing.
        scaler = MinMaxScaler(feature_range=(0, 1))
        data_set = scaler.fit_transform(data_set)

        # reshape into X=t and Y=t+1
        train_x, train_y, test = self.create_data_set(data_set)

        # reshape input to be [samples, time steps, features]
        train_x = numpy.reshape(train_x, (train_x.shape[0], 1, train_x.shape[1]))
        return train_x, train_y, test, scaler

    # convert an array of values into a data set matrix
    def create_data_set(self, data_set):
        data_x, data_y = [], []
        for i in range(len(data_set)-self.look_back - 30):
            a = data_set[i:(i + self.look_back), 0]
            data_x.append(a)
            data_y.append(list(data_set[i + self.look_back: i + self.look_back + 30, 0]))
        # print(numpy.array(data_y).shape)
        return numpy.array(data_x), numpy.array(data_y), data_set[-self.look_back:, 0].reshape(1, 1, self.look_back)
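    # Illustrative shape note (assuming look_back=40 as in __main__ below): for a series of N daily
    # rows, create_data_set returns train_x of shape (N-70, 40) and train_y of shape (N-70, 30),
    # i.e. each sample maps the previous 40 days to the next 30 days, plus a single test window of
    # shape (1, 1, 40) holding the most recent 40 days used to forecast September.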

    def rnn_model(self, train_x, train_y, epochs):
        model = Sequential()
        model.add(LSTM(64, input_shape=(1, self.look_back), return_sequences=True))
        model.add(LSTM(32, return_sequences=False))
        model.add(Dense(32))
        model.add(Dense(30))
        #model.add(Dense(30))
        model.compile(loss='mean_squared_error', optimizer='adam')
        model.summary()
        early_stopping = EarlyStopping('loss', patience=self.patience)
        history = model.fit(train_x, train_y, epochs=epochs, batch_size=self.batch_size, verbose=self.verbose, callbacks=[early_stopping])
        return model

    def predict(self, model, data):
        prediction = model.predict(data)
        return prediction

    def plot_show(self, predict):
        predict = predict[['purchase', 'redeem']]
        # 直接在 pandas 绘图时指定画布大小,避免先 plt.figure 再另开新图而留下空白 Figure
        predict.plot(figsize=(12, 8), linewidth=0.8, marker='o', linestyle='--', markersize=6)
        plt.title('2014年9月预测对比分析:LSTM模型可视化')
        plt.xlabel('时间')
        plt.ylabel('金额')
        new_xticks = ['9.' + str(x) for x in range(1, 31)]
        plt.xticks(numpy.arange(30), new_xticks, rotation=45)
        plt.legend(bbox_to_anchor=(1.05, 0), loc=3, borderaxespad=0)

    def run(self):
        purchase_train_x, purchase_train_y, purchase_test, purchase_scaler = self.access_data(self.purchase)
        redeem_train_x, redeem_train_y, redeem_test, redeem_scaler = self.access_data(self.redeem)

        purchase_model = self.rnn_model(purchase_train_x, purchase_train_y, self.epochs_purchase)
        redeem_model = self.rnn_model(redeem_train_x, redeem_train_y, self.epochs_redeem)

        purchase_predict = self.predict(purchase_model, purchase_test)
        redeem_predict = self.predict(redeem_model, redeem_test)

        test_user = pandas.DataFrame({'report_date': [20140900 + i for i in range(1, 31)]})

        purchase = purchase_scaler.inverse_transform(purchase_predict).reshape(30, 1)
        redeem = redeem_scaler.inverse_transform(redeem_predict).reshape(30, 1)

        test_user['purchase'] = purchase
        test_user['redeem'] = redeem
        print(test_user)

        """Store submit file"""
        if self.store_result is True:
            test_user.to_csv('Dataset/submit_lstm.csv', encoding='utf-8', index=None, header=None)
            
        """plot result picture"""
        self.plot_show(test_user)
        
if __name__ == '__main__':
    initiation = RNNModel(look_back=40, epochs_purchase=150, epochs_redeem=230, batch_size=16, verbose=1, patience=50, store_result=False)
    initiation.run()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 1, 64)             26880     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 30)                990       
=================================================================
Total params: 41,342
Trainable params: 41,342
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150
357/357 [==============================] - 1s 2ms/step - loss: 0.0646
Epoch 2/150
357/357 [==============================] - 0s 202us/step - loss: 0.0231
……(中间各 epoch 的训练日志从略,loss 平稳下降)……
Epoch 150/150
357/357 [==============================] - 0s 216us/step - loss: 0.0076
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_3 (LSTM)                (None, 1, 64)             26880     
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_4 (Dense)              (None, 30)                990       
=================================================================
Total params: 41,342
Trainable params: 41,342
Non-trainable params: 0
_________________________________________________________________
Epoch 1/230
357/357 [==============================] - 1s 2ms/step - loss: 0.1239
Epoch 2/230
357/357 [==============================] - 0s 213us/step - loss: 0.0422
……(中间各 epoch 的训练日志从略,loss 平稳下降)……
Epoch 230/230
357/357 [==============================] - 0s 218us/step - loss: 0.0103
    report_date     purchase       redeem
0      20140901  348423744.0  322023424.0
1      20140902  364116224.0  289541824.0
2      20140903  314362464.0  292815072.0
3      20140904  235335616.0  267737088.0
4      20140905  223891152.0  257827808.0
5      20140906  187502288.0  182770400.0
6      20140907  197273152.0  172102064.0
7      20140908  284323456.0  313361856.0
8      20140909  334362528.0  290888672.0
9      20140910  297960832.0  288162688.0
10     20140911  283303936.0  288217824.0
11     20140912  238455424.0  296960800.0
12     20140913  212268720.0  209290080.0
13     20140914  241748864.0  221454528.0
14     20140915  339355904.0  332502656.0
15     20140916  357754464.0  287828224.0
16     20140917  292172640.0  275982752.0
17     20140918  264310208.0  270322624.0
18     20140919  259663872.0  257026096.0
19     20140920  236437824.0  179520224.0
20     20140921  237860272.0  165486512.0
21     20140922  311783200.0  292050432.0
22     20140923  348307168.0  282247232.0
23     20140924  320614656.0  299530048.0
24     20140925  283868736.0  276986784.0
25     20140926  262741248.0  277232544.0
26     20140927  214875744.0  194831136.0
27     20140928  251362720.0  201820928.0
28     20140929  315660704.0  327805344.0
29     20140930  359690944.0  299992256.0

劣汰后特征对比

In [102]:
data_purchase = pd.read_csv('Feature/purchase_feature_droped_0614.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
In [103]:
data_redeem = pd.read_csv('Feature/redeem_feature_droped_0614.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
In [104]:
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_lr = generate_under_result(data_purchase, [x for x in data_purchase.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_purchase_amt')
In [105]:
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_lr = generate_under_result(data_redeem, [x for x in data_redeem.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_redeem_amt')
In [106]:
total_AE(result_purchase_lr, result_redeem_lr, testset_purchase['total_purchase_amt'], testset_redeem['total_redeem_amt'])
Out[106]:
177.52598256862842
In [107]:
add_two_df(multi_model_eva(data_purchase, 'purchase'), multi_model_eva(data_redeem, 'redeem'))
Out[107]:
interval LinearRegression DecisionTreeRegressor RandomForestRegressor GradientBoostingRegressor MLPRegressor XGBRegressor
0 2014-08-01 177.525983 169.056581 178.636148 178.798203 186.447677 158.329306
1 2014-07-25 169.910917 166.708591 169.725779 171.646263 171.821540 163.155185
2 2014-07-18 184.160135 177.757060 183.111555 186.391490 186.238349 177.244671
3 2014-07-11 183.707793 172.937738 173.583136 172.331584 185.474452 167.496197
4 2014-07-04 175.802846 162.357548 167.165225 165.404841 183.388025 152.430971
5 2014-06-27 175.492073 162.188395 183.826849 183.137700 180.337426 167.084495
6 2014-06-20 158.633177 166.820516 169.679528 165.158796 151.728044 157.084786
In [108]:
trainset, testset = split_data_underline(data)
visual(result_purchase_lr, result_redeem_lr, testset)
In [109]:
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')

3.1 线性回归

In [110]:
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

Purchase feature

'dis_to_nowork', 'dis_to_work', 'dis_from_work', 'purchase_weekdayrate', 'redeem_dayrate', 'weekday_onehot_5', 'weekday_onehot_6', 'dis_from_nowork', 'is_holiday', 'weekday_onehot_1', 'weekday_onehot_2', 'weekday_onehot_0', 'dis_from_middleofweek', 'dis_from_holiendday', 'weekday_onehot_3', 'is_lastday_of_holiday', 'is_firstday_of_holiday', 'weekday_onehot_4', 'is_worked_yestday', 'is_second_week', 'is_third_week', 'dis_from_startofmonth', 'dis_from_holiday', 'dis_to_nowork%%%%dis_from_purchase_peak', 'total_purchase_amt', 'total_redeem_amt', 'date'

Redeem feature

'is_work', 'dis_from_redeem_valley', 'purchase_weekdayrate', 'redeem_dayrate', 'weekday_onehot_5', 'is_gonna_work_tomorrow', 'is_holiday', 'dis_from_nowork', 'weekday_onehot_0', 'weekday_onehot_1', 'is_firstday_of_holiday', 'weekday_onehot_2', 'is_lastday_of_holiday', 'dis_from_holiday', 'is_work_on_sunday', 'is_firstday_of_work', 'is_secday_of_month', 'dis_from_holiendday', 'dis_from_redeem_valley%%%%dis_from_redeem_peak', 'total_purchase_amt', 'total_redeem_amt', 'date'
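上面两组即劣汰后保留下来的特征列(含目标列与日期列)。可以用一个简单的检查核对落盘的特征表是否与上面的列表一致(示意代码,利用前面已读入的 data_purchase 与 data_redeem):

print(len(data_purchase.columns), len(data_redeem.columns))
print(set(data_purchase.columns) ^ set(data_redeem.columns))   # 两张特征表列名的差异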

In [111]:
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('Dataset/20190614_droped.csv',index=False,header=None)

3.2 MLP

In [112]:
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns 
                                                            if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                            MLPRegressor(solver='lbfgs'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns 
                                                        if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                          MLPRegressor(solver='lbfgs'),'total_redeem_amt')
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
In [113]:
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('Dataset/20190614_droped_MLP.csv',index=False,header=None)

3.3 Xgboost

In [114]:
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns 
                                                            if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                            xgb.XGBRegressor(objective='reg:squarederror'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns 
                                                        if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                          xgb.XGBRegressor(objective='reg:squarederror'),'total_redeem_amt')
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
In [115]:
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('Dataset/20190615_droped_XGB.csv',index=False,header=None)