Data Preparation

Load the Data

In [ ]:
## Mounting Google Drive locally
from google.colab import drive
drive.mount('/content/drive/')

cancer_df = pd.read_csv('/content/drive/My Drive/Capstone/PAT_DATA_With_ZIPCODE.csv')
cancer_df.head()

zipcode = pd.read_csv('/content/drive/My Drive/Capstone/zipcode.csv')
zipcode.head()
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Out[ ]:
STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 ELF CPREP PREP DIR_DEP N2 NUMDEP TOTAL_VITA VITA TCE VITA_EIC RAC ELDERLY A00100 N02650 A02650 N00200 A00200 N00300 A00300 N00600 A00600 N00650 A00650 N00700 A00700 N00900 A00900 N01000 A01000 N01400 A01400 N01700 ... N07220 A07220 N07260 A07260 N09400 A09400 N85770 A85770 N85775 A85775 N09750 A09750 N10600 A10600 N59660 A59660 N59720 A59720 N11070 A11070 N10960 A10960 N11560 A11560 N06500 A06500 N10300 A10300 N85530 A85530 N85300 A85300 N11901 A11901 N11900 A11900 N11902 A11902 N12000 A12000
0 1 AL 0 1 802640.0 474470.0 99850.0 216600.0 717050.0 44090.0 426090.0 580390.0 1259760.0 475750.0 25570.0 17310.0 8250.0 5670.0 212170.0 150660.0 10467448.0 802640.0 10626881.0 640460.0 8368271.0 89770.0 54949.0 40190.0 67982.0 35970.0 45716.0 8330.0 4383.0 144200.0 741047.0 33800.0 35843.0 37360.0 236386.0 102180.0 ... 27250.0 8374.0 870.0 483.0 112110.0 153786.0 54300.0 390448.0 56390.0 397723.0 14430.0 8483.0 741920.0 2078353.0 353820.0 1078908.0 324630.0 967560.0 221720.0 283322.0 55630.0 51730.0 23200.0 14464.0 255600.0 161796.0 363870.0 327165.0 0.0 0.0 0.0 0.0 64680.0 53602.0 700940.0 1803125.0 698100.0 1796343.0 2860.0 4917.0
1 1 AL 0 2 499070.0 218590.0 137460.0 129760.0 448190.0 26230.0 269560.0 365010.0 985860.0 352150.0 11550.0 7570.0 3980.0 370.0 120760.0 112510.0 17977581.0 499070.0 18153235.0 429020.0 14801613.0 90100.0 62310.0 39370.0 91981.0 34800.0 64670.0 41160.0 23705.0 67150.0 254049.0 31780.0 77816.0 36500.0 356129.0 99740.0 ... 120230.0 106522.0 7380.0 6388.0 38700.0 86925.0 27970.0 188902.0 29050.0 201292.0 18970.0 13043.0 487380.0 2065629.0 138510.0 295395.0 115390.0 249017.0 100530.0 139491.0 31740.0 28933.0 6510.0 6541.0 380780.0 896996.0 401700.0 1003316.0 0.0 0.0 0.0 0.0 77660.0 118725.0 419640.0 1175607.0 416180.0 1165352.0 4250.0 8894.0
2 1 AL 0 3 268590.0 89780.0 134440.0 38280.0 241060.0 14160.0 156410.0 175150.0 584720.0 181950.0 2780.0 1430.0 1350.0 0.0 44750.0 78030.0 16502473.0 268590.0 16659518.0 229300.0 12727137.0 82080.0 64795.0 38570.0 122427.0 35410.0 88866.0 54360.0 37829.0 41690.0 266422.0 32780.0 121012.0 30100.0 402312.0 72260.0 ... 74240.0 111795.0 5820.0 5791.0 27650.0 70831.0 3850.0 27765.0 6650.0 47490.0 4510.0 4911.0 264520.0 1799819.0 90.0 33.0 20.0 8.0 9660.0 12750.0 16310.0 15019.0 80.0 88.0 254580.0 1303768.0 258100.0 1388200.0 0.0 0.0 0.0 0.0 67820.0 156752.0 201030.0 560461.0 197060.0 547812.0 5440.0 13482.0
3 1 AL 0 4 170880.0 32180.0 124070.0 11660.0 154120.0 7980.0 101650.0 106500.0 425800.0 130670.0 730.0 420.0 310.0 0.0 20160.0 53770.0 14817087.0 170900.0 14951547.0 146490.0 11098156.0 69200.0 60626.0 34510.0 132608.0 31260.0 96413.0 49240.0 40351.0 29580.0 249368.0 28970.0 153144.0 23990.0 430430.0 53310.0 ... 54170.0 88819.0 2290.0 2031.0 19760.0 57506.0 230.0 1509.0 1380.0 9404.0 720.0 1162.0 169120.0 1721162.0 0.0 0.0 0.0 0.0 180.0 237.0 11340.0 10469.0 0.0 0.0 168810.0 1391918.0 169380.0 1465080.0 0.0 0.0 0.0 0.0 48440.0 141721.0 121930.0 396526.0 118460.0 383588.0 3160.0 12369.0
4 1 AL 0 5 229870.0 22810.0 196990.0 5540.0 208380.0 11120.0 136470.0 117930.0 630970.0 204340.0 690.0 650.0 40.0 0.0 13650.0 74020.0 31131390.0 229850.0 31461757.0 202630.0 22900238.0 123960.0 152168.0 72860.0 417240.0 67830.0 319909.0 109080.0 118952.0 45280.0 679483.0 64050.0 628310.0 38210.0 1017867.0 77350.0 ... 47890.0 64780.0 3090.0 1928.0 32290.0 134318.0 0.0 0.0 330.0 2512.0 240.0 574.0 227720.0 4273327.0 0.0 0.0 0.0 0.0 0.0 0.0 18760.0 17760.0 0.0 0.0 228870.0 3979974.0 229110.0 4147216.0 350.0 104.0 50.0 19.0 91100.0 465160.0 138250.0 588068.0 130970.0 529001.0 8430.0 55564.0

5 rows × 153 columns

In [ ]:
### import libraries ###

# basic
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV, cross_validate  #Perforing grid search
from matplotlib.legend_handler import HandlerLine2D

%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 5

Descriptive Analysis

In [ ]:
# check the shape of the data
cancer_df.shape
Out[ ]:
(5953, 111)
In [ ]:
cancer_df
Out[ ]:
_ visitid siteid patientnumber admissiondate admissiontime dischargedate dischargetime canceldate canceltime drgid patienttypeid transferreasonid room bed clinicid preadmissionnumber hscrcpatienttypeid hscrcmajorserviceid visitnumber referringmedicalfacilityid financialclasseffectivedate placeofsurgeryid natureofsurgeryid financialclassid curfinancialclassid curinsuranceplan1id curinsuranceplan2id curinsuranceplan3id curinsuranceplan4id finalbilldate patientlocationid methodofarrivalid natureofadmissionid sourceofadmissionid admittinghospitalserviceid dischargehospitalserviceid dischargedispositionid employerid privateserviceid ... datefinalizedrevised datecreated dateupdated expr1 group_type hospital discharge_disposition eadnumber corporateidentifier ssn dateofbirth firstname lastname bl_admission_date bl_discharge_date dob age_admission first_readmit_visitid first_readmit_date first_day_diff readmission_90 readmission_90_sum readmission_30 readmission_30_sum comorbidities complications financial_class primary_insurance_plan gender race ethnicity race_cat eth_cat age_admission_group age_admission_decade race2 race3 discharge discharge2 ZIPCODE
0 2504 27742713 7 2200589907 1/28/15 727 2/6/15 1502 NaN NaN 21475.0 216 NaN B701 5A NaN 2.200590e+09 3 NaN 4.371429e+06 1613.0 1/28/15 7.0 79.0 200 203.0 15540.0 9888.0 NaN NaN 2/13/15 457.0 128.0 97.0 569.0 NaN 565 1057.0 2065.0 21.0 ... 4/3/15 1/8/15 7/22/15 CWhitley Colon_Rectum MedStar Georgetown University Hospital Refer/Disch to Home Care 18403645.0 18403645.0 213428177.0 10/14/42 JOHN HUNTER 1/28/15 2/6/15 10/14/42 72 27933488.0 2/12/15 6.0 1 1 1 1 1 2 MEDICARE MEDICARE A 2015 Male White Not Spanish/Hispanic Orig White Not Hispanic or Latino Senior 70-79 White White Value 5 Medical Facility 20839
1 2433 26170864 7 2200260178 3/21/14 1441 3/28/14 1709 NaN NaN 20718.0 210 NaN B701 7A NaN 2.200260e+09 3 NaN 4.123373e+06 1613.0 3/21/14 7.0 79.0 204 204.0 9624.0 NaN NaN NaN 4/7/14 457.0 128.0 97.0 564.0 NaN 565 1053.0 2040.0 24.0 ... NaN 3/10/14 5/27/14 ETALYOWE Pancreas MedStar Georgetown University Hospital Disch to home or self care 23691035.0 23691035.0 96621787.0 3/27/65 DAVID POULOS 3/21/14 3/28/14 3/27/65 48 NaN NaN NaN 0 0 0 0 2 0 HMO UHC POS HMO Male White Not Spanish/Hispanic Orig White Not Hispanic or Latino Adult 40-49 White White Value 1 Home/Self Care/Jail 20839
2 2637 21393532 7 7721442411 12/15/11 1252 12/21/11 1508 NaN NaN 19275.0 210 NaN C411 0A NaN 7.721442e+09 3 NaN 3.474557e+06 1218.0 12/15/11 7.0 79.0 204 204.0 14312.0 NaN NaN NaN 12/28/11 459.0 128.0 97.0 569.0 NaN 565 1053.0 NaN 21.0 ... 1/4/12 11/30/11 5/21/12 ADAVIS2 Pancreas MedStar Georgetown University Hospital Disch to home or self care 19153342.0 19153342.0 NaN 5/1/72 MATTHEW WILLIAMS 12/15/11 12/21/11 5/1/72 39 21508861.0 12/23/11 2.0 1 2 1 1 1 0 HMO BLUE CHOICE OPEN Male White Not Spanish/Hispanic Orig White Not Hispanic or Latino Adult 30-39 White White Value 1 Home/Self Care/Jail 20611
3 1017 6805230 1 3012264655 8/19/04 959 8/22/04 1445 NaN NaN 1620.0 16 NaN IC16 A NaN 3.012265e+09 3 2.0 1.180632e+06 NaN 8/19/04 7.0 12.0 12 164.0 5879.0 NaN NaN NaN 8/25/04 61.0 15.0 13.0 24.0 66.0 68 258.0 76.0 1.0 ... NaN 8/11/04 10/27/12 Dwagner Lung MedStar Franklin Square Medical Center Disch to home or self care 308094.0 308094.0 217348836.0 2/8/38 SONIA BELL 8/19/04 8/22/04 2/8/38 66 NaN NaN NaN 0 0 0 0 1 0 HMO BLUE CHOICE /CAPCARE Female White Not Spanish/Hispanic Orig White Not Hispanic or Latino Senior 60-69 White White Value 1 Home/Self Care/Jail 21017
4 1223 10033541 2 5016074006 1/16/07 901 1/20/07 1300 NaN NaN 3267.0 121 NaN 939 1 NaN 5.016074e+09 3 2.0 1.556189e+06 469.0 1/16/07 7.0 30.0 51 51.0 11581.0 8544.0 NaN NaN 1/23/07 181.0 57.0 32.0 566.0 164.0 164 969.0 1128.0 5.0 ... NaN 12/28/06 11/20/12 dmon Lung MedStar Union Memorial Hospital Disch to home or self care 11422538.0 11422538.0 216387296.0 9/13/41 BEVERLY GILBERT 1/16/07 1/20/07 9/13/41 65 NaN NaN NaN 0 0 0 0 4 2 MEDICARE DNU MC IP A 2007 Female White Not Spanish/Hispanic Orig White Not Hispanic or Latino Senior 60-69 White White Value 1 Home/Self Care/Jail 21017
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5948 3058 14762254 7 7711721758 4/23/08 1847 6/12/08 1530 NaN NaN 7172.0 210 NaN B700 4A NaN 7.711722e+09 3 NaN 2.381948e+06 1218.0 4/23/08 7.0 79.0 200 291.0 11770.0 9642.0 NaN NaN 6/17/08 457.0 128.0 97.0 564.0 NaN 565 1067.0 NaN 24.0 ... 6/17/08 3/16/08 8/5/15 SSMOOT Pancreas MedStar Georgetown University Hospital Disch to on-site Rehab Unit 7765950.0 7765950.0 703079324.0 9/7/22 CHARLES O ROKE 4/23/08 6/12/08 9/7/22 85 NaN NaN NaN 0 0 0 0 2 2 MEDICARE DNU MC IP A 2008 Male White Unknown White Declined/Unknown Senior 80-89 White White Value 5 Rehab 25404
5949 3332 24354726 7 7723008749 3/21/13 1802 3/26/13 1200 NaN NaN 20028.0 216 NaN B601 7A NaN 7.723009e+09 3 NaN 3.837373e+06 1613.0 3/21/13 7.0 79.0 200 203.0 14447.0 9888.0 NaN NaN 4/3/13 456.0 128.0 97.0 569.0 NaN 565 1053.0 NaN 21.0 ... 4/24/13 3/15/13 6/17/13 SRAINEY Pancreas MedStar Georgetown University Hospital Disch to home or self care 1825430.0 1825430.0 232743849.0 11/24/44 IRIS FEARNOW 3/21/13 3/26/13 11/24/44 68 NaN NaN NaN 0 0 0 0 0 0 MEDICARE MEDICARE A 2013 Female White Unknown White Declined/Unknown Senior 60-69 White White Value 1 Home/Self Care/Jail 25403
5950 1490 16065164 2 5019733715 11/11/08 547 12/9/08 1425 NaN NaN 9288.0 110 NaN 524 1 NaN 5.019734e+09 3 2.0 1.920733e+06 478.0 11/11/08 7.0 30.0 51 168.0 11783.0 9938.0 NaN NaN 12/16/08 435.0 57.0 32.0 566.0 164.0 164 978.0 1128.0 5.0 ... NaN 10/28/08 11/20/12 LEskow Esophagus MedStar Union Memorial Hospital Trans to Short-Term Gen Hosp 13406920.0 13406920.0 405484346.0 11/8/39 WILLIAM PETRI 11/11/08 12/9/08 11/8/39 69 NaN NaN NaN 0 0 0 0 2 3 MEDICARE DNU MC IP A 2008 Male White Not Spanish/Hispanic Orig White Not Hispanic or Latino Senior 60-69 White White Value 4 Acute Hospital 17202
5951 4128 16985166 9 23399827 5/21/09 523 6/18/09 1031 NaN NaN 9540.0 231 NaN 2N18 PW NaN 2.339983e+07 3 NaN 8.520581e+06 NaN 5/21/09 7.0 88.0 226 226.0 10432.0 NaN NaN NaN 6/22/09 507.0 137.0 105.0 514.0 NaN 594 564.0 NaN 26.0 ... NaN 5/20/09 3/28/10 NaN Colon_Rectum MedStar Washington Hospital Center Transferred/discharged to an organized home he... 14629631.0 14629631.0 310985594.0 1/15/86 COURTNEY FATE 5/21/09 6/18/09 1/15/86 23 NaN NaN NaN 0 0 0 0 2 3 BLUE CROSS/BLUE SHIELD B72 CAREFIRST BC Female White Not Hispanic Or Latino White Not Hispanic or Latino Adult 20-29 White White Value 5 Medical Facility 60491
5952 5881 23193132 11 34161216 8/3/12 805 8/18/12 1821 NaN NaN 19086.0 308 NaN 4F27 #NAME? NaN 3.416122e+07 3 NaN 9.000010e+11 NaN 8/3/12 7.0 NaN 324 317.0 14338.0 12868.0 NaN NaN 10/2/12 628.0 158.0 168.0 612.0 1024.0 1024 1210.0 2331.0 29.0 ... 10/2/12 7/27/12 3/5/13 VMH107 Lung MedStar Washington Hospital Center Disch to home or self care 20536126.0 20536126.0 580034021.0 6/4/33 IVA SMITH 8/3/12 8/18/12 6/4/33 79 NaN NaN NaN 0 0 0 0 1 0 MEDICARE MEDICARE A 2012 Female African American Not Spanish/Hispanic Orig African American or Black Not Hispanic or Latino Senior 70-79 African American or Black African American or Black Value 1 Home/Self Care/Jail 99999

5953 rows × 111 columns

In [ ]:
# describe the data
cancer_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5953 entries, 0 to 5952
Columns: 111 entries, _ to ZIPCODE
dtypes: float64(45), int64(20), object(46)
memory usage: 5.0+ MB

Data Quality Check

In [ ]:
# data quality check
# null value check 
cancer_df.isna().sum()
Out[ ]:
_                0
visitid          0
siteid           0
patientnumber    0
admissiondate    0
                ..
race2            0
race3            0
discharge        0
discharge2       0
ZIPCODE          0
Length: 111, dtype: int64
In [ ]:
# # get rid of 4 patients that have no ICD-9
# cancer_df = cancer_df[(cancer_df['PatientID'] != 2736) &
#    (cancer_df['PatientID'] != 3640) &
#    (cancer_df['PatientID'] != 3726) &
#    (cancer_df['PatientID'] != 3851)]
In [ ]:
fea_col = "gender age_admission race3 hospital group_type comorbidities complications admissiondate ZIPCODE readmission_30 readmission_90".split()
# Just select all needs variables in the dataset
fea_col
# Have a glance at the those column names
Out[ ]:
['gender',
 'age_admission',
 'race3',
 'hospital',
 'group_type',
 'comorbidities',
 'complications',
 'admissiondate',
 'ZIPCODE',
 'readmission_30',
 'readmission_90']
In [ ]:
# df = cancer_df[cancer_df["age_admission"]>=65][fea_col]
# df
In [ ]:
# df.admissiondate = df["admissiondate"].map(lambda x: x[-4:])#Only take the last four number as the year
# df.head()
# #Transform admissiondate to year of admission
In [ ]:
df = cancer_df
In [ ]:
# df = df[(df["admissiondate"] >= 2003) & (df["admissiondate"] <= 2012)]
In [ ]:
# select feature columns we use
df = df[fea_col]

Data Visualization

Count Plot

In [ ]:
sns.countplot(cancer_df.readmission_30)
# Descriptive statistic summary for age for the patients
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe85f080860>
In [ ]:
sns.countplot(cancer_df.readmission_90)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe85eaf2828>
In [ ]:
df.ZIPCODE.value_counts()
Out[ ]:
21222    177
21220    152
21221    139
20011    119
20019    101
        ... 
14901      1
14905      1
21052      1
2675       1
22539      1
Name: ZIPCODE, Length: 628, dtype: int64
In [ ]:
categorical_fea = ['gender', 'race3', 'hospital', 'group_type', 'comorbidities', 'complications']

for f in categorical_fea:
    plt.figure()
    fig, ax = plt.subplots(figsize=(20,10))
    # Calculate the percentage of target=1 per category value
    cat_perc = df[[f, 'readmission_30']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='readmission_30', ascending=False, inplace=True)
    # Bar plot
    # Order the bars descending on target mean
    sns.barplot(ax=ax, x=f, y='readmission_30', data=cat_perc, order=cat_perc[f], palette='spring')
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    if f == "hospital": ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.tight_layout()
    plt.show();
<Figure size 864x360 with 0 Axes>
<Figure size 864x360 with 0 Axes>
<Figure size 864x360 with 0 Axes>
<Figure size 864x360 with 0 Axes>
<Figure size 864x360 with 0 Axes>
<Figure size 864x360 with 0 Axes>

Distribution Plot

In [ ]:
df['readmission_30'].groupby(df.admissiondate.astype("datetime64").dt.year).count().plot(kind="bar", color = '#16419B')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe85a22d470>
In [ ]:
age = cancer_df["age_admission"]
sns.distplot(age)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe84e23dcf8>

Correlation Plot

In [ ]:
def corr_heatmap(v):
    correlations = df[v].corr()

    # Create color map ranging between two colors
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
    plt.tight_layout()
    plt.show();
corr_heatmap(fea_col)
In [ ]:
sns.lmplot(x='comorbidities', y='complications', data=df, hue='readmission_30', palette='winter', scatter_kws={'alpha':0.5})
plt.show()
In [ ]:
sns.set(style="whitegrid")
ax = sns.countplot(x="race3", data=df)
print(df['race3'].value_counts())

plt.xticks(rotation = -45)
plt.figure(figsize=(100, 20))
White                        3271
African American or Black    1745
Other                         835
Asian/Pacific Islander        102
Name: race3, dtype: int64
Out[ ]:
<Figure size 7200x1440 with 0 Axes>
<Figure size 7200x1440 with 0 Axes>

Data Transformation

Under Sampling

In [ ]:
from sklearn.utils import shuffle
desired_apriori=0.2

# Get the indices per target value
idx_0 = df[df['readmission_30'] == 0].index
idx_1 = df[df['readmission_30'] == 1].index

# Get original number of records per target value
nb_0 = len(df.loc[idx_0])
nb_1 = len(df.loc[idx_1])

# Calculate the undersampling rate and resulting number of records with target=0
undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
undersampled_nb_0 = int(undersampling_rate*nb_0)
print('Rate to undersample records with target=0: {}'.format(undersampling_rate))
print('Number of records with target=0 after undersampling: {}'.format(undersampled_nb_0))

# Randomly select records with target=0 to get at the desired a priori
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)

# Construct list with remaining indices
idx_list = list(undersampled_idx) + list(idx_1)

# Return undersample data frame
df_undersample = df.loc[idx_list].reset_index(drop=True)
Rate to undersample records with target=0: 0.6635330983157071
Number of records with target=0 after undersampling: 3388
In [ ]:
sns.countplot(df_undersample.readmission_30)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe858e798d0>
In [ ]:
sns.countplot(df_undersample.readmission_90)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe858ded7b8>

Train Test Split

In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(df_undersample.drop(["readmission_30", "readmission_90"], axis= 1), df_undersample['readmission_30'], df_undersample['readmission_90'], test_size=0.2, random_state=102)

Categorical Data Encoding

Target Encoding

In [ ]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
In [ ]:
col_target_encode = ['gender', 'age_admission', 'race3', 'hospital', 'group_type', 'comorbidities', 'complications', 'admissiondate', 'ZIPCODE']

for i in col_target_encode:
  X_train[i], X_test[i] = target_encode(X_train[i], 
                             X_test[i], 
                             target=y1_train, 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
In [ ]:
X_train.head()
Out[ ]:
gender age_admission race3 hospital group_type comorbidities complications admissiondate ZIPCODE
3343 0.187423 0.212648 0.188576 0.250988 0.154047 0.190098 0.248883 0.201220 0.199441
1462 0.183125 0.204065 0.188942 0.166404 0.221607 0.193539 0.156095 0.195473 0.201769
177 0.184055 0.201926 0.209138 0.183579 0.272852 0.162446 0.156612 0.198475 0.229520
3167 0.213763 0.183469 0.210828 0.201750 0.164165 0.253383 0.238175 0.199546 0.203117
2984 0.186920 0.211674 0.185344 0.245803 0.166822 0.212015 0.219753 0.198108 0.201634
In [ ]:
# combine train, test set 
df_for_te = pd.concat([X_train,X_test],axis=0)

y1 = pd.concat([y1_train,y1_test],axis=0)
y2 = pd.concat([y2_train,y2_test],axis=0)

One Hot Encoding

In [ ]:
df_for_oe = df_undersample
In [ ]:
# partition by age group
def age_group(x):
  if (x >= 65) & (x <= 69):
    return 0
  elif (x >= 70) & (x <= 74):
    return 1
  elif (x >= 75) & (x <= 79):
    return 2
  else:
    return 3

df_for_oe.age_admission = df_for_oe.age_admission.apply(lambda x: age_group(x))
In [ ]:
# So as comorbidities and complications
def c_group(x):
  if x == 0:
    return 0
  elif x == 1:
    return 1
  elif x == 2:
    return 2
  else:
    return 3


df_for_oe.comorbidities = df_for_oe.comorbidities.apply(lambda x: c_group(x))

df_for_oe.complications = df_for_oe.complications.apply(lambda x: c_group(x))
In [ ]:
df_for_oe.head()
Out[ ]:
gender age_admission race3 hospital group_type comorbidities complications admissiondate ZIPCODE readmission_30 readmission_90
0 Female 0 White MedStar Good Samaritan Hospital Lung 1 0 11/10/03 21014 0 0
1 Male 3 White MedStar Georgetown University Hospital Colon_Rectum 2 0 10/14/05 20007 0 0
2 Female 3 Other MedStar Georgetown University Hospital Stomach 0 1 5/25/12 20111 0 1
3 Female 3 White MedStar Good Samaritan Hospital Lung 2 3 12/5/07 21093 0 0
4 Male 2 White MedStar Franklin Square Medical Center Lung 1 0 6/13/03 21028 0 0
In [ ]:
df_for_oe.admissiondate = df_for_oe["admissiondate"].map(lambda x: x[-2:]) #Only take the last four number as the year
df_for_oe.head()
Out[ ]:
gender age_admission race3 hospital group_type comorbidities complications admissiondate ZIPCODE readmission_30 readmission_90
0 Female 0 White MedStar Good Samaritan Hospital Lung 1 0 03 21014 0 0
1 Male 3 White MedStar Georgetown University Hospital Colon_Rectum 2 0 05 20007 0 0
2 Female 3 Other MedStar Georgetown University Hospital Stomach 0 1 12 20111 0 1
3 Female 3 White MedStar Good Samaritan Hospital Lung 2 3 07 21093 0 0
4 Male 2 White MedStar Franklin Square Medical Center Lung 1 0 03 21028 0 0
In [ ]:
for i in df_for_oe.columns:
  df_for_oe[i] = df_for_oe[i].apply(lambda x : str(x))
In [ ]:
df_for_oe = pd.get_dummies(df_for_oe[fea_col[:-2]],drop_first=True)
df_for_oe.head()
Out[ ]:
gender_Male age_admission_1 age_admission_2 age_admission_3 race3_Asian/Pacific Islander race3_Other race3_White hospital_MedStar Georgetown University Hospital hospital_MedStar Good Samaritan Hospital hospital_MedStar Harbor Hospital hospital_MedStar Union Memorial Hospital hospital_MedStar Washington Hospital Center hospital_Montgomery General Hospital hospital_St.Mary's Hospital group_type_Esophagus group_type_Liver group_type_Lung group_type_Pancreas group_type_Stomach comorbidities_1 comorbidities_2 comorbidities_3 complications_1 complications_2 complications_3 admissiondate_03 admissiondate_04 admissiondate_05 admissiondate_06 admissiondate_07 admissiondate_08 admissiondate_09 admissiondate_10 admissiondate_11 admissiondate_12 admissiondate_13 admissiondate_14 admissiondate_15 ZIPCODE_14901 ZIPCODE_14905 ... ZIPCODE_33308 ZIPCODE_33411 ZIPCODE_33469 ZIPCODE_33613 ZIPCODE_33771 ZIPCODE_34287 ZIPCODE_34684 ZIPCODE_34698 ZIPCODE_34769 ZIPCODE_34952 ZIPCODE_34986 ZIPCODE_37824 ZIPCODE_38119 ZIPCODE_38769 ZIPCODE_39180 ZIPCODE_40205 ZIPCODE_44224 ZIPCODE_44507 ZIPCODE_45701 ZIPCODE_49080 ZIPCODE_49097 ZIPCODE_55353 ZIPCODE_60050 ZIPCODE_60491 ZIPCODE_6457 ZIPCODE_72642 ZIPCODE_739 ZIPCODE_76028 ZIPCODE_7719 ZIPCODE_7860 ZIPCODE_801 ZIPCODE_80207 ZIPCODE_80403 ZIPCODE_80437 ZIPCODE_8742 ZIPCODE_89117 ZIPCODE_89144 ZIPCODE_91007 ZIPCODE_96753 ZIPCODE_99999
0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 584 columns

In [ ]:
y1_oe = df_undersample['readmission_30'].astype(int)
y2_oe = df_undersample['readmission_90'].astype(int)
X_train_oe, X_test_oe, y1_train_oe, y1_test_oe, y2_train_oe, y2_test_oe = train_test_split(df_for_oe, y1_oe, y2_oe, test_size=0.2, random_state=102)

Label Encoding

In [ ]:
df_for_le = df_undersample
In [ ]:
encoder = LabelEncoder()
df_for_le = df_for_le[fea_col[:-2]].apply(encoder.fit_transform)
In [ ]:
df_for_le.head()
Out[ ]:
gender age_admission race3 hospital group_type comorbidities complications admissiondate ZIPCODE
0 0 0 3 2 3 1 0 1 238
1 1 3 3 1 0 2 0 3 38
2 0 3 2 1 5 0 1 10 63
3 0 3 3 2 3 2 3 5 274
4 1 2 3 0 3 1 0 1 244
In [ ]:
y1_le = df_undersample['readmission_30'].astype(int)
y2_le = df_undersample['readmission_90'].astype(int)
X_train_le, X_test_le, y1_train_le, y1_test_le, y2_train_le, y2_test_le = train_test_split(df_for_le, y1_le, y2_le, test_size=0.2, random_state=102)

Chi Square Test

In [ ]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
categorical_features = df.columns[:10]
df_2 = df[categorical_features].apply(encoder.fit_transform)
In [ ]:
df_2.to_csv('labelencoder_df.csv')
In [ ]:
df.race3.value_counts()
Out[ ]:
White                        3269
African American or Black    1744
Other                         835
Asian/Pacific Islander        101
Name: race3, dtype: int64
In [ ]:
d1 = df[['race3','readmission_30']]
In [ ]:
a = d1[d1.readmission_30 == '0'].iloc[:,0].value_counts()/ sum(d1[d1.readmission_30 == '0'].iloc[:,0].value_counts())
print('target = 1','\n', round(a,2))
target = 1 
 Series([], Name: race3, dtype: float64)
/usr/local/lib/python3.6/dist-packages/pandas/core/ops/array_ops.py:253: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  res_values = method(rvalues)
In [ ]:
def trans(df, col, target):
  data = df[[col,target]]
  v1 = data[data[target] == '0'].iloc[:,0].value_counts()/ sum(data[data[target] == '0'].iloc[:,0].value_counts())
  print('target = 0')
  print(round(100*v1,1))
  print('/////////////////////////////////////////////')
  v2 = data[data[target] == '1'].iloc[:,0].value_counts()/ sum(data[data[target] == '1'].iloc[:,0].value_counts())
  print('target = 1')
  print(round(100*v2,1))
  print("=============================================")
In [ ]:
col_to_use = ['gender', 'age_admission', 'race3', 'hospital', 'group_type', 'comorbidities',
       'complications', 'admissiondate']  # primary_insurance_plan

for col in col_to_use:
  trans(df,col,'readmission_30')
  print("#############################################")
  trans(df,col,'readmission_90')
In [ ]:
df_2 = df[categorical_features].apply(encoder.fit_transform)

Data Modeling

Logistic Regression

In [ ]:
# target encoding
clf_1 = LogisticRegression(max_iter = 1000, random_state=0).fit(X_train, y1_train)
pred_1 = clf_1.predict(X_test)
prob_1 = clf_1.predict_proba(X_test)


# one hot encoding
clf_2 = LogisticRegression(max_iter = 1000, random_state=0).fit(X_train_oe, y1_train_oe)
pred_2 = clf_2.predict(X_test_oe)
prob_2 = clf_2.predict_proba(X_test_oe)

# label encoding
clf_3 = LogisticRegression(max_iter = 1000, random_state=0).fit(X_train_le, y1_train_le)
pred_3 = clf_3.predict(X_test_le)
prob_3 = clf_3.predict_proba(X_test_le)
In [ ]:
fig, axs = plt.subplots(3)
fig.suptitle('The more fluctuate they are, the more distinctive readmission probability they have')
axs[0].plot(pd.DataFrame(prob_1))
axs[1].plot(pd.DataFrame(prob_2))
axs[2].plot(pd.DataFrame(prob_3))
Out[ ]:
[<matplotlib.lines.Line2D at 0x7fe8572340b8>,
 <matplotlib.lines.Line2D at 0x7fe857258a58>]
In [ ]:
fig, axs = plt.subplots(3)
fig.suptitle('The more fluctuate they are, the more distinctive readmission probability they have')

sns.heatmap(pd.DataFrame(confusion_matrix(y1_test, pred_1)), annot=True, ax=axs[0], cmap='copper')
sns.heatmap(pd.DataFrame(confusion_matrix(y1_test_oe, pred_2)), annot=True, ax=axs[1], cmap='copper')
sns.heatmap(pd.DataFrame(confusion_matrix(y1_test_le, pred_3)), annot=True, ax=axs[2], cmap='copper')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe84f4d2b70>

XGBoost

In [ ]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, readmission = 'readmission_30'):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[readmission].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[readmission],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[readmission].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[readmission], dtrain_predprob))
    print(confusion_matrix(dtrain_predictions,dtrain[readmission].values))
    print(classification_report(dtrain_predictions,dtrain[readmission].values))

    pd.DataFrame(alg.predict_proba(dtrain[predictors])).plot()
    plt.show()

    sns.heatmap(pd.DataFrame(confusion_matrix(dtrain_predictions,dtrain[readmission].values)), annot=True, cmap='copper', fmt="d")
    plt.show()


    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [ ]:
# target encoding
df_for_te['readmission_30'] = y1.astype(int)
# X_train['readmission_90'] = y2_train.astype(int)
predictors = [x for x in df_for_te.columns[:-1]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=60,
 max_depth=12,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=2)
modelfit(xgb1, df_for_te, predictors, readmission = "readmission_30")
Model Report
Accuracy : 0.9856
AUC Score (Train): 0.999996
[[3388   61]
 [   0  786]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3449
           1       0.93      1.00      0.96       786

    accuracy                           0.99      4235
   macro avg       0.96      0.99      0.98      4235
weighted avg       0.99      0.99      0.99      4235

In [ ]:
# one hot encoding
df_for_oe['readmission_30'] = y1_oe.astype(int)
# X_train['readmission_90'] = y2_train.astype(int)
predictors = [x for x in df_for_oe.columns[:-1]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=20,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=1)
modelfit(xgb1, df_for_oe, predictors, readmission = "readmission_30")
Model Report
Accuracy : 0.863
AUC Score (Train): 0.943279
[[3382  574]
 [   6  273]]
              precision    recall  f1-score   support

           0       1.00      0.85      0.92      3956
           1       0.32      0.98      0.48       279

    accuracy                           0.86      4235
   macro avg       0.66      0.92      0.70      4235
weighted avg       0.95      0.86      0.89      4235

In [ ]:
# label encoding
def norm(col):
  return col / np.sqrt(np.sum([x**2 for x in col]))

for i in df_for_le.columns[1:]:
  df_for_le[i] = norm(df_for_le[i])

df_for_le['readmission_30'] = y1_le.astype(int)
# X_train['readmission_90'] = y2_train.astype(int)
predictors = [x for x in df_for_le.columns[:-1]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=15,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=123)
modelfit(xgb1, df_for_le, predictors, readmission = "readmission_30")
Model Report
Accuracy : 0.9341
AUC Score (Train): 0.997296
[[3386  277]
 [   2  570]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      3663
           1       0.67      1.00      0.80       572

    accuracy                           0.93      4235
   macro avg       0.84      0.96      0.88      4235
weighted avg       0.96      0.93      0.94      4235

Random Forest

In [ ]:
# target encoding
def my_rf(Xtrain,ytrain,Xtest,ytest):
  rf = RandomForestClassifier()
  rf.fit(Xtrain, ytrain)
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
  max_depth=10, max_features='auto', max_leaf_nodes=None,
  min_impurity_split=1e-07, min_samples_leaf=1,
  min_samples_split=2, min_weight_fraction_leaf=0.0,
  n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
  verbose=0, warm_start=False)
  y_pred = rf.predict(Xtest)

  #Print model report:


  pd.DataFrame(rf.predict_proba(Xtest)).plot()
  plt.show()

  sns.heatmap(pd.DataFrame(confusion_matrix(y_pred,ytest)), annot=True, cmap='copper', fmt="d")
  plt.show()

  false_positive_rate, true_positive_rate, thresholds = roc_curve(ytest, y_pred)
  roc_auc = auc(false_positive_rate, true_positive_rate)
  print(roc_auc)

my_rf(Xtrain = X_train,
      ytrain = y1_train,
      Xtest = X_test,
      ytest = y1_test)


my_rf(Xtrain = X_train_oe,
      ytrain = y1_train_oe,
      Xtest = X_test_oe,
      ytest = y1_test_oe)

my_rf(Xtrain = X_train_le,
      ytrain = y1_train_le,
      Xtest = X_test_le,
      ytest = y1_test_le)
0.508253559155621
0.5060137457044673
0.5112911143838978

Conclusion

This project applied deep learning models to predict the patients' readmission rate within 30 days. XGBoost performed the best with the validation score up to 91.2% on average after 5-fold cross validation.