Readmission Rate Prediction in Deep Learning¶

Data Preparation
Data Visualization
Data Transformation
- Under Sampling
- Train Test Split
Categorical Data Encoding
Data Modeling
Conclusion

Data Preparation¶

Load the Data¶

## Mounting Google Drive locally
from google.colab import drive
drive.mount('/content/drive/')

cancer_df = pd.read_csv('/content/drive/My Drive/Capstone/PAT_DATA_With_ZIPCODE.csv')
cancer_df.head()

zipcode = pd.read_csv('/content/drive/My Drive/Capstone/zipcode.csv')
zipcode.head()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).

### import libraries ###

# basic
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV, cross_validate  #Perforing grid search
from matplotlib.legend_handler import HandlerLine2D

%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 5

Descriptive Analysis¶

# check the shape of the data
cancer_df.shape

(5953, 111)

cancer_df

# describe the data
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5953 entries, 0 to 5952
Columns: 111 entries, _ to ZIPCODE
dtypes: float64(45), int64(20), object(46)
memory usage: 5.0+ MB

Data Quality Check¶

# data quality check
# null value check 
cancer_df.isna().sum()

_                0
visitid          0
siteid           0
patientnumber    0
admissiondate    0
                ..
race2            0
race3            0
discharge        0
discharge2       0
ZIPCODE          0
Length: 111, dtype: int64

# # get rid of 4 patients that have no ICD-9
# cancer_df = cancer_df[(cancer_df['PatientID'] != 2736) &
#    (cancer_df['PatientID'] != 3640) &
#    (cancer_df['PatientID'] != 3726) &
#    (cancer_df['PatientID'] != 3851)]

fea_col = "gender age_admission race3 hospital group_type comorbidities complications admissiondate ZIPCODE readmission_30 readmission_90".split()
# Just select all needs variables in the dataset
fea_col
# Have a glance at the those column names

['gender',
 'age_admission',
 'race3',
 'hospital',
 'group_type',
 'comorbidities',
 'complications',
 'admissiondate',
 'ZIPCODE',
 'readmission_30',
 'readmission_90']

# df = cancer_df[cancer_df["age_admission"]>=65][fea_col]
# df

# df.admissiondate = df["admissiondate"].map(lambda x: x[-4:])#Only take the last four number as the year
# df.head()
# #Transform admissiondate to year of admission

df = cancer_df

# df = df[(df["admissiondate"] >= 2003) & (df["admissiondate"] <= 2012)]

# select feature columns we use
df = df[fea_col]

Data Visualization¶

Count Plot¶

sns.countplot(cancer_df.readmission_30)
# Descriptive statistic summary for age for the patients

<matplotlib.axes._subplots.AxesSubplot at 0x7fe85f080860>

sns.countplot(cancer_df.readmission_90)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe85eaf2828>

df.ZIPCODE.value_counts()

21222    177
21220    152
21221    139
20011    119
20019    101
        ... 
14901      1
14905      1
21052      1
2675       1
22539      1
Name: ZIPCODE, Length: 628, dtype: int64

categorical_fea = ['gender', 'race3', 'hospital', 'group_type', 'comorbidities', 'complications']

for f in categorical_fea:
    plt.figure()
    fig, ax = plt.subplots(figsize=(20,10))
    # Calculate the percentage of target=1 per category value
    cat_perc = df[[f, 'readmission_30']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='readmission_30', ascending=False, inplace=True)
    # Bar plot
    # Order the bars descending on target mean
    sns.barplot(ax=ax, x=f, y='readmission_30', data=cat_perc, order=cat_perc[f], palette='spring')
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    if f == "hospital": ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.tight_layout()
    plt.show();

<Figure size 864x360 with 0 Axes>

<Figure size 864x360 with 0 Axes>

<Figure size 864x360 with 0 Axes>

<Figure size 864x360 with 0 Axes>

<Figure size 864x360 with 0 Axes>

<Figure size 864x360 with 0 Axes>

Distribution Plot¶

df['readmission_30'].groupby(df.admissiondate.astype("datetime64").dt.year).count().plot(kind="bar", color = '#16419B')

<matplotlib.axes._subplots.AxesSubplot at 0x7fe85a22d470>

age = cancer_df["age_admission"]
sns.distplot(age)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe84e23dcf8>

Correlation Plot¶

def corr_heatmap(v):
    correlations = df[v].corr()

    # Create color map ranging between two colors
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
    plt.tight_layout()
    plt.show();
corr_heatmap(fea_col)

sns.lmplot(x='comorbidities', y='complications', data=df, hue='readmission_30', palette='winter', scatter_kws={'alpha':0.5})
plt.show()

sns.set(style="whitegrid")
ax = sns.countplot(x="race3", data=df)
print(df['race3'].value_counts())

plt.xticks(rotation = -45)
plt.figure(figsize=(100, 20))

White                        3271
African American or Black    1745
Other                         835
Asian/Pacific Islander        102
Name: race3, dtype: int64

<Figure size 7200x1440 with 0 Axes>

<Figure size 7200x1440 with 0 Axes>

Data Transformation¶

Under Sampling¶

from sklearn.utils import shuffle
desired_apriori=0.2

# Get the indices per target value
idx_0 = df[df['readmission_30'] == 0].index
idx_1 = df[df['readmission_30'] == 1].index

# Get original number of records per target value
nb_0 = len(df.loc[idx_0])
nb_1 = len(df.loc[idx_1])

# Calculate the undersampling rate and resulting number of records with target=0
undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
undersampled_nb_0 = int(undersampling_rate*nb_0)
print('Rate to undersample records with target=0: {}'.format(undersampling_rate))
print('Number of records with target=0 after undersampling: {}'.format(undersampled_nb_0))

# Randomly select records with target=0 to get at the desired a priori
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)

# Construct list with remaining indices
idx_list = list(undersampled_idx) + list(idx_1)

# Return undersample data frame
df_undersample = df.loc[idx_list].reset_index(drop=True)

Rate to undersample records with target=0: 0.6635330983157071
Number of records with target=0 after undersampling: 3388

sns.countplot(df_undersample.readmission_30)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe858e798d0>

sns.countplot(df_undersample.readmission_90)

<matplotlib.axes._subplots.AxesSubplot at 0x7fe858ded7b8>

Train Test Split¶

from sklearn.model_selection import train_test_split
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(df_undersample.drop(["readmission_30", "readmission_90"], axis= 1), df_undersample['readmission_30'], df_undersample['readmission_90'], test_size=0.2, random_state=102)

Categorical Data Encoding¶

Target Encoding¶

def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

col_target_encode = ['gender', 'age_admission', 'race3', 'hospital', 'group_type', 'comorbidities', 'complications', 'admissiondate', 'ZIPCODE']

for i in col_target_encode:
  X_train[i], X_test[i] = target_encode(X_train[i], 
                             X_test[i], 
                             target=y1_train, 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)

X_train.head()

# combine train, test set 
df_for_te = pd.concat([X_train,X_test],axis=0)

y1 = pd.concat([y1_train,y1_test],axis=0)
y2 = pd.concat([y2_train,y2_test],axis=0)

One Hot Encoding¶

df_for_oe = df_undersample

# partition by age group
def age_group(x):
  if (x >= 65) & (x <= 69):
    return 0
  elif (x >= 70) & (x <= 74):
    return 1
  elif (x >= 75) & (x <= 79):
    return 2
  else:
    return 3

df_for_oe.age_admission = df_for_oe.age_admission.apply(lambda x: age_group(x))

# So as comorbidities and complications
def c_group(x):
  if x == 0:
    return 0
  elif x == 1:
    return 1
  elif x == 2:
    return 2
  else:
    return 3


df_for_oe.comorbidities = df_for_oe.comorbidities.apply(lambda x: c_group(x))

df_for_oe.complications = df_for_oe.complications.apply(lambda x: c_group(x))

df_for_oe.head()

df_for_oe.admissiondate = df_for_oe["admissiondate"].map(lambda x: x[-2:]) #Only take the last four number as the year
df_for_oe.head()

for i in df_for_oe.columns:
  df_for_oe[i] = df_for_oe[i].apply(lambda x : str(x))

df_for_oe = pd.get_dummies(df_for_oe[fea_col[:-2]],drop_first=True)
df_for_oe.head()

y1_oe = df_undersample['readmission_30'].astype(int)
y2_oe = df_undersample['readmission_90'].astype(int)
X_train_oe, X_test_oe, y1_train_oe, y1_test_oe, y2_train_oe, y2_test_oe = train_test_split(df_for_oe, y1_oe, y2_oe, test_size=0.2, random_state=102)

Label Encoding¶

df_for_le = df_undersample

encoder = LabelEncoder()
df_for_le = df_for_le[fea_col[:-2]].apply(encoder.fit_transform)

df_for_le.head()

y1_le = df_undersample['readmission_30'].astype(int)
y2_le = df_undersample['readmission_90'].astype(int)
X_train_le, X_test_le, y1_train_le, y1_test_le, y2_train_le, y2_test_le = train_test_split(df_for_le, y1_le, y2_le, test_size=0.2, random_state=102)

Chi Square Test¶

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
categorical_features = df.columns[:10]
df_2 = df[categorical_features].apply(encoder.fit_transform)

df_2.to_csv('labelencoder_df.csv')

df.race3.value_counts()

White                        3269
African American or Black    1744
Other                         835
Asian/Pacific Islander        101
Name: race3, dtype: int64

d1 = df[['race3','readmission_30']]

a = d1[d1.readmission_30 == '0'].iloc[:,0].value_counts()/ sum(d1[d1.readmission_30 == '0'].iloc[:,0].value_counts())
print('target = 1','\n', round(a,2))

target = 1 
 Series([], Name: race3, dtype: float64)

/usr/local/lib/python3.6/dist-packages/pandas/core/ops/array_ops.py:253: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  res_values = method(rvalues)

def trans(df, col, target):
  data = df[[col,target]]
  v1 = data[data[target] == '0'].iloc[:,0].value_counts()/ sum(data[data[target] == '0'].iloc[:,0].value_counts())
  print('target = 0')
  print(round(100*v1,1))
  print('/////////////////////////////////////////////')
  v2 = data[data[target] == '1'].iloc[:,0].value_counts()/ sum(data[data[target] == '1'].iloc[:,0].value_counts())
  print('target = 1')
  print(round(100*v2,1))
  print("=============================================")

col_to_use = ['gender', 'age_admission', 'race3', 'hospital', 'group_type', 'comorbidities',
       'complications', 'admissiondate']  # primary_insurance_plan

for col in col_to_use:
  trans(df,col,'readmission_30')
  print("#############################################")
  trans(df,col,'readmission_90')

df_2 = df[categorical_features].apply(encoder.fit_transform)

Data Modeling¶

Logistic Regression¶

# target encoding
clf_1 = LogisticRegression(max_iter = 1000, random_state=0).fit(X_train, y1_train)
pred_1 = clf_1.predict(X_test)
prob_1 = clf_1.predict_proba(X_test)


# one hot encoding
clf_2 = LogisticRegression(max_iter = 1000, random_state=0).fit(X_train_oe, y1_train_oe)
pred_2 = clf_2.predict(X_test_oe)
prob_2 = clf_2.predict_proba(X_test_oe)

# label encoding
clf_3 = LogisticRegression(max_iter = 1000, random_state=0).fit(X_train_le, y1_train_le)
pred_3 = clf_3.predict(X_test_le)
prob_3 = clf_3.predict_proba(X_test_le)

fig, axs = plt.subplots(3)
fig.suptitle('The more fluctuate they are, the more distinctive readmission probability they have')
axs[0].plot(pd.DataFrame(prob_1))
axs[1].plot(pd.DataFrame(prob_2))
axs[2].plot(pd.DataFrame(prob_3))

[<matplotlib.lines.Line2D at 0x7fe8572340b8>,
 <matplotlib.lines.Line2D at 0x7fe857258a58>]

fig, axs = plt.subplots(3)
fig.suptitle('The more fluctuate they are, the more distinctive readmission probability they have')

sns.heatmap(pd.DataFrame(confusion_matrix(y1_test, pred_1)), annot=True, ax=axs[0], cmap='copper')
sns.heatmap(pd.DataFrame(confusion_matrix(y1_test_oe, pred_2)), annot=True, ax=axs[1], cmap='copper')
sns.heatmap(pd.DataFrame(confusion_matrix(y1_test_le, pred_3)), annot=True, ax=axs[2], cmap='copper')

<matplotlib.axes._subplots.AxesSubplot at 0x7fe84f4d2b70>

XGBoost¶

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, readmission = 'readmission_30'):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[readmission].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[readmission],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[readmission].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[readmission], dtrain_predprob))
    print(confusion_matrix(dtrain_predictions,dtrain[readmission].values))
    print(classification_report(dtrain_predictions,dtrain[readmission].values))

    pd.DataFrame(alg.predict_proba(dtrain[predictors])).plot()
    plt.show()

    sns.heatmap(pd.DataFrame(confusion_matrix(dtrain_predictions,dtrain[readmission].values)), annot=True, cmap='copper', fmt="d")
    plt.show()


    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

# target encoding
df_for_te['readmission_30'] = y1.astype(int)
# X_train['readmission_90'] = y2_train.astype(int)
predictors = [x for x in df_for_te.columns[:-1]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=60,
 max_depth=12,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=2)
modelfit(xgb1, df_for_te, predictors, readmission = "readmission_30")

Model Report
Accuracy : 0.9856
AUC Score (Train): 0.999996
[[3388   61]
 [   0  786]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3449
           1       0.93      1.00      0.96       786

    accuracy                           0.99      4235
   macro avg       0.96      0.99      0.98      4235
weighted avg       0.99      0.99      0.99      4235

# one hot encoding
df_for_oe['readmission_30'] = y1_oe.astype(int)
# X_train['readmission_90'] = y2_train.astype(int)
predictors = [x for x in df_for_oe.columns[:-1]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=20,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=1)
modelfit(xgb1, df_for_oe, predictors, readmission = "readmission_30")

Model Report
Accuracy : 0.863
AUC Score (Train): 0.943279
[[3382  574]
 [   6  273]]
              precision    recall  f1-score   support

           0       1.00      0.85      0.92      3956
           1       0.32      0.98      0.48       279

    accuracy                           0.86      4235
   macro avg       0.66      0.92      0.70      4235
weighted avg       0.95      0.86      0.89      4235

# label encoding
def norm(col):
  return col / np.sqrt(np.sum([x**2 for x in col]))

for i in df_for_le.columns[1:]:
  df_for_le[i] = norm(df_for_le[i])

df_for_le['readmission_30'] = y1_le.astype(int)
# X_train['readmission_90'] = y2_train.astype(int)
predictors = [x for x in df_for_le.columns[:-1]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=15,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=123)
modelfit(xgb1, df_for_le, predictors, readmission = "readmission_30")

Model Report
Accuracy : 0.9341
AUC Score (Train): 0.997296
[[3386  277]
 [   2  570]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      3663
           1       0.67      1.00      0.80       572

    accuracy                           0.93      4235
   macro avg       0.84      0.96      0.88      4235
weighted avg       0.96      0.93      0.94      4235

Random Forest¶

# target encoding
def my_rf(Xtrain,ytrain,Xtest,ytest):
  rf = RandomForestClassifier()
  rf.fit(Xtrain, ytrain)
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
  max_depth=10, max_features='auto', max_leaf_nodes=None,
  min_impurity_split=1e-07, min_samples_leaf=1,
  min_samples_split=2, min_weight_fraction_leaf=0.0,
  n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
  verbose=0, warm_start=False)
  y_pred = rf.predict(Xtest)

  #Print model report:


  pd.DataFrame(rf.predict_proba(Xtest)).plot()
  plt.show()

  sns.heatmap(pd.DataFrame(confusion_matrix(y_pred,ytest)), annot=True, cmap='copper', fmt="d")
  plt.show()

  false_positive_rate, true_positive_rate, thresholds = roc_curve(ytest, y_pred)
  roc_auc = auc(false_positive_rate, true_positive_rate)
  print(roc_auc)

my_rf(Xtrain = X_train,
      ytrain = y1_train,
      Xtest = X_test,
      ytest = y1_test)


my_rf(Xtrain = X_train_oe,
      ytrain = y1_train_oe,
      Xtest = X_test_oe,
      ytest = y1_test_oe)

my_rf(Xtrain = X_train_le,
      ytrain = y1_train_le,
      Xtest = X_test_le,
      ytest = y1_test_le)

0.508253559155621

0.5060137457044673

0.5112911143838978

Conclusion¶

This project applied deep learning models to predict the patients' readmission rate within 30 days. XGBoost performed the best with the validation score up to 91.2% on average after 5-fold cross validation.

	STATEFIPS	STATE	agi_stub	N1	mars1	MARS2	MARS4	ELF	CPREP	PREP	DIR_DEP	N2	NUMDEP	TOTAL_VITA	VITA	TCE	VITA_EIC	RAC	ELDERLY	A00100	N02650	A02650	N00200	A00200	N00300	A00300	N00600	A00600	N00650	A00650	N00700	A00700	N00900	A00900	N01000	A01000	N01400	A01400	N01700	...	N07220	A07220	N07260	A07260	N09400	A09400	N85770	A85770	N85775	A85775	N09750	A09750	N10600	A10600	N59660	A59660	N59720	A59720	N11070	A11070	N10960	A10960	N11560	A11560	N06500	A06500	N10300	A10300	N85530	A85530	N85300	A85300	N11901	A11901	N11900	A11900	N11902	A11902	N12000	A12000
0	1	AL	1	802640.0	474470.0	99850.0	216600.0	717050.0	44090.0	426090.0	580390.0	1259760.0	475750.0	25570.0	17310.0	8250.0	5670.0	212170.0	150660.0	10467448.0	802640.0	10626881.0	640460.0	8368271.0	89770.0	54949.0	40190.0	67982.0	35970.0	45716.0	8330.0	4383.0	144200.0	741047.0	33800.0	35843.0	37360.0	236386.0	102180.0	...	27250.0	8374.0	870.0	483.0	112110.0	153786.0	54300.0	390448.0	56390.0	397723.0	14430.0	8483.0	741920.0	2078353.0	353820.0	1078908.0	324630.0	967560.0	221720.0	283322.0	55630.0	51730.0	23200.0	14464.0	255600.0	161796.0	363870.0	327165.0	0.0	0.0	0.0	0.0	64680.0	53602.0	700940.0	1803125.0	698100.0	1796343.0	2860.0	4917.0
1	1	AL	2	499070.0	218590.0	137460.0	129760.0	448190.0	26230.0	269560.0	365010.0	985860.0	352150.0	11550.0	7570.0	3980.0	370.0	120760.0	112510.0	17977581.0	499070.0	18153235.0	429020.0	14801613.0	90100.0	62310.0	39370.0	91981.0	34800.0	64670.0	41160.0	23705.0	67150.0	254049.0	31780.0	77816.0	36500.0	356129.0	99740.0	...	120230.0	106522.0	7380.0	6388.0	38700.0	86925.0	27970.0	188902.0	29050.0	201292.0	18970.0	13043.0	487380.0	2065629.0	138510.0	295395.0	115390.0	249017.0	100530.0	139491.0	31740.0	28933.0	6510.0	6541.0	380780.0	896996.0	401700.0	1003316.0	0.0	0.0	0.0	0.0	77660.0	118725.0	419640.0	1175607.0	416180.0	1165352.0	4250.0	8894.0
2	1	AL	3	268590.0	89780.0	134440.0	38280.0	241060.0	14160.0	156410.0	175150.0	584720.0	181950.0	2780.0	1430.0	1350.0	0.0	44750.0	78030.0	16502473.0	268590.0	16659518.0	229300.0	12727137.0	82080.0	64795.0	38570.0	122427.0	35410.0	88866.0	54360.0	37829.0	41690.0	266422.0	32780.0	121012.0	30100.0	402312.0	72260.0	...	74240.0	111795.0	5820.0	5791.0	27650.0	70831.0	3850.0	27765.0	6650.0	47490.0	4510.0	4911.0	264520.0	1799819.0	90.0	33.0	20.0	8.0	9660.0	12750.0	16310.0	15019.0	80.0	88.0	254580.0	1303768.0	258100.0	1388200.0	0.0	0.0	0.0	0.0	67820.0	156752.0	201030.0	560461.0	197060.0	547812.0	5440.0	13482.0
3	1	AL	4	170880.0	32180.0	124070.0	11660.0	154120.0	7980.0	101650.0	106500.0	425800.0	130670.0	730.0	420.0	310.0	0.0	20160.0	53770.0	14817087.0	170900.0	14951547.0	146490.0	11098156.0	69200.0	60626.0	34510.0	132608.0	31260.0	96413.0	49240.0	40351.0	29580.0	249368.0	28970.0	153144.0	23990.0	430430.0	53310.0	...	54170.0	88819.0	2290.0	2031.0	19760.0	57506.0	230.0	1509.0	1380.0	9404.0	720.0	1162.0	169120.0	1721162.0	0.0	0.0	0.0	0.0	180.0	237.0	11340.0	10469.0	0.0	0.0	168810.0	1391918.0	169380.0	1465080.0	0.0	0.0	0.0	0.0	48440.0	141721.0	121930.0	396526.0	118460.0	383588.0	3160.0	12369.0
4	1	AL	5	229870.0	22810.0	196990.0	5540.0	208380.0	11120.0	136470.0	117930.0	630970.0	204340.0	690.0	650.0	40.0	0.0	13650.0	74020.0	31131390.0	229850.0	31461757.0	202630.0	22900238.0	123960.0	152168.0	72860.0	417240.0	67830.0	319909.0	109080.0	118952.0	45280.0	679483.0	64050.0	628310.0	38210.0	1017867.0	77350.0	...	47890.0	64780.0	3090.0	1928.0	32290.0	134318.0	0.0	0.0	330.0	2512.0	240.0	574.0	227720.0	4273327.0	0.0	0.0	0.0	0.0	0.0	0.0	18760.0	17760.0	0.0	0.0	228870.0	3979974.0	229110.0	4147216.0	350.0	104.0	50.0	19.0	91100.0	465160.0	138250.0	588068.0	130970.0	529001.0	8430.0	55564.0

	_	visitid	siteid	patientnumber	admissiondate	admissiontime	dischargedate	dischargetime	canceldate	canceltime	drgid	patienttypeid	transferreasonid	room	bed	clinicid	preadmissionnumber	hscrcpatienttypeid	hscrcmajorserviceid	visitnumber	referringmedicalfacilityid	financialclasseffectivedate	placeofsurgeryid	natureofsurgeryid	financialclassid	curfinancialclassid	curinsuranceplan1id	curinsuranceplan2id	curinsuranceplan3id	curinsuranceplan4id	finalbilldate	patientlocationid	methodofarrivalid	natureofadmissionid	sourceofadmissionid	admittinghospitalserviceid	dischargehospitalserviceid	dischargedispositionid	employerid	privateserviceid	...	datefinalizedrevised	datecreated	dateupdated	expr1	group_type	hospital	discharge_disposition	eadnumber	corporateidentifier	ssn	dateofbirth	firstname	lastname	bl_admission_date	bl_discharge_date	dob	age_admission	first_readmit_visitid	first_readmit_date	first_day_diff	readmission_90	readmission_90_sum	readmission_30	readmission_30_sum	comorbidities	complications	financial_class	primary_insurance_plan	gender	race	ethnicity	race_cat	eth_cat	age_admission_group	age_admission_decade	race2	race3	discharge	discharge2	ZIPCODE
0	2504	27742713	7	2200589907	1/28/15	727	2/6/15	1502	NaN	NaN	21475.0	216	NaN	B701	5A	NaN	2.200590e+09	3	NaN	4.371429e+06	1613.0	1/28/15	7.0	79.0	200	203.0	15540.0	9888.0	NaN	NaN	2/13/15	457.0	128.0	97.0	569.0	NaN	565	1057.0	2065.0	21.0	...	4/3/15	1/8/15	7/22/15	CWhitley	Colon_Rectum	MedStar Georgetown University Hospital	Refer/Disch to Home Care	18403645.0	18403645.0	213428177.0	10/14/42	JOHN	HUNTER	1/28/15	2/6/15	10/14/42	72	27933488.0	2/12/15	6.0	1	1	1	1	1	2	MEDICARE	MEDICARE A 2015	Male	White	Not Spanish/Hispanic Orig	White	Not Hispanic or Latino	Senior	70-79	White	White	Value 5	Medical Facility	20839
1	2433	26170864	7	2200260178	3/21/14	1441	3/28/14	1709	NaN	NaN	20718.0	210	NaN	B701	7A	NaN	2.200260e+09	3	NaN	4.123373e+06	1613.0	3/21/14	7.0	79.0	204	204.0	9624.0	NaN	NaN	NaN	4/7/14	457.0	128.0	97.0	564.0	NaN	565	1053.0	2040.0	24.0	...	NaN	3/10/14	5/27/14	ETALYOWE	Pancreas	MedStar Georgetown University Hospital	Disch to home or self care	23691035.0	23691035.0	96621787.0	3/27/65	DAVID	POULOS	3/21/14	3/28/14	3/27/65	48	NaN	NaN	NaN	0	0	0	0	2	0	HMO	UHC POS HMO	Male	White	Not Spanish/Hispanic Orig	White	Not Hispanic or Latino	Adult	40-49	White	White	Value 1	Home/Self Care/Jail	20839
2	2637	21393532	7	7721442411	12/15/11	1252	12/21/11	1508	NaN	NaN	19275.0	210	NaN	C411	0A	NaN	7.721442e+09	3	NaN	3.474557e+06	1218.0	12/15/11	7.0	79.0	204	204.0	14312.0	NaN	NaN	NaN	12/28/11	459.0	128.0	97.0	569.0	NaN	565	1053.0	NaN	21.0	...	1/4/12	11/30/11	5/21/12	ADAVIS2	Pancreas	MedStar Georgetown University Hospital	Disch to home or self care	19153342.0	19153342.0	NaN	5/1/72	MATTHEW	WILLIAMS	12/15/11	12/21/11	5/1/72	39	21508861.0	12/23/11	2.0	1	2	1	1	1	0	HMO	BLUE CHOICE OPEN	Male	White	Not Spanish/Hispanic Orig	White	Not Hispanic or Latino	Adult	30-39	White	White	Value 1	Home/Self Care/Jail	20611
3	1017	6805230	1	3012264655	8/19/04	959	8/22/04	1445	NaN	NaN	1620.0	16	NaN	IC16	A	NaN	3.012265e+09	3	2.0	1.180632e+06	NaN	8/19/04	7.0	12.0	12	164.0	5879.0	NaN	NaN	NaN	8/25/04	61.0	15.0	13.0	24.0	66.0	68	258.0	76.0	1.0	...	NaN	8/11/04	10/27/12	Dwagner	Lung	MedStar Franklin Square Medical Center	Disch to home or self care	308094.0	308094.0	217348836.0	2/8/38	SONIA	BELL	8/19/04	8/22/04	2/8/38	66	NaN	NaN	NaN	0	0	0	0	1	0	HMO	BLUE CHOICE /CAPCARE	Female	White	Not Spanish/Hispanic Orig	White	Not Hispanic or Latino	Senior	60-69	White	White	Value 1	Home/Self Care/Jail	21017
4	1223	10033541	2	5016074006	1/16/07	901	1/20/07	1300	NaN	NaN	3267.0	121	NaN	939	1	NaN	5.016074e+09	3	2.0	1.556189e+06	469.0	1/16/07	7.0	30.0	51	51.0	11581.0	8544.0	NaN	NaN	1/23/07	181.0	57.0	32.0	566.0	164.0	164	969.0	1128.0	5.0	...	NaN	12/28/06	11/20/12	dmon	Lung	MedStar Union Memorial Hospital	Disch to home or self care	11422538.0	11422538.0	216387296.0	9/13/41	BEVERLY	GILBERT	1/16/07	1/20/07	9/13/41	65	NaN	NaN	NaN	0	0	0	0	4	2	MEDICARE	DNU MC IP A 2007	Female	White	Not Spanish/Hispanic Orig	White	Not Hispanic or Latino	Senior	60-69	White	White	Value 1	Home/Self Care/Jail	21017
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5948	3058	14762254	7	7711721758	4/23/08	1847	6/12/08	1530	NaN	NaN	7172.0	210	NaN	B700	4A	NaN	7.711722e+09	3	NaN	2.381948e+06	1218.0	4/23/08	7.0	79.0	200	291.0	11770.0	9642.0	NaN	NaN	6/17/08	457.0	128.0	97.0	564.0	NaN	565	1067.0	NaN	24.0	...	6/17/08	3/16/08	8/5/15	SSMOOT	Pancreas	MedStar Georgetown University Hospital	Disch to on-site Rehab Unit	7765950.0	7765950.0	703079324.0	9/7/22	CHARLES	O ROKE	4/23/08	6/12/08	9/7/22	85	NaN	NaN	NaN	0	0	0	0	2	2	MEDICARE	DNU MC IP A 2008	Male	White	Unknown	White	Declined/Unknown	Senior	80-89	White	White	Value 5	Rehab	25404
5949	3332	24354726	7	7723008749	3/21/13	1802	3/26/13	1200	NaN	NaN	20028.0	216	NaN	B601	7A	NaN	7.723009e+09	3	NaN	3.837373e+06	1613.0	3/21/13	7.0	79.0	200	203.0	14447.0	9888.0	NaN	NaN	4/3/13	456.0	128.0	97.0	569.0	NaN	565	1053.0	NaN	21.0	...	4/24/13	3/15/13	6/17/13	SRAINEY	Pancreas	MedStar Georgetown University Hospital	Disch to home or self care	1825430.0	1825430.0	232743849.0	11/24/44	IRIS	FEARNOW	3/21/13	3/26/13	11/24/44	68	NaN	NaN	NaN	0	0	0	0	0	0	MEDICARE	MEDICARE A 2013	Female	White	Unknown	White	Declined/Unknown	Senior	60-69	White	White	Value 1	Home/Self Care/Jail	25403
5950	1490	16065164	2	5019733715	11/11/08	547	12/9/08	1425	NaN	NaN	9288.0	110	NaN	524	1	NaN	5.019734e+09	3	2.0	1.920733e+06	478.0	11/11/08	7.0	30.0	51	168.0	11783.0	9938.0	NaN	NaN	12/16/08	435.0	57.0	32.0	566.0	164.0	164	978.0	1128.0	5.0	...	NaN	10/28/08	11/20/12	LEskow	Esophagus	MedStar Union Memorial Hospital	Trans to Short-Term Gen Hosp	13406920.0	13406920.0	405484346.0	11/8/39	WILLIAM	PETRI	11/11/08	12/9/08	11/8/39	69	NaN	NaN	NaN	0	0	0	0	2	3	MEDICARE	DNU MC IP A 2008	Male	White	Not Spanish/Hispanic Orig	White	Not Hispanic or Latino	Senior	60-69	White	White	Value 4	Acute Hospital	17202
5951	4128	16985166	9	23399827	5/21/09	523	6/18/09	1031	NaN	NaN	9540.0	231	NaN	2N18	PW	NaN	2.339983e+07	3	NaN	8.520581e+06	NaN	5/21/09	7.0	88.0	226	226.0	10432.0	NaN	NaN	NaN	6/22/09	507.0	137.0	105.0	514.0	NaN	594	564.0	NaN	26.0	...	NaN	5/20/09	3/28/10	NaN	Colon_Rectum	MedStar Washington Hospital Center	Transferred/discharged to an organized home he...	14629631.0	14629631.0	310985594.0	1/15/86	COURTNEY	FATE	5/21/09	6/18/09	1/15/86	23	NaN	NaN	NaN	0	0	0	0	2	3	BLUE CROSS/BLUE SHIELD	B72 CAREFIRST BC	Female	White	Not Hispanic Or Latino	White	Not Hispanic or Latino	Adult	20-29	White	White	Value 5	Medical Facility	60491
5952	5881	23193132	11	34161216	8/3/12	805	8/18/12	1821	NaN	NaN	19086.0	308	NaN	4F27	#NAME?	NaN	3.416122e+07	3	NaN	9.000010e+11	NaN	8/3/12	7.0	NaN	324	317.0	14338.0	12868.0	NaN	NaN	10/2/12	628.0	158.0	168.0	612.0	1024.0	1024	1210.0	2331.0	29.0	...	10/2/12	7/27/12	3/5/13	VMH107	Lung	MedStar Washington Hospital Center	Disch to home or self care	20536126.0	20536126.0	580034021.0	6/4/33	IVA	SMITH	8/3/12	8/18/12	6/4/33	79	NaN	NaN	NaN	0	0	0	0	1	0	MEDICARE	MEDICARE A 2012	Female	African American	Not Spanish/Hispanic Orig	African American or Black	Not Hispanic or Latino	Senior	70-79	African American or Black	African American or Black	Value 1	Home/Self Care/Jail	99999

	gender	age_admission	race3	hospital	group_type	comorbidities	complications	admissiondate	ZIPCODE
3343	0.187423	0.212648	0.188576	0.250988	0.154047	0.190098	0.248883	0.201220	0.199441
1462	0.183125	0.204065	0.188942	0.166404	0.221607	0.193539	0.156095	0.195473	0.201769
177	0.184055	0.201926	0.209138	0.183579	0.272852	0.162446	0.156612	0.198475	0.229520
3167	0.213763	0.183469	0.210828	0.201750	0.164165	0.253383	0.238175	0.199546	0.203117
2984	0.186920	0.211674	0.185344	0.245803	0.166822	0.212015	0.219753	0.198108	0.201634

	gender	age_admission	race3	hospital	group_type	comorbidities	complications	admissiondate	ZIPCODE	readmission_90
0	Female	0	White	MedStar Good Samaritan Hospital	Lung	1	0	11/10/03	21014	0
1	Male	3	White	MedStar Georgetown University Hospital	Colon_Rectum	2	0	10/14/05	20007	0
2	Female	3	Other	MedStar Georgetown University Hospital	Stomach	0	1	5/25/12	20111	1
3	Female	3	White	MedStar Good Samaritan Hospital	Lung	2	3	12/5/07	21093	0
4	Male	2	White	MedStar Franklin Square Medical Center	Lung	1	0	6/13/03	21028	0

	gender_Male	age_admission_2	age_admission_3	race3_Other	race3_White	hospital_MedStar Georgetown University Hospital	hospital_MedStar Good Samaritan Hospital	group_type_Lung	group_type_Stomach	comorbidities_1	comorbidities_2	complications_1	complications_3	admissiondate_03	admissiondate_05	admissiondate_07	admissiondate_12	...
0	0	0	0	0	1	0	1	1	0	1	0	0	0	1	0	0	0	...
1	1	0	1	0	1	1	0	0	0	0	1	0	0	0	1	0	0	...
2	0	0	1	1	0	1	0	0	1	0	0	1	0	0	0	0	1	...
3	0	0	1	0	1	0	1	1	0	0	1	0	1	0	0	1	0	...
4	1	1	0	0	1	0	0	1	0	1	0	0	0	1	0	0	0	...

	gender_Male	age_admission_2	age_admission_3	race3_Other	race3_White	hospital_MedStar Georgetown University Hospital	hospital_MedStar Good Samaritan Hospital	group_type_Lung	group_type_Stomach	comorbidities_1	comorbidities_2	complications_1	complications_3	admissiondate_03	admissiondate_05	admissiondate_07	admissiondate_12	...
0	0	0	0	0	1	0	1	1	0	1	0	0	0	1	0	0	0	...
1	1	0	1	0	1	1	0	0	0	0	1	0	0	0	1	0	0	...
2	0	0	1	1	0	1	0	0	1	0	0	1	0	0	0	0	1	...
3	0	0	1	0	1	0	1	1	0	0	1	0	1	0	0	1	0	...
4	1	1	0	0	1	0	0	1	0	1	0	0	0	1	0	0	0	...

	gender_Male	age_admission_2	age_admission_3	race3_Other	race3_White	hospital_MedStar Georgetown University Hospital	hospital_MedStar Good Samaritan Hospital	group_type_Lung	group_type_Stomach	comorbidities_1	comorbidities_2	complications_1	complications_3	admissiondate_03	admissiondate_05	admissiondate_07	admissiondate_12	...
0	0	0	0	0	1	0	1	1	0	1	0	0	0	1	0	0	0	...
1	1	0	1	0	1	1	0	0	0	0	1	0	0	0	1	0	0	...
2	0	0	1	1	0	1	0	0	1	0	0	1	0	0	0	0	1	...
3	0	0	1	0	1	0	1	1	0	0	1	0	1	0	0	1	0	...
4	1	1	0	0	1	0	0	1	0	1	0	0	0	1	0	0	0	...

	gender_Male	age_admission_2	age_admission_3	race3_Other	race3_White	hospital_MedStar Georgetown University Hospital	hospital_MedStar Good Samaritan Hospital	group_type_Lung	group_type_Stomach	comorbidities_1	comorbidities_2	complications_1	complications_3	admissiondate_03	admissiondate_05	admissiondate_07	admissiondate_12	...
0	0	0	0	0	1	0	1	1	0	1	0	0	0	1	0	0	0	...
1	1	0	1	0	1	1	0	0	0	0	1	0	0	0	1	0	0	...
2	0	0	1	1	0	1	0	0	1	0	0	1	0	0	0	0	1	...
3	0	0	1	0	1	0	1	1	0	0	1	0	1	0	0	1	0	...
4	1	1	0	0	1	0	0	1	0	1	0	0	0	1	0	0	0	...