ml_kaggle-home-loan-credit-risk-eda-checkpoint.py (Source)

# Notes
#
# 1. Produce code that shows basic info on both test and training datasets
# 2. The training dataset has an additional category for CODE_GENDER (`XNA`).  Only 4 entries have it so it should be removed. (df_app_train.CODE_GENDER.value_counts())
#


# In[1] Importing modules
import os
import warnings
warnings.filterwarnings('ignore')
import sys

if sys.platform == 'linux':
    sys.path.append('/home/randlow/github/blog2/listings/machine-learning/') # linux
elif sys.platform == 'win32':
    sys.path.append('\\Users\\randl\\github\\blog2\\listings\\machine-learning\\') # win32

import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns
import bokeh as bk

import rand_eda as eda
import pickleshare
# In[2] Changing directory

from pathlib import Path
home = str(Path.home())
if sys.platform == 'linux':
    inputDir = "/datasets/kaggle/home-credit-default-risk" # linux
elif sys.platform == 'win32':
    inputDir = "\datasets\kaggle\home-credit-default-risk" # windows

fullDir = home+inputDir
os.chdir(fullDir)
# In[3]: Reading files into work space

df_app_train = pd.read_csv('application_train.csv',index_col=0)
df_app_test = pd.read_csv('application_test.csv',index_col=0)
# In[4]: Print summary of dataframe info
basic_info_train = eda.print_basic_info_df(df_app_train)
basic_info_test = eda.print_basic_info_df(df_app_test)

comb_basic_info,comb_miss_val,comb_miss_val_app_train,comb_miss_val_app_test = eda.print_compare_df(df_app_train,df_app_test)

# In[5]: Encoding categorical variables

# One of the issues in encoding categorical features is that the training and the testing datasets
# may have a different number of categories in each feature.  This can happen because the testing dataset is
# smaller than the training dataset.  One example of this is the `CODE_GENDER` feature, where the training dataset has
# 4 rows with `XNA` whereas the testing dataset has none.  The `FLAG_DOCUMENT_X` columns show the same behaviour.
# You may consider deleting some of the rows if the "additional" category in the feature is perceived to be irrelevant,
# however you may be losing useful information.  This is a design decision.
#
# The important issue is to ensure that the training and testing datasets are aligned and have the same number of features.
# Any differences between the datasets will impact the `label_encoding` operation, so any feature that you label encode on
# the training dataset must be encoded the same way on the testing dataset.  Do not run `label_encoding` on each dataset
# in isolation without checking that the resulting category mappings agree.
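
# A minimal sketch (with hypothetical category names, not from the Home Credit data) of how
# fitting label encoders on each dataset separately can silently produce different integer codes:
from sklearn.preprocessing import LabelEncoder

le_train = LabelEncoder().fit(['Cash', 'Micro', 'Revolving'])  # training sees three categories
le_test = LabelEncoder().fit(['Cash', 'Revolving'])            # testing sees only two
print(le_train.transform(['Revolving']))  # [2]
print(le_test.transform(['Revolving']))   # [1] -- same label, different code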


df_app_train, label_encode_list_train = eda.label_encoding_df(df_app_train)
df_app_test, label_encode_list_test = eda.label_encoding_df(df_app_test) # label-encoding

print('label_encode columns (Training): {}'.format(label_encode_list_train))
print('label_encode columns (Testing): {}'.format(label_encode_list_test))
# In[16]: One-hot encoding
df_app_train = pd.get_dummies(df_app_train)
df_app_test = pd.get_dummies(df_app_test)

# In[17]:
# We perform one-hot encoding and see that many more columns have been added to the dataset
#

print('Training dataset shape: {}'.format(df_app_train.shape))
print('Testing dataset shape: {}'.format(df_app_test.shape))

# We create a set of the column names of each dataset.  A set is a datatype that only holds unique entries, which makes it easy to compute column differences.

app_train_col = set(df_app_train.columns)
app_test_col = set(df_app_test.columns)

exc_col = app_train_col-app_test_col
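
# Show the training-only columns surfaced by the set difference; this should include
# `TARGET` and any one-hot columns for categories absent from the test set.
print('Columns in train but not in test: {}'.format(sorted(exc_col)))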

# In[18]:

# We need to align the training and testing datasets to have the same set of features; otherwise the machine learning technique will not
# work, since the two datasets would have different feature columns.  The align function performs an
# `inner` join on `axis=1` (the columns), so it only retains features that exist in both datasets.
# We extract the `TARGET` column first, as it exists only in the training dataset and would otherwise be dropped by the inner join.

train_target_labels = df_app_train['TARGET']
df_app_train_align, df_app_test_align = df_app_train.align(df_app_test,join='inner',axis=1)

df_app_train_align['TARGET'] = train_target_labels

del df_app_test
del df_app_train



# In[19]:

# Now when we evaluate the shapes of both the training and testing datasets, we see they have the same number of feature columns.

print('Training dataset shape: {}'.format(df_app_train_align.shape))
print('Testing dataset shape: {}'.format(df_app_test_align.shape))

# We create sets of the column names of the aligned datasets; by taking the difference between the non-aligned and aligned column sets,
# we see which columns have been removed by the `align` function.

app_train_align_col = set(df_app_train_align.columns)
app_test_align_col = set(df_app_test_align.columns)

t1 = app_train_col - app_train_align_col
t2 = app_test_col - app_test_align_col

print('Removed from training dataset: {}'.format(t1))
print('Removed from testing dataset: {}'.format(t2))


# In[20]:

# An important task is to evaluate the quality of the feature dataset for outliers, missing values, or erroneous data.
# The function below performs the following:
#  * Splits the dataset into two populations of `TARGET==0` and `TARGET==1` (i.e., 'No default' and 'Default')
#  * Produces the histogram of each population
#  * Produces the KDE of each population
#  * Produces the rug plot of each population
#
# The histogram and KDEs show us the shape of each population's distribution for the feature.
# If the two population distributions look very different, the feature might be key in distinguishing TARGET=1 from TARGET=0 observations.
# The rug plots can be less useful in this respect as there may be too many datapoints, but rug plots are generally useful graphical univariate representations of a distribution.
# The most detailed approach to evaluating the cleanliness of your data is to produce these plots for every feature and examine each one.
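#
# For illustration, a minimal sketch of what such a helper might do for a single feature,
# using only seaborn/matplotlib (the actual eda.plot_kde_hist_var implementation may differ):

def sketch_kde_hist(df, var):
    fig, (ax_hist, ax_kde) = plt.subplots(1, 2, figsize=(10, 4))
    for target, label in [(0, 'No default'), (1, 'Default')]:
        vals = df.loc[df['TARGET'] == target, var].dropna()
        ax_hist.hist(vals, bins=50, alpha=0.5, label=label)  # histogram per population
        sns.kdeplot(vals, ax=ax_kde, label=label)            # KDE per population
        sns.rugplot(vals, ax=ax_kde, height=0.03)            # rug marks per population
    ax_hist.set_title('{}: histogram'.format(var))
    ax_kde.set_title('{}: KDE with rug plot'.format(var))
    ax_hist.legend()
    ax_kde.legend()

# e.g. sketch_kde_hist(df_app_train_align, 'DAYS_BIRTH')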

varList = df_app_train_align.columns[:5].values
hVal_list,pVal_list= eda.plot_kde_hist_var(df_app_train_align,varList)

# We can examine outlier/erroneous values by looking at each population.  For example, we can see that `DAYS_BIRTH` is fairly even across the entire distribution.
# We can also say that those who default are generally younger, as the `Default` population has higher frequencies for customers with lower `DAYS_BIRTH`.
# For `DAYS_EMPLOYED`, we can tell this feature exhibits erroneous data, as there is a sudden spike in occurrences above 350,000 days, so we should investigate this further.


varList = ['DAYS_BIRTH','DAYS_EMPLOYED']
hVal_list,pVal_list= eda.plot_kde_hist_var(df_app_train_align,varList,drawAll=True)



# In[26]:

# We can investigate `DAYS_EMPLOYED` in greater detail and find that it has a maximum value of `365243`, which does not make sense (roughly 1,000 years of employment).

df_app_train_align['DAYS_EMPLOYED'].describe()

# The next question is how often this value appears; we count its occurrences below and find 55374.


# In[100]:

maxVal = max(df_app_train_align['DAYS_EMPLOYED'])
numErrorVal = df_app_train_align[df_app_train_align['DAYS_EMPLOYED']==maxVal].shape[0]
print('Number of entries with the erroneous value: {}'.format(numErrorVal))


# We want a graphical representation of `DAYS_EMPLOYED` so we can see whether the spike is due to a few small outliers or many datapoints.  What we find from the dataset is that all the datapoints have negative values except for the roughly 55,000 points that are erroneous.

# We should analyze what the anomalous data looks like.  We find there are precisely 55374 anomalous points, and the anomalous points have a lower default rate than the normal ones.  The anomalous points also make up about 18% of the dataset, which is quite high, so we should probably do something about them.
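
# A quick check (sketch) of the claims above: compare default rates for the anomalous
# and normal `DAYS_EMPLOYED` populations, and the share of anomalies in the dataset.
anom = df_app_train_align[df_app_train_align['DAYS_EMPLOYED'] == maxVal]
norm = df_app_train_align[df_app_train_align['DAYS_EMPLOYED'] != maxVal]
print('Anomalous default rate: {:.2%}'.format(anom['TARGET'].mean()))
print('Normal default rate: {:.2%}'.format(norm['TARGET'].mean()))
print('Share of anomalies: {:.2%}'.format(len(anom) / len(df_app_train_align)))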

# In[100]:

# Sometimes erroneous values appear in multiple columns of a dataframe, so we can search for which other columns contain the erroneous value
# and calculate the percentage of erroneous entries in each column.  In this case, both the training and testing dataframes only contain the
# error in `DAYS_EMPLOYED`, where about 18% of entries hold the erroneous value.

df_col_errVal_train, errCol_list_train = eda.chk_val_col(df_app_train_align,maxVal)
df_col_errVal_test, errCol_list_test = eda.chk_val_col(df_app_test_align,maxVal)

print('Training dataset:\n {}'.format(df_col_errVal_train))
print('Testing dataset:\n {}'.format(df_col_errVal_test))
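
# For reference, a plain-pandas sketch of the same scan (the actual eda.chk_val_col may
# differ): compute the share of entries per column equal to the erroneous value.
err_share = (df_app_train_align == maxVal).sum() / len(df_app_train_align)
print(err_share[err_share > 0])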

eda.plot_kde_hist_var(df_app_train_align,errCol_list_train)
# In[27]:

# The function below takes the dataframe in question, the list of columns containing the erroneous value,
# and the erroneous value itself, and replaces those entries with NaNs.

df_app_train_align = eda.fill_errorVal_df(df_app_train_align,errCol_list_train,maxVal)
df_app_test_align = eda.fill_errorVal_df(df_app_test_align,errCol_list_test,maxVal)
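
# For reference, a plain-pandas sketch of the same replacement (the actual
# eda.fill_errorVal_df may also track which entries were anomalous).  It returns a
# new object, so the pipeline above is unaffected:
_demo = df_app_train_align[errCol_list_train].replace({maxVal: np.nan})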
# In[28]:

eda.plot_kde_hist_var(df_app_train_align,errCol_list_train,drawAll=False)

# In[34]:
# Correlation analysis
# We now want to observe the correlations between our features and the target

df_app_train_corr = df_app_train_align.corr()
df_app_train_corr.head()


# Once we have the correlation matrix, we focus on which features have the strongest positive (negative) correlations with the target variable.  We can see below that `TARGET` is positively correlated with the `REGION_RATING_*` variables, and negatively correlated with the `EXT_SOURCE_*`, `DAYS_BIRTH`, and `DAYS_EMPLOYED` variables.  This makes sense, as the probability of default should be lower if the applicant is older or has been employed longer.
# 
# The `NAME_INCOME_TYPE_Working` and `REGION_RATING_*` variables are categorical variables.

# In[35]:


df_app_train_corr_target = df_app_train_corr['TARGET'].sort_values()
print('+ve corr: \n{0}'.format(df_app_train_corr_target.tail(20)))
print('-ve corr: \n{0}'.format(df_app_train_corr_target.head(20)))


# ## Analyze features with greatest correlation magnitude

# At this point we know that features like age, time in employment, and the `EXT_SOURCE_*` scores impact the likelihood of default.  We analyze the KDEs of these feature distributions, comparing those who defaulted against those who did not, to see if we can extract any insightful information.

# In[36]:


var_neg_corr_list = df_app_train_corr_target.head(10).index.values    # most negative correlations
var_pos_corr_list = df_app_train_corr_target[-2:-10:-1].index.values  # most positive correlations, excluding TARGET itself

print(var_pos_corr_list)
print(var_neg_corr_list)


# We plot the KDEs of the most positively (negatively) correlated features with the `TARGET`.  This is to evaluate whether there are any strange distributions between the `default` and `do not default` items.
# 
# If the distributions for each feature are very different for `default` and `do not default`, this is good and we should look out for this. 
# 
# So we can see that `EXT_SOURCE_3` has the most different distributions between default and no default.

# In[37]:



eda.plot_kde_hist_var(df_app_train_align,var_pos_corr_list,drawAll=True)
eda.plot_kde_hist_var(df_app_train_align,var_neg_corr_list,drawAll=True)

# In[38]:


# ### Analyze `DAYS_EMPLOYED`

# We take `DAYS_EMPLOYED` and derive an additional `YEARS_EMPLOYED` column from it.  We then create a new column that bins each observation into equal-width intervals: since there are about 50 employable years, we create 10 bins of 5 years each.  Once each observation is assigned a bin, we can use a `groupby` command to aggregate each set of observations.

# In[39]:


daysEmp_data = df_app_train_align[['TARGET','DAYS_EMPLOYED']].copy() # copy to avoid a SettingWithCopyWarning
daysEmp_data['YEARS_EMPLOYED'] = daysEmp_data['DAYS_EMPLOYED']/365

daysEmp_data['YEARS_EMPLOYED'].hist()

daysEmp_data['YEARS_BINNED'] = pd.cut(daysEmp_data['YEARS_EMPLOYED'],bins=np.linspace(-50,0,num=11))
daysEmp_data.head(10)
daysEmp_data['YEARS_BINNED'].unique()


# From the groupby below we can see that the less time you have been employed, the more likely you are to default.

# In[40]:


daysEmp_group = daysEmp_data.groupby('YEARS_BINNED').mean()
daysEmp_group


# In[41]:


sns.barplot(x=daysEmp_group.index, y=daysEmp_group['TARGET']*100)
plt.xticks(rotation=60)
plt.ylabel('% default')
plt.xlabel('Days Employed Groups (Years)')


# In[42]:

eda.plot_kde_hist_var(df_app_train_align,['DAYS_EMPLOYED'],drawAll=True)

# ### Analyzing credit scores

# We saw that external sources had the highest correlations with `TARGET`, followed by `DAYS_BIRTH` and `DAYS_EMPLOYED`. So we want to take a closer look at these features and their interplay with `TARGET`.
# 
# | Feature Name | Corr. with TARGET |
# | --- | --- |
# | EXT_SOURCE_3 | -0.178919 |
# | EXT_SOURCE_2 | -0.160472 |
# | EXT_SOURCE_1 | -0.155317 |
# | DAYS_BIRTH | -0.078239 |
# | DAYS_EMPLOYED | -0.074958 |

# In[43]:


df_ext_src = df_app_train_align[['TARGET','EXT_SOURCE_3','EXT_SOURCE_2','EXT_SOURCE_1','DAYS_BIRTH']] # 'DAYS_EMPLOYED'
df_ext_src_corr = df_ext_src.corr()
sns.heatmap(df_ext_src_corr,vmin=-1.0,vmax=1.0,annot=True)


# ### Additional graphical analysis for major features

# We want to create a `pairplot` and a `pairgrid` to have a graphical analysis of the most important features of the dataset.  As the original dataset is quite large, we take a sample of it such that we remove all the rows that have `NaN` and then take a random sample of 5000 points.
# 
# We have a 5x5 grid in `pairplot`, as `TARGET` is explicitly included alongside the four features.

# In[44]:


df_ext_src.shape
df_ext_src_sample = df_ext_src.dropna().sample(5000)
sns.pairplot(df_ext_src_sample)


# We use `pairgrid` to create a more informative plot.  In this pairgrid `TARGET` is denoted by the `hue`.  Orange is `TARGET==1` (default), and Blue is `TARGET==0` (no default).
# 
# The pairgrid can be explained as follows:
# 
# * Upper triangle: a scatter plot between the two variables on the X and Y axes, with the `TARGET` variable as a different hue.
# * Diagonal: a KDE plot of the distribution of each variable.
# * Lower triangle: a two-dimensional KDE (density) plot of each pair of variables.

# In[45]:

grid = sns.PairGrid(data = df_ext_src_sample, diag_sharey=True,
                    hue = 'TARGET', 
                    vars = [x for x in list(df_ext_src_sample.columns) if x != 'TARGET'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a KDE plot
grid.map_diag(sns.kdeplot)

# Lower triangle is a 2D density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);


# In[100]:

storeDir = fullDir+'/pickleshare'

db = pickleshare.PickleShareDB(storeDir)
db['df_app_test_align'] = df_app_test_align
db['df_app_train_align'] = df_app_train_align
db['df_app_train_corr_target'] = df_app_train_corr_target
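
# Sanity check (sketch): stored objects can be read back with the same dict-style
# access, e.g. from a subsequent modelling notebook.
print(db['df_app_train_align'].shape)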