# ml_kaggle-home-loan-credit-risk-eda-checkpoint.py (Source)

# Notes
#
# 1. Produce code that shows basic info on both the test and training datasets.
# 2. The training dataset has an additional CODE_GENDER category ('XNA'). Only 4 entries have it, so it should be
#    removed. (df_app_train.CODE_GENDER.value_counts())

# In[1]: Importing modules
import os
import warnings
warnings.filterwarnings('ignore')
import sys

if sys.platform == 'linux':
    sys.path.append('/home/randlow/github/blog2/listings/machine-learning/')  # linux
elif sys.platform == 'win32':
    sys.path.append('\\Users\\randl\\github\\blog2\\listings\\machine-learning\\')  # win32

import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh as bk
import rand_eda as eda
import pickleshare

# In[2]: Changing directory
from pathlib import Path

home = str(Path.home())
if sys.platform == 'linux':
    inputDir = "/datasets/kaggle/home-credit-default-risk"  # linux
elif sys.platform == 'win32':
    inputDir = "\\datasets\\kaggle\\home-credit-default-risk"  # windows
fullDir = home + inputDir
os.chdir(fullDir)

# In[3]: Reading files into the workspace
df_app_train = pd.read_csv('application_train.csv', index_col=0)
df_app_test = pd.read_csv('application_test.csv', index_col=0)

# In[4]: Print a summary of dataframe info
basic_info_train = eda.print_basic_info_df(df_app_train)
basic_info_test = eda.print_basic_info_df(df_app_test)
comb_basic_info, comb_miss_val, comb_miss_val_app_train, comb_miss_val_app_test = eda.print_compare_df(df_app_train, df_app_test)

# In[5]: Encoding categorical variables
# One issue when encoding categorical features is that the training and testing datasets may have a different number
# of categories in a feature. This can happen because the testing dataset is smaller than the training dataset. One
# example is CODE_GENDER, where the training dataset has 4 rows with 'XNA' whereas the testing dataset has none.
# Other features where this happens are the FLAG_DOCUMENT_X columns. You may consider deleting such rows if the
# "additional" category is perceived to be irrelevant, but you may be losing useful information. This is a design
# decision.
#
# What matters is that the training and testing datasets end up aligned with the same set of features. Any
# differences between the two datasets will affect the label-encoding operation, so whichever features you label
# encode on the training dataset, do the same on the testing dataset. A sketch of what such a label-encoding helper
# might look like is given after this cell.

# Do not run label_encoding
df_app_train, label_encode_list_train = eda.label_encoding_df(df_app_train)
df_app_test, label_encode_list_test = eda.label_encoding_df(df_app_test)  # label encoding

print('label_encode columns (Training): {}'.format(label_encode_list_train))
print('label_encode columns (Testing): {}'.format(label_encode_list_test))
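# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original notebook): rand_eda is a private
# helper module, so eda.label_encoding_df's implementation is assumed here. A
# common pattern for this dataset is to label encode only the object columns with
# two or fewer categories (the remaining categoricals are one-hot encoded below);
# the real helper may differ.
def label_encode_binary_cols(df):
    """Label encode object columns that have two or fewer unique categories."""
    le = preprocessing.LabelEncoder()
    encoded_cols = []
    for col in df.select_dtypes(include='object').columns:
        if df[col].nunique(dropna=False) <= 2:
            df[col] = le.fit_transform(df[col].astype(str))
            encoded_cols.append(col)
    return df, encoded_cols
# ---------------------------------------------------------------------------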
# In[16]: One-hot encoding
df_app_train = pd.get_dummies(df_app_train)
df_app_test = pd.get_dummies(df_app_test)

# In[17]:
# After one-hot encoding we see that many more columns have been added to each dataset.
print('Training dataset shape: {}'.format(df_app_train.shape))
print('Testing dataset shape: {}'.format(df_app_test.shape))

# We create a set to store the column names. A set is a datatype that only accepts unique entries.
app_train_col = set(df_app_train.columns)
app_test_col = set(df_app_test.columns)
exc_col = app_train_col - app_test_col

# In[18]:
# We need to align the training and testing datasets so they have the same set of features; otherwise the machine
# learning technique will not work. We extract the TARGET column first, because the testing dataset does not contain
# it and the align step would otherwise drop it from the training data. The align function performs an INNER join on
# axis=1 (the columns), so it only retains features that exist in both datasets.
train_target_labels = df_app_train['TARGET']
df_app_train_align, df_app_test_align = df_app_train.align(df_app_test, join='inner', axis=1)
df_app_train_align['TARGET'] = train_target_labels

del df_app_test
del df_app_train

# In[19]:
# Now when we evaluate the shapes of the training and testing datasets, we see they have the same number of feature
# columns.
print('Training dataset shape: {}'.format(df_app_train_align.shape))
print('Testing dataset shape: {}'.format(df_app_test_align.shape))

# We create sets of the column names of the aligned datasets; the difference between the aligned and non-aligned
# column sets shows which columns were removed by the align function.
app_train_align_col = set(df_app_train_align.columns)
app_test_align_col = set(df_app_test_align.columns)

t1 = app_train_col - app_train_align_col
t2 = app_test_col - app_test_align_col
print(t1)
print(t2)

# In[20]:
# An important task is to check the quality of the feature data for outliers, missing values, and erroneous values.
# The plotting helper used below does the following:
# * Splits the dataset into two populations, TARGET==0 and TARGET==1 (i.e., 'No default' and 'Default')
# * Produces the histogram of each population
# * Produces the KDE of each population
# * Produces the rug plot of each population
# (an illustrative sketch of such a helper follows this cell)
#
# The histograms and KDEs show us the shape of each population's distribution for the feature. If the two
# distributions look very different, the feature may be useful in separating TARGET==1 from TARGET==0 observations.
# Rug plots can be less useful here because there may be too many datapoints, but they are generally a useful
# univariate graphical representation of a distribution.
# The most detailed approach to evaluating the cleanliness of your data is to produce these plots for every feature
# and examine each one.
varList = df_app_train_align.columns[:5].values
hVal_list, pVal_list = eda.plot_kde_hist_var(df_app_train_align, varList)
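# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original notebook): eda.plot_kde_hist_var
# lives in rand_eda, so its implementation is assumed here. A minimal version of
# the idea is to split each feature by TARGET and overlay the two KDEs; the real
# helper also draws histograms and rug plots and returns test statistics
# (hVal_list, pVal_list).
def plot_kde_by_target(df, var_list):
    """Overlay the KDEs of the TARGET==0 and TARGET==1 populations for each feature."""
    for var in var_list:
        plt.figure(figsize=(8, 4))
        sns.kdeplot(df.loc[df['TARGET'] == 0, var].dropna(), label='No default (TARGET==0)')
        sns.kdeplot(df.loc[df['TARGET'] == 1, var].dropna(), label='Default (TARGET==1)')
        plt.xlabel(var)
        plt.ylabel('Density')
        plt.legend()
        plt.show()
# ---------------------------------------------------------------------------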
# We can examine outlier/erroneous values by looking at the population. For example, we can see that DAYS_BIRTH is
# fairly even across the entire distribution. We can also say that those who default are generally younger, as the
# Default population has higher frequencies for customers with lower DAYS_BIRTH. For DAYS_EMPLOYED, we can tell this
# feature contains erroneous data, as there is a sudden spike in occurrences above 350,000 days, so we should
# investigate it further.
varList = ['DAYS_BIRTH', 'DAYS_EMPLOYED']
hVal_list, pVal_list = eda.plot_kde_hist_var(df_app_train_align, varList, drawAll=True)

# In[26]:
# We can investigate DAYS_EMPLOYED in greater detail and find that it has a maximum value of 365243, which does not
# make sense. The next question is how often this value appears.
df_app_train_align['DAYS_EMPLOYED'].describe()

# In[100]:
maxVal = max(df_app_train_align['DAYS_EMPLOYED'])
numErrorVal = df_app_train_align[df_app_train_align['DAYS_EMPLOYED'] == maxVal].shape[0]
print(numErrorVal)

# A graphical representation of DAYS_EMPLOYED shows whether this is due to a few small outliers or many datapoints.
# What we find is that all the datapoints are negative except for more than 50,000 erroneous ones. We should analyze
# what the anomalous data looks like. There are precisely 55374 anomalous points, and they have a lower default rate
# than the normal points. The anomalous points also make up about 18% of the column, which is quite high, so we
# should probably do something about them.

# In[100]:
# Sometimes an erroneous value is used in multiple columns of a dataframe, so we search for which other columns
# contain the erroneous value and calculate the percentage of erroneous entries in each column. In this case, both
# the training and testing dataframes only contain the error in DAYS_EMPLOYED, where roughly 18% of entries have the
# erroneous value.
df_col_errVal_train, errCol_list_train = eda.chk_val_col(df_app_train_align, maxVal)
df_col_errVal_test, errCol_list_test = eda.chk_val_col(df_app_test_align, maxVal)

print('Training dataset:\n {}'.format(df_col_errVal_train))
print('Testing dataset:\n {}'.format(df_col_errVal_test))

eda.plot_kde_hist_var(df_app_train_align, errCol_list_train)

# In[27]:
# The function below takes the dataframe in question, the list of columns containing the erroneous value, and the
# erroneous value itself, and replaces those entries with NaNs.
df_app_train_align = eda.fill_errorVal_df(df_app_train_align, errCol_list_train, maxVal)
df_app_test_align = eda.fill_errorVal_df(df_app_test_align, errCol_list_test, maxVal)

# In[28]:
# Re-plot the affected columns now that the erroneous value has been replaced with NaN (an illustrative sketch of
# such a replacement helper follows this cell).
eda.plot_kde_hist_var(df_app_train_align, errCol_list_train, drawAll=False)
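# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original notebook): eda.fill_errorVal_df is
# a rand_eda helper, so its implementation is assumed here. The essential step is
# replacing the sentinel value with NaN in the affected columns; keeping a boolean
# anomaly flag per column is an optional extra shown for illustration.
def replace_sentinel_with_nan(df, col_list, sentinel):
    """Replace a sentinel/erroneous value with NaN and flag the affected rows."""
    for col in col_list:
        df[col + '_ANOM'] = df[col] == sentinel   # flag the anomalous rows
        df[col] = df[col].replace({sentinel: np.nan})
    return df
# ---------------------------------------------------------------------------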
# In[34]: Correlation analysis
# We now want to observe the correlations between our features and the target.
df_app_train_corr = df_app_train_align.corr()
df_app_train_corr.head()

# Once we have the correlation matrix, we focus on which features have the largest positive (negative) correlations
# with the target variable. We can see below that TARGET is positively correlated with the REGION_RATING_* variables,
# and negatively correlated with EXT_SOURCE_*, DAYS_BIRTH, and DAYS_EMPLOYED. This makes sense, as the probability of
# default should be lower if the applicant is older or has been employed for longer.
#
# NAME_INCOME_TYPE_Working and the REGION_RATING_* variables are categorical variables.

# In[35]:
df_app_train_corr_target = df_app_train_corr['TARGET'].sort_values()
print('+ve corr: \n{0}'.format(df_app_train_corr_target.tail(20)))
print('-ve corr: \n{0}'.format(df_app_train_corr_target.head(20)))

# ## Analyze features with greatest correlation magnitude
# At this point we know that features like age, time in employment, EXT_SOURCE_1, etc. impact the likelihood of
# default. We analyze the KDEs of these feature distributions, comparing those that defaulted against those that did
# not, to see whether we can extract any insightful information.

# In[36]:
# df_app_train_corr_target is sorted in ascending order, so head() holds the most negative correlations and the
# slice from the tail (excluding TARGET itself) holds the most positive ones.
var_neg_corr_list = df_app_train_corr_target.head(10).index.values
var_pos_corr_list = df_app_train_corr_target[-2:-10:-1].index.values
print(var_pos_corr_list)
print(var_neg_corr_list)

# We plot the KDEs of the features most positively (negatively) correlated with TARGET to evaluate whether the
# distributions differ between the default and no-default observations.
#
# If the distributions of a feature are very different for default and no default, this is good and we should look
# out for it. We can see that EXT_SOURCE_3 has the most distinct distributions between default and no default.

# In[37]:
eda.plot_kde_hist_var(df_app_train_align, var_pos_corr_list, drawAll=True)
eda.plot_kde_hist_var(df_app_train_align, var_neg_corr_list, drawAll=True)

# In[38]:
# ### Analyze DAYS_EMPLOYED
# We take DAYS_EMPLOYED and add a column converting it to YEARS_EMPLOYED. We then create a new column that bins each
# observation into fixed-width intervals; since there are about 50 employable years, we create 10 bins. Once each
# observation is in a bin, we can use a groupby to aggregate each set of observations.

# In[39]:
daysEmp_data = df_app_train_align[['TARGET', 'DAYS_EMPLOYED']].copy()  # copy to avoid SettingWithCopyWarning
daysEmp_data.loc[:, 'YEARS_EMPLOYED'] = daysEmp_data['DAYS_EMPLOYED'] / 365
daysEmp_data['YEARS_EMPLOYED'].hist()
daysEmp_data['YEARS_BINNED'] = pd.cut(daysEmp_data['YEARS_EMPLOYED'], bins=np.linspace(-50, 0, num=11))
daysEmp_data.head(10)
daysEmp_data['YEARS_BINNED'].unique()

# Once we do the groupby, we can see that the less time you have been employed, the more likely you are to default.

# In[40]:
daysEmp_group = daysEmp_data.groupby('YEARS_BINNED').mean()
daysEmp_group

# In[41]:
sns.barplot(daysEmp_group.index, daysEmp_group['TARGET'] * 100)
plt.xticks(rotation=60)
plt.ylabel('% default')
plt.xlabel('Days Employed Groups (Years)')

# In[42]:
eda.plot_kde_hist_var(df_app_train_align, ['DAYS_EMPLOYED'], drawAll=True)
# ### Analyzing credit scores
# We saw that the external sources had the highest correlations with TARGET, followed by DAYS_BIRTH and
# DAYS_EMPLOYED, so we take a closer look at these features and their interplay with TARGET.
#
# | Feature Name | Corr. with TARGET |
# | --- | --- |
# | EXT_SOURCE_3 | -0.178919 |
# | EXT_SOURCE_2 | -0.160472 |
# | EXT_SOURCE_1 | -0.155317 |
# | DAYS_BIRTH | -0.078239 |
# | DAYS_EMPLOYED | -0.074958 |

# In[43]:
df_ext_src = df_app_train_align[['TARGET', 'EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'DAYS_BIRTH']]  # 'DAYS_EMPLOYED'
df_ext_src_corr = df_ext_src.corr()
sns.heatmap(df_ext_src_corr, vmin=-1.0, vmax=1.0, annot=True)

# ### Additional graphical analysis for major features
# We want to create a pairplot and a PairGrid for a graphical analysis of the most important features of the dataset.
# As the original dataset is quite large, we first drop all rows that contain NaNs and then take a random sample of
# 5000 points.
#
# The pairplot is a 5x5 grid, as TARGET is explicitly included as one of the variables.

# In[44]:
df_ext_src.shape
df_ext_src_sample = df_ext_src.dropna().sample(5000)
sns.pairplot(df_ext_src_sample)

# We use a PairGrid to create a more informative plot. In this PairGrid, TARGET is denoted by the hue: orange is
# TARGET==1 (default) and blue is TARGET==0 (no default).
#
# The PairGrid can be read as follows:
#
# * Upper triangle: a scatter plot between the two variables on the X and Y axes, with TARGET as the hue.
# * Diagonal: a KDE plot of the distribution of each variable.
# * Lower triangle: a 2D KDE (density) plot.

# In[45]:
grid = sns.PairGrid(data=df_ext_src_sample, diag_sharey=True, hue='TARGET',
                    vars=[x for x in list(df_ext_src_sample.columns) if x != 'TARGET'])

# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal: KDE plots
grid.map_diag(sns.kdeplot)

# Lower triangle: 2D density plots
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r)

# In[100]: Store the processed dataframes for the next notebook
storeDir = fullDir + '/pickleshare'
db = pickleshare.PickleShareDB(storeDir)
db['df_app_test_align'] = df_app_test_align
db['df_app_train_align'] = df_app_train_align
db['df_app_train_corr_target'] = df_app_train_corr_target
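# ---------------------------------------------------------------------------
# Illustrative usage note (not part of the original notebook): PickleShareDB is a
# dict-like store backed by pickle files, so a follow-up notebook can reload the
# saved dataframes with the same keys.
db_reload = pickleshare.PickleShareDB(storeDir)
df_app_train_align_chk = db_reload['df_app_train_align']
print('Reloaded training dataset shape: {}'.format(df_app_train_align_chk.shape))
# ---------------------------------------------------------------------------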