# ml_kaggle-home-loan-credit-risk-model-random-forest.py (Source)

  #!/usr/bin/env python
# coding: utf-8

# A commonly used model for exploring classification problems is the
# **random forest classifier**.
#
# It is called a random forest because it is an ensemble (i.e., multiple) of
# decision trees that are merged to obtain a more accurate and stable
# prediction. Random forests lead to less overfitting compared to a single
# decision tree, especially if there are sufficient trees in the forest. It is
# also called 'random' because a random subset of features is considered by the
# algorithm each time a node is being split. In addition, where a decision tree
# uses the best possible thresholds for splitting a node, you can use a random
# threshold in a random forest.

# Random forests are ideal as a predictive tool, and not a descriptive tool.
# A decision tree is more suitable if you are evaluating relationships within
# the data.
#
# Random forests are usually trained using the "bagging" approach (i.e.,
# bootstrap aggregation). Given an initial training dataset $D$ of size $n$,
# bagging generates $m$ new datasets $D_i$, each of size $n$, by sampling from
# $D$ uniformly with replacement. Thus, $m$ models can be fitted on the $m$ new
# datasets that have been created from the initial training dataset $D$ via
# bootstrapping with replacement. These $m$ models are then combined by
# averaging the output (i.e., regression) or voting (i.e., classification).
#
# Random forests are also useful because it is possible to measure the relative
# importance of each feature on the prediction. This is performed by analyzing
# a feature's importance based on how often tree nodes use that feature, and
# how many trees use it. Understanding which features are important allows us
# to drop those that add little or no value to our classification problem.

# # Loading in required modules

# In[ ]:

# importing all system modules
import os
import sys
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

# Make the personal helper modules (e.g. rand_eda) importable. This has to run
# before `import rand_eda` below, which is why the imports are not all grouped
# at the very top of the file.
if sys.platform == 'linux':
    sys.path.append('/home/randlow/github/blog2/listings/machine-learning/')  # linux
elif sys.platform == 'win32':
    sys.path.append('\\Users\\randl\\github\\blog2\\listings\\machine-learning\\')  # win32

# importing data science modules
import pandas as pd
import numpy as np
import sklearn
import scipy as sp
import pickleshare
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# importing graphics modules
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh as bk

# importing personal data science modules
import rand_eda as eda


# # Loading pickled dataframes
#
# To see how the below dataframes were obtained see the post on the
# [Kaggle: Credit risk (Feature Engineering)](/posts/machine-learning/kaggle-home-loan-credit-risk-feat-eng/)

# In[ ]:

home = str(Path.home())

# Build the store location from path components instead of concatenating
# platform-specific strings: the former Windows literal contained invalid
# escape sequences (\d, \k, \h), and platforms other than linux/win32 left
# `inputDir` undefined.
inputDir = Path('datasets', 'kaggle', 'home-credit-default-risk')
storeDir = str(Path(home) / inputDir / 'pickleshare')

db = pickleshare.PickleShareDB(storeDir)
print(db.keys())

df_app_test_align = db['df_app_test_align']
df_app_train_align = db['df_app_train_align']
#df_app_train_align_expert = db['df_app_train_align_expert']
#df_app_test_align_expert = db['df_app_test_align_expert']
#df_app_train_poly_align = db['df_app_train_poly_align']
#df_app_test_poly_align = db['df_app_test_poly_align']


# # Selection of feature set for model training & testing

# Assign whichever datasets you want to train and test. As part of feature
# engineering, you will often build new and different feature datasets and
# would like to test each one out to evaluate whether it improves model
# performance.
#
# As the imputer is fitted on the training data and used to transform both the
# training and test datasets, the training data needs to have the same number
# of features as the test dataset. This means that the TARGET column must be
# removed from the training dataset, and stored in train_labels for use later.

# In[ ]:

train = df_app_train_align.copy()
test = df_app_test_align.copy()

train_labels = train.pop('TARGET')
# Capture the column names now: `train` is replaced by a plain array below.
features = list(train.columns)


# # Feature set preprocessing

# In[ ]:

imputer = SimpleImputer(missing_values=np.nan, strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))


# We fit the imputer and scaler on the training data, and perform the imputer
# and scaling transformations on both the training and test datasets.
#
# Scikit-learn models only accept arrays. The imputer and scaler accept
# DataFrames as inputs and output the train and test variables as arrays for
# use in Scikit-Learn's machine learning models.

# In[ ]:

imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)


# In[ ]:

train.shape


# # Model implementation ([Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html))

# In this implementation of random forest, we are using 100 trees
# (n_estimators=100) and all processors (n_jobs=-1).

# In[ ]:

rf = RandomForestClassifier(n_estimators=100, random_state=10, verbose=1, n_jobs=-1)


# In[ ]:

rf.fit(train, train_labels)


# ## Exploring random forest feature importances

# Decision trees are non-parametric supervised learning models that infer the
# value of a target variable by analyzing decision rules from the features of
# the dataset. Since the random forest consists of many decision trees, a
# random forest can be used to identify the most important features for
# predicting the target variable by analyzing, across all the trees, which
# features are used to split nodes.
#
# We can see here that our random forest selected EXT_SOURCE_2/3, DAYS_BIRTH as
# the top 3 most important features. These feature importances produced by the
# random forest can be used for further feature engineering and for culling
# features that are of low importance (e.g., FLAG_DOCUMENT_x).

# In[ ]:

feat_importance_values = rf.feature_importances_
df_feat_importance = pd.DataFrame({'Feature': features, 'Importance': feat_importance_values})
eda.plot_feat_importance(df_feat_importance)


# We apply our fitted random forest model to predict the TARGET outcomes from
# the test dataset.

# In[ ]:

# Column 1 of predict_proba is taken as the probability submitted for TARGET.
rf_pred = rf.predict_proba(test)[:, 1]


# # Kaggle submission

# We create the submission dataframe as per the Kaggle
# home-credit-default-risk competition guidelines.

# In[ ]:

submit = pd.DataFrame()
# NOTE(review): assumes the index of df_app_test_align holds SK_ID_CURR -
# confirm against the feature-engineering post.
submit['SK_ID_CURR'] = df_app_test_align.index
submit['TARGET'] = rf_pred
print(submit.head())
print(submit.shape)


# Submit the csv file to Kaggle for scoring.

# In[ ]:

# NOTE: get_ipython() is only defined inside an IPython/Jupyter session, so the
# shell-out cells below raise NameError under a plain `python` interpreter.
submit.to_csv('random-forest-home-loan-credit-risk.csv', index=False)
get_ipython().system("kaggle competitions submit -c home-credit-default-risk -f random-forest-home-loan-credit-risk.csv -m 'submitted'")


# We review our random forest scores from Kaggle and find that there is a
# slight improvement to 0.687 compared to 0.662 based upon the logit model
# (publicScore). We will try other feature-engineered datasets and other more
# sophisticated machine learning models in the next posts.

# In[ ]:

get_ipython().system('kaggle competitions submissions -c home-credit-default-risk')


# # Converting iPython notebook to Python code
#
# This allows us to run the code in Spyder.

# In[ ]:

get_ipython().system("jupyter nbconvert ml_kaggle-home-loan-credit-risk-model-random-forest.ipynb --output-dir='~/github/blog2/listings/machine-learning/' --to python")