# ml_kaggle-home-loan-credit-risk-model-logit.py (Source)

  #!/usr/bin/env python
# coding: utf-8

# A simple yet effective tool for classification tasks is the **logit** model.
# This model is often used as a baseline approach, and more sophisticated
# machine learning models can then be compared against it to evaluate the
# performance improvements.
#
# # Loading in required modules

# In[1]:

# importing all system modules
import os
import subprocess
import sys
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

# Make the personal helper modules (e.g. rand_eda) importable on the author's
# machines; other platforms rely on the module already being on sys.path.
if sys.platform == 'linux':
    sys.path.append('/home/randlow/github/blog2/listings/machine-learning/')  # linux
elif sys.platform == 'win32':
    sys.path.append('\\Users\\randl\\github\\blog2\\listings\\machine-learning\\')  # win32

# importing data science modules
import pandas as pd
import numpy as np
import sklearn
import scipy as sp
import pickleshare
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# importing graphics modules
import matplotlib.pyplot as plt
import seaborn as sns

# importing personal data science modules
import rand_eda as eda


def _run_shell(command):
    """Run a shell command from IPython/Jupyter or from a plain interpreter.

    Inside IPython the command goes through ``get_ipython().system`` exactly
    as the notebook export did, so its output still shows up in the cell.
    Outside IPython ``get_ipython`` does not exist, so the command is handed
    to the system shell instead of crashing with ``NameError``.

    Args:
        command: The complete command line as a single string.
    """
    try:
        ipython = get_ipython()  # noqa: F821 - injected by IPython at runtime
    except NameError:
        ipython = None
    if ipython is None:
        # Constant, trusted command strings only; a non-zero exit status is
        # not raised, matching the behaviour of IPython's ``system``.
        subprocess.run(command, shell=True, check=False)
    else:
        ipython.system(command)


# # Loading pickled dataframes
#
# To see how the below dataframes were obtained see the post on the
# [Kaggle: Credit risk (Feature Engineering)](/posts/machine-learning/kaggle-home-loan-credit-risk-feat-eng/)

# In[2]:

home = str(Path.home())
# Built from path components so that the same code works on every platform
# and no backslash escape sequences are needed for Windows.
inputDir = Path('datasets', 'kaggle', 'home-credit-default-risk')
storeDir = str(Path(home) / inputDir / 'pickleshare')

db = pickleshare.PickleShareDB(storeDir)
print(db.keys())

df_app_test_align = db['df_app_test_align']
df_app_train_align = db['df_app_train_align']
# Alternative feature sets stored in the same database:
#df_app_train_align_expert = db['df_app_train_align_expert']
#df_app_test_align_expert = db['df_app_test_align_expert']
#df_app_train_poly_align = db['df_app_train_poly_align']
#df_app_test_poly_align = db['df_app_test_poly_align']


# # Selection of feature set for model training & testing
# Assign whichever datasets you want to train and test. As part of feature
# engineering you will often build new and different feature datasets and
# would like to test each one out to evaluate whether it improves model
# performance.
#
# As the imputer is fitted on the training data and used to transform both
# the training and test datasets, the training data needs to have the same
# number of features as the test dataset. This means that the TARGET column
# must be removed from the training dataset.

# In[3]:

train = df_app_train_align.copy()
test = df_app_test_align.copy()

train_labels = train['TARGET']
train = train.drop(columns=['TARGET'])


# # Feature set preprocessing

# In[4]:

imputer = SimpleImputer(missing_values=np.nan, strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))


# We fit the imputer and scaler on the training data only, and apply the
# imputing and scaling transformations to both the training and test
# datasets.

# In[5]:

imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)


# # Model implementation ([Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html))
# We initialize the logistic regression with a small value of C (C=0.0001).
# In scikit-learn C is the *inverse* of the regularization strength, so a
# small C means strong regularization. This is intended to prevent
# overfitting and so improve out-of-sample performance (i.e., performance on
# the test dataset).

# In[6]:

log_reg = LogisticRegression(C=0.0001)
log_reg.fit(train, train_labels)


# We take the 2nd column (i.e., [:,1]) of the logistic model predictions as
# we are interested in predicting when the loans are in default
# (i.e., Target=1).

# In[7]:

log_reg_pred = log_reg.predict_proba(test)[:, 1]


# Create the submission dataframe. We check to make sure it has the right
# type of data as expected by the Kaggle competition submission requirements
# and the right number of rows.

# In[8]:

submit = pd.DataFrame()
# NOTE(review): assumes the test dataframe is indexed by SK_ID_CURR - confirm
# against the feature-engineering step that pickled it.
submit['SK_ID_CURR'] = df_app_test_align.index
submit['TARGET'] = log_reg_pred
print(submit.head())
print(submit.shape)


# We create a csv of our model output, and submit it to Kaggle.
#
# # Kaggle submission

# In[9]:

submit.to_csv('logit-home-loan-credit-risk.csv', index=False)
_run_shell("kaggle competitions submit -c home-credit-default-risk -f logit-home-loan-credit-risk.csv -m 'submitted'")


# The submission to Kaggle indicated that the predictive power on the test
# dataset was 0.6623 (66%) which is better than a 50-50 chance! Let's try a
# more sophisticated model.

# In[10]:

_run_shell('kaggle competitions submissions -c home-credit-default-risk')


# # Converting iPython notebook to Python code
#
# This allows us to run the code in Spyder.

# In[14]:

_run_shell("jupyter nbconvert ml_kaggle-home-loan-credit-risk-model-logit.ipynb --to script --output-dir='~/github/blog2/listings'")


# In[ ]: