# ml_kaggle-home-loan-credit-risk-model-logit.py (Source)

#!/usr/bin/env python
# coding: utf-8

# A simple yet effective tool for classification tasks is the **logit** model. This model is often used as a baseline approach, against which more sophisticated machine learning models can be compared to measure any improvement in performance.
# 
# <!-- TEASER_END -->

# # Loading in required modules

# In[1]:


# importing all system modules
import os
import sys
import warnings
from pathlib import Path
# Where the personal helper modules live on each supported platform.
_LISTING_DIRS = {
    'linux': '/home/randlow/github/blog2/listings/machine-learning/',  # linux
    'win32': '\\Users\\randl\\github\\blog2\\listings\\machine-learning\\',  # win32
}

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Extend the import search path only when the current platform has an entry.
_listing_dir = _LISTING_DIRS.get(sys.platform)
if _listing_dir is not None:
    sys.path.append(_listing_dir)

# importing data science modules
import pandas as pd
import numpy as np
import sklearn
import scipy as sp
import pickleshare

# importing graphics modules
import matplotlib.pyplot as plt
import seaborn as sns

# importing personal data science modules
import rand_eda as eda


# # Loading pickled dataframes
# 
# To see how the dataframes below were obtained, see the post on [Kaggle: Credit risk (Feature Engineering)](/posts/machine-learning/kaggle-home-loan-credit-risk-feat-eng/).
# 

# In[2]:


# Location of the pickleshare store:
#   <home>/datasets/kaggle/home-credit-default-risk/pickleshare
# The path is assembled with the running platform's own separator, so no
# per-platform branch is needed: `inputDir` is defined on every OS and no
# backslash-escaped string literal is required.
home = str(Path.home())

# Kept as a separator-prefixed fragment so that `home + inputDir` is still the
# dataset directory.
inputDir = os.sep + os.path.join('datasets', 'kaggle', 'home-credit-default-risk')

storeDir = os.path.join(home + inputDir, 'pickleshare')

# Open the pickleshare store rooted at `storeDir` and show which keys it holds.
db = pickleshare.PickleShareDB(storeDir)
print(db.keys())

# Load the aligned application dataframes (test first, then train).
df_app_test_align, df_app_train_align = (
    db[_key] for _key in ('df_app_test_align', 'df_app_train_align')
)

# Alternative feature sets, currently not loaded:
#df_app_train_align_expert  = db['df_app_train_align_expert'] 
#df_app_test_align_expert = db['df_app_test_align_expert'] 
#df_app_train_poly_align = db['df_app_train_poly_align']
#df_app_test_poly_align = db['df_app_test_poly_align'] 


# # Selection of feature set for model training & testing

# Assign whichever datasets you want to `train` and `test`. As part of feature engineering you will often build new and different feature datasets, and you will want to test each one to evaluate whether it improves model performance.
# 
# As the imputer is being fitted on the training data and used to transform both the training and test datasets, the training data needs to have the same number of features as the test dataset.  This means that the `TARGET` column must be removed from the training dataset.

# In[3]:


# Work on copies so the loaded dataframes are left untouched.
test = df_app_test_align.copy()
train = df_app_train_align.copy()

# Split the label column off the training features: pop() returns the
# 'TARGET' column and removes it from `train` in one step.
train_labels = train.pop('TARGET')


# # Feature set preprocessing

# In[4]:


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Replace NaN entries with the column median.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)

# Rescale every feature into the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))


# We fit the imputer and scaler on the training data, and perform the imputer and scaling transformations on both the training and test datasets.

# In[5]:


# Run the preprocessing steps in order (imputation, then scaling). Each step
# is fitted on the training data only and then applied to both datasets, so
# the scaler is fitted on the already-imputed training data.
for _step in (imputer, scaler):
    train = _step.fit_transform(train)
    test = _step.transform(test)


# # Model implementation ([Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html))

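# We initialize the logistic regression with a small value of `C` (i.e., C=0.0001). In scikit-learn, `C` is the inverse of the regularization strength, so a small `C` means strong regularization, which helps prevent overfitting and should improve out-of-sample performance (i.e., performance on the test dataset).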

# In[6]:


from sklearn.linear_model import LogisticRegression

# Build the classifier with C=0.0001 and fit it on the preprocessed training
# features; fit() returns the estimator itself, so the fitted model is bound
# to `log_reg` directly.
log_reg = LogisticRegression(C=1e-4).fit(train, train_labels)


# We take the 2nd column (i.e., [:,1]) of the logistic model's predicted probabilities, as we are interested in the probability that a loan is in default (i.e., `TARGET`=1).

# In[7]:


# Per-class probabilities for every test row; keep only the second column.
_class_probs = log_reg.predict_proba(test)
log_reg_pred = _class_probs[:, 1]


# Create the submission dataframe. We check that it has the type of data expected by the Kaggle competition submission requirements and the right number of rows.

# In[8]:


# Two-column submission table: the test dataframe's index values as
# SK_ID_CURR, paired with the predicted probabilities as TARGET.
submit = pd.DataFrame({
    'SK_ID_CURR': df_app_test_align.index,
    'TARGET': log_reg_pred,
})

# Quick visual check of the first rows and the overall (rows, columns) shape.
for _summary in (submit.head(), submit.shape):
    print(_summary)


# We create a csv of our model output, and submit it to Kaggle.

# # Kaggle submission

# In[9]:


import subprocess

# Write the submission table to CSV without the index column.
submit.to_csv('logit-home-loan-credit-risk.csv', index=False)

# Upload the file with the kaggle command-line tool. The command is run through
# subprocess with an argument list, so it works in a plain Python interpreter
# (no IPython shell object required) and needs no shell quoting.
_submit_cmd = [
    'kaggle', 'competitions', 'submit',
    '-c', 'home-credit-default-risk',
    '-f', 'logit-home-loan-credit-risk.csv',
    '-m', 'submitted',
]
try:
    _submit_run = subprocess.run(
        _submit_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
except FileNotFoundError:
    # Keep the script going when the CLI is not installed, instead of crashing.
    print('kaggle command not found on PATH; submission skipped')
else:
    # Echo whatever the tool reported (stdout and stderr combined).
    print(_submit_run.stdout)


# The submission to Kaggle scored 0.6623 on the test dataset (the competition is scored by area under the ROC curve), which is better than the 0.5 expected from random guessing! Let's try a more sophisticated model.

# In[10]:


import subprocess

# List this account's submissions for the competition via the kaggle CLI.
# Run through subprocess so the line also works outside an IPython session.
try:
    _list_run = subprocess.run(
        ['kaggle', 'competitions', 'submissions', '-c', 'home-credit-default-risk'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
except FileNotFoundError:
    # Keep the script going when the CLI is not installed, instead of crashing.
    print('kaggle command not found on PATH; cannot list submissions')
else:
    # Echo whatever the tool reported (stdout and stderr combined).
    print(_list_run.stdout)


# # Converting iPython notebook to Python code
# 
# This allows us to run the code in Spyder.

# In[14]:


import subprocess

# Export the notebook to a Python script with jupyter nbconvert.
# Run through subprocess so the line also works outside an IPython session.
# The --output-dir value is passed exactly as a POSIX shell would pass the
# original single-quoted argument, i.e. with a literal '~'.
# NOTE(review): whether nbconvert itself expands '~' is not visible here - confirm.
_nbconvert_cmd = [
    'jupyter', 'nbconvert',
    'ml_kaggle-home-loan-credit-risk-model-logit.ipynb',
    '--to', 'script',
    '--output-dir=~/github/blog2/listings',
]
try:
    _nbconvert_run = subprocess.run(
        _nbconvert_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
except FileNotFoundError:
    # Keep the script going when jupyter is not installed, instead of crashing.
    print('jupyter command not found on PATH; notebook not converted')
else:
    # Echo whatever the tool reported (stdout and stderr combined).
    print(_nbconvert_run.stdout)


# In[ ]: