#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 29 22:15:16 2019

@author: randlow
"""

import sys
import pandas as pd
import pickleshare as pkl
import os
from sklearn import preprocessing  # used below for Imputer and PolynomialFeatures


from pathlib import Path

home = str(Path.home())
if sys.platform == 'linux':
    inputDir = "/datasets/kaggle/home-credit-default-risk" # linux
elif sys.platform == 'win32':
    inputDir = "\datasets\kaggle\home-credit-default-risk" # windows

fullDir = home+inputDir
os.chdir(fullDir)

storeDir = fullDir+'/pickleshare'

db = pkl.PickleShareDB(storeDir)

# In[50]"
print(db.keys())
x1 = db['df_app_test_align']

# Based on our earlier empirical analysis of the correlations between the target variable and the feature variables, we can now perform feature engineering. Feature engineering typically involves operations such as:
# 
# 1. Polynomial features: all interactions and powers of each feature variable.
# 2. Expert knowledge features: new variables constructed from domain insight about the problem (see the sketch after the data is loaded below).
# In[46]:

df_app_train = pd.read_csv('application_train.csv',index_col=0) # SK_ID_CURR (the first column) becomes the index
df_app_test = pd.read_csv('application_test.csv',index_col=0)
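
# As an illustration of item 2 above (expert knowledge features), a minimal sketch,
# assuming the standard Home Credit application columns AMT_CREDIT, AMT_INCOME_TOTAL,
# AMT_ANNUITY, DAYS_EMPLOYED and DAYS_BIRTH are present; these ratios are not used
# further in this script.
df_domain = df_app_train.copy()
df_domain['CREDIT_INCOME_PERCENT'] = df_domain['AMT_CREDIT'] / df_domain['AMT_INCOME_TOTAL']   # loan size relative to income
df_domain['ANNUITY_INCOME_PERCENT'] = df_domain['AMT_ANNUITY'] / df_domain['AMT_INCOME_TOTAL'] # annual repayment relative to income
df_domain['DAYS_EMPLOYED_PERCENT'] = df_domain['DAYS_EMPLOYED'] / df_domain['DAYS_BIRTH']      # fraction of life spent employed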


# In[47]:
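
# `var_pos_corr` comes from the correlation analysis in the earlier notebook of this
# series and is not defined in this script. A minimal sketch to rebuild it here,
# assuming it holds the feature names ordered by their positive correlation with
# TARGET (strongest first):
target_corr = df_app_train.select_dtypes('number').corr()['TARGET'].sort_values(ascending=False)
var_pos_corr = target_corr.drop('TARGET').index.tolist()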


print(var_pos_corr[0:10])

imp_var = var_pos_corr[0:4]
print(imp_var)


# In[48]:


poly_features_train = df_app_train[imp_var]
poly_features_test = df_app_test[imp_var]

poly_target_train = df_app_train['TARGET']

poly_features_train.columns


# ## Imputing NaN points

# In[49]:


imputer = preprocessing.Imputer(strategy='median')
poly_features_train = imputer.fit_transform(poly_features_train) # fit learns the column medians on the training data, then applies them
poly_features_test = imputer.transform(poly_features_test) # transform only, so the medians fitted on the training data are reused
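
# Note: `preprocessing.Imputer` was removed in scikit-learn 0.22. On newer versions,
# an equivalent sketch uses `SimpleImputer` from `sklearn.impute`:
#
#   from sklearn.impute import SimpleImputer
#   imputer = SimpleImputer(strategy='median')
#   poly_features_train = imputer.fit_transform(poly_features_train)
#   poly_features_test = imputer.transform(poly_features_test)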


# ## Creating polynomial features

# In[50]:


poly_transformer = preprocessing.PolynomialFeatures(degree=3)

poly_transformer.fit(poly_features_train)

poly_features_train = poly_transformer.transform(poly_features_train)
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial features: {}'.format(poly_features_train.shape))


# In[51]:


poly_transformer.get_feature_names()[:15]


# In[52]:


poly_transformer.get_feature_names(input_features=imp_var)
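
# Note: on scikit-learn >= 1.0, `get_feature_names` is deprecated (and removed in 1.2)
# in favour of `get_feature_names_out`, e.g.:
#
#   poly_transformer.get_feature_names_out(input_features=imp_var)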


# Since we now have a bigger dataset with additional artificially created features, let's evaluate whether these new features are more strongly correlated with the target than the original set of features. `poly_features_train` is a NumPy array, so we need to create a DataFrame out of it and add the `TARGET` column to it.

# In[53]:


df_poly_features_train = pd.DataFrame(poly_features_train, columns = poly_transformer.get_feature_names(input_features=imp_var))
df_poly_features_train['TARGET'] = poly_target_train


# In[54]:


poly_corrs = df_poly_features_train.corr()['TARGET'].sort_values()
print('+ve correlations:\n{}'.format(poly_corrs.tail(20)))
print('-ve correlations:\n{}'.format(poly_corrs.head(20)))


# `df_poly_features_train` now contains the polynomial features and the `TARGET` column. To merge it back into the full training dataset we also need a `SK_ID_CURR` key; note that `SK_ID_CURR` is the index of `df_app_train`, not a column.

# In[59]:


df_app_train.index.values


# In[56]:


df_poly_features_train['SK_ID_CURR'] = df_app_train.index.values # SK_ID_CURR is the index of df_app_train, not a column
df_app_poly_train = df_app_train.reset_index().merge(df_poly_features_train, on = 'SK_ID_CURR', how='left')
# Note: columns present in both frames (e.g. TARGET and the original imp_var columns) are suffixed with _x/_y by the merge.


# In[ ]:





# ## Add new features to the `test` dataset
# 

# In[ ]:


df_poly_features_test = pd.DataFrame(poly_features_test, columns = poly_transformer.get_feature_names(input_features=imp_var))
df_poly_features_test['SK_ID_CURR'] = df_app_test.index.values # SK_ID_CURR is the index of df_app_test, not a column
df_app_poly_test = df_app_test.reset_index().merge(df_poly_features_test, on='SK_ID_CURR', how='left')


# ## Align the `train` and `test` datasets

# In[ ]:


# join='inner' on axis=1 keeps only the columns present in both frames, so train-only columns (e.g. TARGET) are dropped; save a copy of TARGET first if it is needed for modelling.
df_app_poly_train, df_app_poly_test = df_app_poly_train.align(df_app_poly_test, join='inner', axis=1)



# In[61]:


get_ipython().system('jupyter nbconvert --to script ml_kaggle_home-loan-credit-risk.ipynb')


# In[ ]: