# ml_kaggle-home-loan-credit-risk-feateng-checkpoint.py (Source)

  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Jan 29 22:15:16 2019 @author: randlow """ import sys import pandas as pd import pickleshare as pkl import os from pathlib import Path home = str(Path.home()) if sys.platform == 'linux': inputDir = "/datasets/kaggle/home-credit-default-risk" # linux elif sys.platform == 'win32': inputDir = "\datasets\kaggle\home-credit-default-risk" # windows fullDir = home+inputDir os.chdir(fullDir) storeDir = fullDir+'/pickleshare' db = pkl.PickleShareDB(storeDir) # In[50]" print(db.keys()) x1 = db['df_app_test_align'] # Based on our empirical analysis of just analyzing correlations between our target variable and the feature variables, we can perform feature engineering. Typically feature engineering means we perform operations such as: # # 1. Polynomial features: THis includes all interactions and powers of each feature variable # 2. 
# Expert knowledge features:

# In[46]:

# application_{train,test}.csv use their first column (SK_ID_CURR) as the
# DataFrame index via index_col=0.
df_app_train = pd.read_csv('application_train.csv', index_col=0)
df_app_test = pd.read_csv('application_test.csv', index_col=0)

# In[47]:

# NOTE(review): `var_pos_corr` is not defined anywhere in this script — it
# presumably comes from the correlation-analysis notebook / the pickleshare
# store loaded above. Confirm it is in scope before this point.
print(var_pos_corr[0:10])
imp_var = var_pos_corr[0:4]  # top-4 most positively correlated features
print(imp_var)

# In[48]:

poly_features_train = df_app_train[imp_var]
poly_features_test = df_app_test[imp_var]
poly_target_train = df_app_train['TARGET']
poly_features_train.columns

# ## Imputing NaN points

# In[49]:

# BUG FIX: `preprocessing` was referenced below but never imported anywhere
# in this file, which raises NameError at runtime.
# NOTE(review): `preprocessing.Imputer` was removed in scikit-learn 0.22;
# on modern scikit-learn, switch to `sklearn.impute.SimpleImputer`.
from sklearn import preprocessing

imputer = preprocessing.Imputer(strategy='median')
# Fitting finds each column's median on the training data, then applies it.
poly_features_train = imputer.fit_transform(poly_features_train)
# Only transform the test set, so it reuses the medians fit on training data.
poly_features_test = imputer.transform(poly_features_test)

# ## Creating polynomial features

# In[50]:

poly_transformer = preprocessing.PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features_train)
poly_features_train = poly_transformer.transform(poly_features_train)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial features: {}'.format(poly_features_train.shape))

# In[51]:

poly_transformer.get_feature_names()[:15]

# In[52]:

poly_transformer.get_feature_names(input_features=imp_var)

# Since we now have a bigger dataset with additional artificially created
# features, let's evaluate if these new features have higher correlation
# than the original set of features. poly_features_train is an array, so we
# need to create a DataFrame out of it, and add the Target column to it.
# In[53]:

df_poly_features_train = pd.DataFrame(
    poly_features_train,
    columns=poly_transformer.get_feature_names(input_features=imp_var))
df_poly_features_train['TARGET'] = poly_target_train

# In[54]:

poly_corrs = df_poly_features_train.corr()['TARGET'].sort_values()
print('+ve correlations:\n{}'.format(poly_corrs.tail(20)))
print('-ve correlations:\n{}'.format(poly_corrs.head(20)))

# Now that we have a new df_poly_features_train that includes all the
# existing features and polynomial features, we need to add in the
# SK_ID_CURR column too.

# In[59]:

df_app_train.index.values

# In[56]:

# BUG FIX: SK_ID_CURR is the *index* of df_app_train (the CSVs were read
# with index_col=0), not a column — `df_app_train['SK_ID_CURR']` raises
# KeyError. Pull the ids off the index; `.values` also avoids spurious
# index alignment against df_poly_features_train's default RangeIndex.
df_poly_features_train['SK_ID_CURR'] = df_app_train.index.values

# BUG FIX: this merge was commented out, so the train-side merged frame was
# never created, while the test side below *was* merged — the align step
# then failed on a missing name. reset_index() turns the SK_ID_CURR index
# back into a column so we can merge on it.
df_app_poly_train = df_app_train.reset_index().merge(
    df_poly_features_train, on='SK_ID_CURR', how='left')

# ## Add new features to the test dataset

# In[ ]:

# BUG FIX: was `pf.DataFrame` — `pf` is undefined; the pandas alias in this
# script is `pd`.
df_poly_features_test = pd.DataFrame(
    poly_features_test,
    columns=poly_transformer.get_feature_names(input_features=imp_var))
# Same index-vs-column fix as on the train side above.
df_poly_features_test['SK_ID_CURR'] = df_app_test.index.values
df_app_poly_test = df_app_test.reset_index().merge(
    df_poly_features_test, on='SK_ID_CURR', how='left')

# ## Align the train and test datasets

# In[ ]:

# BUG FIX: was `app_train_poly_train.align(...)` — an undefined name; the
# merged train frame is `df_app_poly_train`, created above. Inner join on
# axis=1 keeps only the columns present in both frames.
df_app_poly_train, df_app_poly_test = df_app_poly_train.align(
    df_app_poly_test, join='inner', axis=1)

# In[61]:

get_ipython().system('jupyter nbconvert --to script ml_kaggle_home-loan-credit-risk.ipynb')

# In[ ]: