rand_eda.py (Source)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 17:06:06 2019

This code consists of helper functions to perform Exploratory Data Analysis (EDA)

@author: randlow
"""

import pandas as pd
import matplotlib.pyplot as plt
import sys
import seaborn as sns
import scipy as sp
import scipy.stats  # importing scipy alone does not load submodules, so load stats explicitly
import numpy as np
import shelve
from sklearn import preprocessing


'''
Flattens a two-level column multi-index into a single level by joining the level
values with underscores.

Inputs
------
    df (dataframe): a df with multi-index columns

Outputs
-------
    df (dataframe): returns df with single-index columns

'''
def flatten_multi_index(df):

    # join each (level0, level1) column tuple into a single underscore-separated name,
    # iterating over df.columns directly so the new names stay aligned with the data
    df.columns = ['_'.join(map(str, col)) for col in df.columns]

    return df

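# Hypothetical usage sketch for flatten_multi_index (the dataframe contents below are
# made up for illustration):
#
#   df = pd.DataFrame({'id': [1, 1, 2], 'amt': [10, 20, 30]})
#   df_agg = df.groupby('id').agg(['mean', 'sum'])
#   # df_agg.columns is a MultiIndex: [('amt', 'mean'), ('amt', 'sum')]
#   df_agg = flatten_multi_index(df_agg)
#   # df_agg.columns is now ['amt_mean', 'amt_sum']
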
'''
Performs aggregation statistics on the numeric columns of a dataset and
collapses the resulting multi-index columns into a single index.

Inputs
------
    df (dataframe): dataframe to perform aggregation operations on
    groupby_var (string): the column to group by
    drop_var_list (list): list of columns to drop. These are columns that add no value when aggregation operations are applied (e.g., ID columns)
    df_name (string): prefix added to the aggregated column names

Outputs
-------
    df_agg (dataframe): a dataframe with flattened single-index columns and aggregate statistics across all numeric columns
'''
def agg_num(df, groupby_var, drop_var_list=list(), df_name=''):
    # drop columns that add no value when aggregated (e.g. ID columns)
    df = df.drop(columns=[col for col in drop_var_list if col in df.columns])

    # extract the groupby ID variable, select all numeric columns, then add back
    # the groupby ID variable
    groupby_ids = df[groupby_var]
    df_numeric = df.select_dtypes('number').copy()
    df_numeric[groupby_var] = groupby_ids

    # group the dataframe and compute aggregate statistics for each numeric column
    df_agg = df_numeric.groupby(groupby_var).agg(['mean','median','sum','count','max','min'])

    # flatten the multi-index column names to a single index and prefix them with the
    # dataframe name (important if this data is later merged into a training dataframe)
    df_agg = flatten_multi_index(df_agg)
    df_agg = df_agg.add_prefix('{}_'.format(df_name))

    return df_agg

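# Hypothetical usage sketch for agg_num, assuming a secondary dataframe `bureau` keyed
# by a client ID column 'SK_ID_CURR' and carrying a row ID column 'SK_ID_BUREAU'
# (all names are illustrative):
#
#   bureau_agg = agg_num(bureau, groupby_var='SK_ID_CURR',
#                        drop_var_list=['SK_ID_BUREAU'], df_name='bureau')
#   # one row per SK_ID_CURR with columns such as 'bureau_<numeric col>_mean', ..._max, etc.
#   train = train.merge(bureau_agg, left_on='SK_ID_CURR', right_index=True, how='left')
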
'''
Performs count statistics ('sum' and 'mean') on one-hot encoded categorical columns
of a dataset and collapses the resulting multi-index columns into a single index.

Inputs
------
    df (dataframe): input dataframe
    groupby_var (string): name of variable to group by
    df_name (string): prefix added to the aggregated column names

Outputs
-------
    df_cat_agg (dataframe): output dataframe

'''

def cnt_cat(df,groupby_var,df_name):

    groupby_ids = df[groupby_var]
    df_cat = pd.get_dummies(df.select_dtypes('object'))
    df_cat[groupby_var] = groupby_ids
    df_cat_agg = df_cat.groupby(groupby_var).agg(['sum','mean'])

    df_cat_agg = flatten_multi_index(df_cat_agg)
    df_cat_agg = df_cat_agg.add_prefix('{}_'.format(df_name))

    return df_cat_agg

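# Hypothetical usage sketch for cnt_cat, using the same illustrative dataframe as above:
#
#   bureau_cat_agg = cnt_cat(bureau, groupby_var='SK_ID_CURR', df_name='bureau')
#   # per client ID, '<dummy col>_sum' counts how often each categorical value occurs
#   # and '<dummy col>_mean' gives its proportion, each prefixed with 'bureau_'
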
'''
Prints detailed information for each categorical (object) column of the dataframe:
the number of unique entries, the unique entry names, and the value counts.

Inputs
------
    df (dataframe): input dataframe

Outputs
-------
    None

'''
def extract_cat_var(df):

    cat_colnames = list(df.select_dtypes('object').columns)

    for col in cat_colnames:
      print('Categorical column: {}\n'.format(col))
      print('Number of unique entries: {}\n'.format(df[col].nunique()))
      print('Unique entry names:\n{}\n'.format(df[col].unique()))
      print('Value counts of each entry:\n{}\n'.format(df[col].value_counts(dropna=False)))
      print('---------------------------------------------------------')

    return

'''
Label encodes all object columns that have at most `cat_limit` unique entries
(default of 2, i.e. binary categories).
'''

def label_encoding_df(df,cat_limit = 2):

    le = preprocessing.LabelEncoder()
    le_count = 0
    label_encode_list = []
    for col in df:
        if df[col].dtype=='object':
            if df[col].nunique(dropna=False) <= cat_limit:
                print(col)
                le_count += 1
                le.fit(df[col])
                df[col] = le.transform(df[col])
                label_encode_list.append(col)

    print('{0} columns were label encoded'.format(le_count))

    return df, label_encode_list

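# Hypothetical usage sketch for label_encoding_df: encode binary object columns (e.g.
# 'Y'/'N' flags) and keep track of which columns were encoded:
#
#   train, encoded_cols = label_encoding_df(train, cat_limit=2)
#   # the remaining object columns can then be one-hot encoded with pd.get_dummies(train)
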
'''
Given a dataframe and a list of feature variables:
* Produces the KDE and histogram plots for the Target=True and Target=False populations.
* Reports statistical differences (two-sample Kolmogorov-Smirnov test) between both populations.

Use this function to graphically evaluate whether certain feature variables exhibit
different characteristics for the Target=True and Target=False populations.
'''
def plot_kde_hist_var(df,varList,calcStat = True, drawAll = False):
    numVar = len(varList)

    plt.figure(figsize=(10,numVar*4))
    ks_stat_list = []
    ks_pval_list = []
    try:
        for i,var in enumerate(varList):
            tgt_true = df.loc[df['TARGET']==1,var]
            tgt_false = df.loc[df['TARGET']==0,var]

            # calculate the statistical significance of the difference between both
            # populations with a two-sample Kolmogorov-Smirnov test
            if calcStat:
                (ks_stat,ks_pval) = sp.stats.ks_2samp(tgt_true,tgt_false)
                ks_stat_list.append(ks_stat)
                ks_pval_list.append(ks_pval)

            # print the median of each population and the correlation with the target
            median_tgt_true = tgt_true.median()
            median_tgt_false = tgt_false.median()
            corrVal = df['TARGET'].corr(df[var])
            print('Median Value of {} when Target (True): {:.6f}'.format(var,median_tgt_true))
            print('Median Value of {} when Target (False): {:.6f}'.format(var,median_tgt_false))
            print('Pearson Correlation of {} with Target: {:.6f}'.format(var,corrVal))

            # draw the KDE/histogram distributions (sns.distplot requires NaNs to be
            # dropped; distplot is deprecated in newer seaborn versions but is kept here
            # to preserve the original behaviour)
            tgt_true.dropna(inplace=True)
            tgt_false.dropna(inplace=True)
            plt.subplot(numVar,1,i+1)
            sns.distplot(tgt_true,rug=drawAll,kde=drawAll,label='Target: True')
            sns.distplot(tgt_false,rug=drawAll,kde=drawAll,label='Target: False')
            plt.legend()
    except TypeError as error:
        print(error)
        print('Features must be ints/floats, not objects')

    # True wherever the KS test rejects, at the 5% level, that both populations share a distribution
    ks_hval_list = [pval < 0.05 for pval in ks_pval_list]

    return ks_hval_list, ks_pval_list

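# Hypothetical usage sketch for plot_kde_hist_var, assuming `train` has a binary 'TARGET'
# column and numeric feature columns (the feature names below are illustrative):
#
#   reject_h0, pvals = plot_kde_hist_var(train, ['EXT_SOURCE_1', 'DAYS_BIRTH'])
#   # reject_h0[i] is True when the KS test rejects, at the 5% level, that the
#   # Target=True and Target=False populations of the i-th variable share a distribution
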
'''
Given a dataframe and a list of feature variables, plots a histogram of
each feature variable.
'''
def plot_hist_var(df,varList):

    numVar = len(varList)

    plt.figure(figsize=(10,numVar*4))

    for i,var in enumerate(varList):
        # draw each feature's histogram in its own subplot
        plt.subplot(numVar,1,i+1)
        df[var].hist()
        plt.title(var)

    return

'''
Given a dataframe, information regarding the missing/null values
of the dataframe is produced.
'''
def print_tab_miss_val(df,miss_val_thresh=50,numColPrint=10,printData=False):
    # Evaluate missing values in the data
    num_miss_val = df.isnull().sum()
    pct_miss_val = num_miss_val/df.shape[0]*100

    tab_miss_val = pd.concat([num_miss_val,pct_miss_val],axis=1)
    tab_miss_val.columns = ['Missing Values','Percentage']
    tab_miss_val  = tab_miss_val[tab_miss_val['Missing Values']>0]
    tab_miss_val['Percentage'] = tab_miss_val['Percentage'].round(1)
    tab_miss_val.sort_values(['Percentage'],ascending=False,inplace=True)

    numCol_miss_val = tab_miss_val.shape[0]
    numCol_total = df.shape[1]
    pctCol_miss_val = round((numCol_miss_val/numCol_total)*100)

    numCol_crit_miss_val = tab_miss_val[tab_miss_val['Percentage'] > miss_val_thresh].shape[0]
    pctCol_crit_miss_val = round(numCol_crit_miss_val/numCol_total*100)

    info_miss_val = pd.Series(data=[numCol_miss_val,pctCol_miss_val,numCol_crit_miss_val,pctCol_crit_miss_val],
              index=['Cols Missing Values','Cols Missing Values (%)',
            'Cols Critical Missing Values', 'Cols Critical Missing Values (%)'])

    if printData==True:
        print(info_miss_val)
        print('\n Top {} columns with missing values are as follows:'.format(numColPrint))
        print(tab_miss_val['Percentage'].head(numColPrint))

    return info_miss_val, tab_miss_val

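# Hypothetical usage sketch for print_tab_miss_val: summarise missing data and print the
# 10 columns with the highest missing-value percentages:
#
#   miss_info, miss_tab = print_tab_miss_val(train, miss_val_thresh=50,
#                                            numColPrint=10, printData=True)
#   # miss_info summarises how many columns have (critical) missing values;
#   # miss_tab lists each affected column with its missing count and percentage
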
# basic helper function to print the values of a pandas Series as a single string
def convSeries2Str(seriesData):
    strList = ''
    for idx,val in seriesData.items():  # Series.iteritems() was removed in pandas 2.0
        strVal = '{}({}), '.format(idx,val)
        strList = strList + strVal

    return strList




'''
Returns a Series of basic information about the dataframe (shape, dtypes, memory
usage, target balance, and categorical columns).
'''
def print_basic_info_df(df,bal_thresh=30):

    (numRow,numCol) = df.shape
    memory = int(sys.getsizeof(df)/(10**6))

    dtypeVals = df.dtypes.value_counts()
    dtypeStr = convSeries2Str(dtypeVals)

    # extract the number of unique entries (including NaNs) of each string column
    catVals = df.select_dtypes('object').nunique(dropna=False)
    catStr = convSeries2Str(catVals)

    # is the dataframe balanced?
    if 'TARGET' in df:
        pctTarget_true = int(df['TARGET'].sum()/numRow*100)
        if pctTarget_true > 100-bal_thresh or pctTarget_true < bal_thresh:
            isBalanced='No'
        else:
            isBalanced='Yes'
    else:
        isBalanced='N/A'
        pctTarget_true='N/A'

    series_data = [numRow, numCol, dtypeStr,memory,pctTarget_true,isBalanced,catStr]
    series_idx = ['Num rows','Num cols','Dtype','Memory (MB)','True (%)','Is Balanced','Categorical cols']
    series_info = pd.Series(series_data,index = series_idx)

    return series_info

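# Hypothetical usage sketch for print_basic_info_df:
#
#   info = print_basic_info_df(train, bal_thresh=30)
#   print(info)
#   # Num rows, Num cols, Dtype, Memory (MB), True (%), Is Balanced, Categorical cols
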
'''
Provides a comparison of two dataframes.

Used to compare characteristics between a test and training dataset.
'''
def print_compare_df(df1,df2,miss_val_thresh=50,bal_thresh=30,printCompareData=False):

    # Prints combined basic data of each dataframe
    df1_basicinfo = print_basic_info_df(df1)
    df2_basicinfo = print_basic_info_df(df2)
    comb_basic_info = pd.concat([df1_basicinfo,df2_basicinfo],axis=1)

    # Compare missing value data
    miss_val_info_df1, miss_val_tab_df1 =  print_tab_miss_val(df1)
    miss_val_info_df2, miss_val_tab_df2 =  print_tab_miss_val(df2)
    comb_miss_val_info = pd.concat([miss_val_info_df1,miss_val_info_df2],axis=1)


    s1 = set(df1.dtypes)
    s2 = set(df2.dtypes)

    # Compare the two dataframes for the number of unique values in each categorical column.
    # As the training and test datasets are of different sizes, the training dataset may have
    # values in its feature columns that are not in the test dataset.
    # The code below flags feature columns whose unique-value counts differ by fewer than 5
    # (but are not equal) between the two datasets, and compares their value counts.
    if s1 == s2:
        for x in list(s1):

            df1_catCols = df1.select_dtypes(x).nunique(dropna=False)
            df2_catCols = df2.select_dtypes(x).nunique(dropna=False)
            diff_catColsList = df1_catCols - df2_catCols
            diff_catCols = diff_catColsList[(diff_catColsList<5) & (diff_catColsList>-5) & (diff_catColsList!=0)]
            for y in diff_catCols.index:
                df1_valCnt = df1[y].value_counts()
                df1_valCnt.name = df1_valCnt.name+'_DF1'
                df2_valCnt = df2[y].value_counts()
                df2_valCnt.name = df2_valCnt.name+'_DF2'
                comb_valCnt = pd.concat([df1_valCnt,df2_valCnt],axis=1)

                if printCompareData==True:
                    print(comb_valCnt)
                    # DataFrame.plot.bar creates its own figure, so no plt.figure() call is needed
                    comb_valCnt.plot.bar(rot=60,title=y)



    return comb_basic_info, comb_miss_val_info, miss_val_tab_df1, miss_val_tab_df2


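# Hypothetical usage sketch for print_compare_df, comparing a training and a test dataframe:
#
#   basic, miss_info, miss_tab_train, miss_tab_test = print_compare_df(
#       train, test, printCompareData=True)
#   # `basic` holds the two print_basic_info_df() summaries side by side and
#   # `miss_info` the two missing-value summaries
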
'''
Returns the names of the columns in which a certain value occurs, along with data on
the frequency of that value in each column.

Used when a dataframe contains certain placeholder values to denote NaNs.

Inputs:
df: input dataframe
val: the value to search for in each column

Outputs:
df_errCol: dataframe with the percentage of entries equal to the value in each affected column
errCol_list: list of the column names that contain the value
'''
def chk_val_col(df,val):
    errCol_list = [x for x in df if val in df[x].unique()]
    errPct_list = []
    for errCol in errCol_list:
        numAll = df.shape[0]
        numErr = df[df[errCol]==val].shape[0]
        errPct_list.append(numErr/numAll*100)

    df_errCol = pd.DataFrame(data=errPct_list,index=errCol_list,columns=['Error val %'])

    return df_errCol, errCol_list

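# Hypothetical usage sketch for chk_val_col: find columns that use a placeholder value
# (the value 365243 below is purely illustrative) to denote missing data:
#
#   err_pct, err_cols = chk_val_col(train, 365243)
#   # err_pct lists, for each affected column, the percentage of rows equal to that value
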
'''
Replaces all error values in a specified list of columns of a dataframe with np.nan.

Inputs:
df: DataFrame
errCol_list: list of column names in the DataFrame that contain the error value
errVal: the error value

Outputs:
df: dataframe with all error values in each specified column replaced by np.nan
'''
def fill_errorVal_df(df,errCol_list,errVal):

    for errCol in errCol_list:
        # assign back rather than using inplace=True to avoid chained-assignment issues
        df[errCol] = df[errCol].replace({errVal: np.nan})

    return df

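# Hypothetical usage sketch for fill_errorVal_df, continuing the example above: replace
# the placeholder value in the affected columns with NaN:
#
#   train = fill_errorVal_df(train, err_cols, 365243)
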
'''
Plots bar charts of the most/least important features in a dataset after a Random Forest/GBT model fit.

Inputs:
df: DataFrame with the feature importances in a column named `Importance` and the feature names in a column (or index) named `Feature`
numFeat: number of top/bottom features to include in each plot

Outputs:
Produces bar charts of the most important and least important features in the DataFrame.
'''
def plot_feat_importance(df,numFeat=10):

    df = df.sort_values('Importance',ascending=False).reset_index()
    top_feat = df.head(numFeat)
    bottom_feat = df.tail(numFeat)

    fig,axes = plt.subplots(1,2,figsize=(15,10))
    ax0 = sns.barplot(x='Feature',y='Importance',data=top_feat, ax=axes[0])
    ax0.set_title('Top {} features'.format(numFeat))
    for item in ax0.get_xticklabels():
        item.set_rotation(90)
    ax1 = sns.barplot(x='Feature',y='Importance',data=bottom_feat, ax=axes[1])
    for item in ax1.get_xticklabels():
        item.set_rotation(90)
    ax1.set_title('Bottom {} features'.format(numFeat))

    return
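
# Hypothetical usage sketch for plot_feat_importance, assuming a fitted scikit-learn style
# model exposing feature_importances_ (the variable names are illustrative):
#
#   feat_imp = pd.DataFrame({'Feature': feature_names,
#                            'Importance': model.feature_importances_})
#   plot_feat_importance(feat_imp, numFeat=10)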