|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 17:06:06 2019
This code consists of helper functions to perform Exploratory Data Analysis (EDA)
@author: randlow
"""
import pandas as pd
import matplotlib.pyplot as plt
import sys
import seaborn as sns
import scipy as sp
import numpy as np
import shelve
from sklearn import preprocessing
'''
Takes a two-level multi-index and flattens it into a one-level
inputs
------
df (dataframe): a df with a multi_index column
outputs
-------
df dataframe: returns df with single index columns
'''
def flatten_multi_index(df):
colName = []
for idx0,val0 in enumerate(df.columns.levels[0]):
for idx1,val1 in enumerate(df.columns.levels[1]):
combine = (val0,val1)
colName.append('_'.join(combine))
df.columns = colName
return df
'''
Performs aggregation statistical functions on numeric columns of a dataset and
collapses the multi index into a single index.
Inputs
------
df (dataframe): dataframe to perform aggregation operations on
groupby_var (string): this is the column to group by
drop_var (list): list of columns to drop. These are columns that add no value when aggregation operations are applied (e.g., ID columns)
Outputs
-------
df_agg : a dataframe with a flattened multi_index and aggregate statistics across all numeric columns
'''
def agg_num(df, groupby_var, drop_var_list=list(), df_name=''):
for col in df:
for drop_var in drop_var_list:
if col == drop_var:
df.drop(columns = col,inplace=True)
# extract out the groupby ID variable, extract all numeric columns, then add back
# the groupby ID variable
groupby_ids = df[groupby_var]
df_numeric = df.select_dtypes('number')
df_numeric[groupby_var] = groupby_ids
# group the dataframe accordingly
df_agg = df_numeric.groupby(groupby_var).agg(['mean','median','sum','count','max','min'])
# flatten the multi index naming convention to a single index
# prefix the column names with a dataframe name (important if we merge this data into a training dataframe)
# re-insert the groupby index data
df_agg = flatten_multi_index(df_agg)
df_agg = df_agg.add_prefix('{}_'.format(df_name))
return df_agg
'''
Performs count functions on categorical columns of a dataset and collapses the multi multi_index
into a single index
Inputs
------
df (dataframe): input DataFrame
groupby_var (string): name of variable to groupby
Outputs
-------
df (dataframe): ouptut dataframe
'''
def cnt_cat(df,groupby_var,df_name):
groupby_ids = df[groupby_var]
df_cat= pd.get_dummies(df.select_dtypes('object'))
df_cat[groupby_var] = groupby_ids
df_cat_agg = df_cat.groupby(groupby_var).agg(['sum','mean'])
df_cat_agg = flatten_multi_index(df_cat_agg)
df_cat_agg = df_cat_agg.add_prefix('{}_'.format(df_name))
return df_cat_agg
'''
prints detailed categorical information from the dataframe
inputs: dataframe
outputs: nothing
'''
def extract_cat_var(df):
cat_colnames = list(df.select_dtypes('object').columns)
for col in cat_colnames:
print('Categorical column: {}\n'.format(col))
print('Number of unique entries: {}\n'.format(df[col].nunique()))
print('Unique entry names:\n{}\n'.format(df[col].unique()))
print('Value counts of each entry:\n{}\n'.format(df[col].value_counts(dropna=False)))
print('---------------------------------------------------------')
return
'''
Performs Label encoding with a default of two unique entries
per category
'''
def label_encoding_df(df,cat_limit = 2):
le = preprocessing.LabelEncoder()
le_count = 0
label_encode_list = []
for col in df:
if df[col].dtype=='object':
if df[col].nunique(dropna=False) <= cat_limit:
print(col)
le_count += 1
le.fit(df[col])
df[col] = le.transform(df[col])
label_encode_list.append(col)
print('{0} columns were label encoded'.format(le_count))
return df, label_encode_list
'''
Given a data frame and a list of feature variables,
* Produce the KDE and histogram plots for the Target=True and Target=False populations.
* Statistical differences between both populations.
Use this function to graphically evaluate whether certain feature variables exhibit
different characterstics for the Target=True and Target=False populations
'''
def plot_kde_hist_var(df,varList,calcStat = True, drawAll = False):
numVar = len(varList)
plt.figure(figsize=(10,numVar*4))
ks_stat_list = []
ks_pval_list = []
try:
for i,var in enumerate(varList):
tgt_true = df.loc[df['TARGET']==1,var]
tgt_false = df.loc[df['TARGET']==0,var]
# calculate statistical significance between both populations
if calcStat == True:
(ks_stat,ks_pval)= sp.stats.ks_2samp(tgt_true,tgt_false)
ks_stat_list.append(ks_stat)
ks_pval_list.append(ks_pval)
ks_hval_list = [True for hyp in ks_pval_list if hyp<0.05]
#
median_tgt_true = tgt_true.median()
median_tgt_false = tgt_false.median()
corrVal = df['TARGET'].corr(df[var])
print('Median Value of {} when Target (True): {:.6f}'.format(var,median_tgt_true))
print('Median Value of {} when Target (False): {:.6f}'.format(var,median_tgt_false))
print('Pearson Correlation of {} with Target (True): {:.6f}'.format(var,corrVal))
# drawing KDE distributions
tgt_true.dropna(inplace=True) # require to dropna for sns.distplot function
tgt_false.dropna(inplace=True)
plt.subplot(numVar,1,i+1)
sns.distplot(tgt_true,rug=drawAll,kde=drawAll,label='Target: True')
sns.distplot(tgt_false,rug=drawAll,kde=drawAll,label='Target: False')
plt.legend()
#plt.title(var)
except TypeError as error:
print(error)
print('Features are objects. Need ints/floats')
return ks_hval_list, ks_pval_list
'''
Given a dataframe and a list of feature variables, the histogram of
the feature variables is produced
'''
def plot_hist_var(df,varList):
numVar = len(varList)
plt.figure(figsize=(10,numVar*4))
for i,var in enumerate(varList):
df[var].hist()
return
'''
Given a dataframe, information regarding the missing/null values
of the dataframe is produced.
'''
def print_tab_miss_val(df,miss_val_thresh=50,numColPrint=10,printData=False):
# Evaluate missing values in the data
num_miss_val = df.isnull().sum()
pct_miss_val = num_miss_val/df.shape[0]*100
tab_miss_val = pd.concat([num_miss_val,pct_miss_val],axis=1)
tab_miss_val.columns = ['Missing Values','Percentage']
tab_miss_val = tab_miss_val[tab_miss_val['Missing Values']>0]
tab_miss_val['Percentage'] = tab_miss_val['Percentage'].round(1)
tab_miss_val.sort_values(['Percentage'],ascending=False,inplace=True)
numCol_miss_val = tab_miss_val.shape[0]
numCol_total = df.shape[1]
pctCol_miss_val = round((numCol_miss_val/numCol_total)*100)
numCol_crit_miss_val = tab_miss_val[tab_miss_val['Percentage'] > miss_val_thresh].shape[0]
pctCol_crit_miss_val = round(numCol_crit_miss_val/numCol_total*100)
info_miss_val = pd.Series(data=[numCol_miss_val,pctCol_miss_val,numCol_crit_miss_val,pctCol_crit_miss_val],
index=['Cols Missing Values','Cols Missing Values (%)',
'Cols Critical Missing Values', 'Cols Critical Missing Values (%)'])
if printData==True:
print(info_miss_val)
print('\n Top {} columns with missing values is as follows:'.format(numColPrint))
print(tab_miss_val['Percentage'].head(numColPrint))
return info_miss_val, tab_miss_val
# basic helper function to help print values that are in a series dataformat
def convSeries2Str(seriesData):
strList = ''
for idx,val in seriesData.iteritems():
strVal = '{}({}), '.format(idx,val)
strList = strList + strVal
return strList
'''
prints basic information regarding the dataframe
'''
def print_basic_info_df(df,bal_thresh=30):
(numRow,numCol) = df.shape
memory = int(sys.getsizeof(df)/(10**6))
dtypeVals = df.dtypes.value_counts()
dtypeStr = convSeries2Str(dtypeVals)
# Extract the unique variables of each column that are strings, and extract the unique variables including NaNs
catVals = df.select_dtypes('object').nunique(dropna=False)
catStr = convSeries2Str(catVals)
# Is the dataframe balanced?
if 'TARGET' in df:
(numRow,numCol) = df.shape
pctTarget_true = int(df['TARGET'].sum()/numRow*100)
if pctTarget_true > 100-bal_thresh or pctTarget_true < bal_thresh:
isBalanced='No'
else:
isBalanced='True'
else:
isBalanced='N/A'
pctTarget_true='N/A'
series_data = [numRow, numCol, dtypeStr,memory,pctTarget_true,isBalanced,catStr]
series_idx = ['Num rows','Num cols','Dtype','Memory (MB)','True (%)','Is Balanced','Categorical cols']
series_info = pd.Series(series_data,index = series_idx)
dict_info = [{'Num rows': numRow, 'Num cols': numCol,'Dtype': dtypeStr,
'Memory (MB)': memory,'True (%)': pctTarget_true,'Is Balanced':isBalanced,
'Category cols': catStr} ]
return series_info
'''
Provides a comparison of two dataframes.
Used to compare characteristics between a test and training dataset.
'''
def print_compare_df(df1,df2,miss_val_thresh=50,bal_thresh=30,printCompareData=False):
# Prints combined basic data of each dataframe
df1_basicinfo = print_basic_info_df(df1)
df2_basicinfo = print_basic_info_df(df2)
comb_basic_info = pd.concat([df1_basicinfo,df2_basicinfo],axis=1)
# Compare missing value data
miss_val_info_df1, miss_val_tab_df1 = print_tab_miss_val(df1)
miss_val_info_df2, miss_val_tab_df2 = print_tab_miss_val(df2)
comb_miss_val_info = pd.concat([miss_val_info_df1,miss_val_info_df2],axis=1)
s1 = set(df1.dtypes)
s2 = set(df2.dtypes)
# Compare two dataframes for number of missing categories, and values in each category
# As the training and test datasets are of different sizes, the training dataset may have values
# in the feature columns that are not in the test datasets.
# This code analyzes whether there are more than 5 different unique variables between feature columns
# of the test and training datasets.
if s1 == s2:
for x in list(s1):
df1_catCols = df1.select_dtypes(x).nunique(dropna=False)
df2_catCols = df2.select_dtypes(x).nunique(dropna=False)
diff_catColsList = df1_catCols - df2_catCols
diff_catCols = diff_catColsList[(diff_catColsList<5) & (diff_catColsList>-5) & (diff_catColsList!=0)]
for y in diff_catCols.index:
df1_valCnt = df1[y].value_counts()
df1_valCnt.name = df1_valCnt.name+'_DF1'
df2_valCnt = df2[y].value_counts()
df2_valCnt.name = df2_valCnt.name+'_DF2'
comb_valCnt = pd.concat([df1_valCnt,df2_valCnt],axis=1)
if printCompareData==True:
print(comb_valCnt)
plt.figure()
comb_valCnt.plot.bar(rot=60,title=y)
return comb_basic_info, comb_miss_val_info, miss_val_tab_df1, miss_val_tab_df2
'''
Returns the column name if a certain value occurs in any column of the dataframe.
Returns data on the frequency of that value in the column.
Used when dataframe contain certain types of values to denote NaNs.
Inputs:
df
val
Outputs:
df_errCol
errCol_list
'''
def chk_val_col(df,val):
errCol_list = [x for x in df if val in df[x].unique()]
errPct_list = []
for errCol in errCol_list:
numAll = df.shape[0]
numErr = df[df[errCol]==val].shape[0]
errPct_list.append(numErr/numAll*100)
df_errCol = pd.DataFrame(data=errPct_list,index=errCol_list,columns=['Error val %'])
errCol_Pct_list = list(zip(errCol_list,errPct_list))
return df_errCol, errCol_list
'''
Replaces all error values in a specified list of columns in a dataframe with np.NaN
Inputs:
df: DataFrame
errCol_list: List of column names in the DataFrame where the error values are
errVal: The error value
Outputs:
df: Returns a dataframe with all the error values in each specified column in the dataframe with np.NaN
'''
def fill_errorVal_df(df,errCol_list,errVal):
for errCol in errCol_list:
df[errCol].replace({errVal: np.nan},inplace=True)
return df
'''
Plots a bar chart of the most/least important features in a dataset after Random Forest/GBT model fit.
Inputs:
df: DataFrame with a column named `Importance` that was extracted from the Random Forest/GBT feature importance
numFeat: Number of top/bottom features to produce in the plot
Outputs:
Produces the most important and least important features in the DataFrame.
'''
def plot_feat_importance(df,numFeat=10):
df = df.sort_values('Importance',ascending=False).reset_index()
top_feat = df.head(numFeat)
bottom_feat = df.tail(numFeat)
fig,axes = plt.subplots(1,2,figsize=(15,10))
ax0 = sns.barplot(x='Feature',y='Importance',data=top_feat, ax=axes[0])
ax0.set_title('Top {} features'.format(numFeat))
for item in ax0.get_xticklabels():
item.set_rotation(90)
ax1 = sns.barplot(x='Feature',y='Importance',data=bottom_feat, ax=axes[1])
for item in ax1.get_xticklabels():
item.set_rotation(90)
ax1.set_title('Bottom {} features'.format(numFeat))
return
|