rand_eda_old.py (Source)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 17:06:06 2019

This code consists of helper functions to perform Exploratory Data Analysis (EDA)

@author: randlow
"""

import pandas as pd
import matplotlib.pyplot as plt
import sys
import seaborn as sns
import scipy as sp
import numpy as np
import shelve
from sklearn import preprocessing

'''
Label-encodes every object (string) column that has at most `cat_limit`
unique entries (default: two), counting NaN as an entry.
'''

def label_encoding_df(df,cat_limit = 2):
    """Label-encode the low-cardinality object columns of ``df`` in place.

    A column is encoded when its dtype is ``object`` and it has at most
    ``cat_limit`` distinct entries (NaN counts as an entry). The name of each
    encoded column is printed, followed by a one-line summary.

    Parameters
    ----------
    df : DataFrame whose qualifying columns are overwritten with integer codes.
    cat_limit : maximum number of distinct entries a column may have to be encoded.

    Returns
    -------
    (df, label_encode_list) : the same (mutated) DataFrame and the list of
    column names that were encoded.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_cols = []

    for name in df:
        column = df[name]
        # Guard clauses: skip anything that is not a small object column.
        if column.dtype != 'object':
            continue
        if column.nunique(dropna=False) > cat_limit:
            continue

        print(name)
        # The single encoder is re-fitted for every column.
        df[name] = encoder.fit(column).transform(column)
        encoded_cols.append(name)

    print('{0} columns were label encoded'.format(len(encoded_cols)))

    return df, encoded_cols

'''
Plots the KDE and histogram of each feature variable, split by target class.
Given a data frame and a list of feature variables, this function will
produce the KDE and histogram plots for the Target=True and Target=False populations.
It also calculates statistical differences between both populations.
Use this function to graphically evaluate whether certain feature variables exhibit
different characteristics for the Target=True and Target=False populations.
'''
def plot_kde_hist_var(df,varList,calcStat = True, drawAll = False):
    """Plot each feature's distribution for TARGET==1 vs TARGET==0 and compare them.

    For every name in ``varList`` the rows of ``df`` are split on
    ``df['TARGET'] == 1`` and ``df['TARGET'] == 0``. The median of each group
    and the correlation of the feature with ``TARGET`` are printed, and both
    groups are drawn with ``sns.distplot`` in their own subplot row.

    Parameters
    ----------
    df : DataFrame containing a ``TARGET`` column and the columns in ``varList``.
    varList : list of feature column names to plot.
    calcStat : when truthy, run a two-sample Kolmogorov-Smirnov test per feature.
    drawAll : forwarded to ``sns.distplot`` as both ``rug`` and ``kde``.

    Returns
    -------
    (ks_hval_list, ks_pval_list) : two lists aligned with the features that
    were processed. ``ks_pval_list`` holds the KS p-values and ``ks_hval_list``
    holds ``True`` where the p-value is below 0.05, ``False`` otherwise. Both
    lists are empty when ``calcStat`` is falsy or ``varList`` is empty.

    Notes
    -----
    A ``TypeError`` raised while processing a feature (e.g. an object column)
    is printed and stops the loop; results gathered so far are still returned.
    """
    ks_pval_list = []
    numVar = len(varList)
    if numVar == 0:
        # Nothing to plot; also avoids requesting a zero-height figure.
        return [], ks_pval_list

    plt.figure(figsize=(10,numVar*4))
    try:
        for i,var in enumerate(varList):
            # Drop NaNs once, on a fresh copy: the KS test and distplot both
            # need NaN-free data, and this avoids mutating a slice of df.
            tgt_true = df.loc[df['TARGET']==1,var].dropna()
            tgt_false = df.loc[df['TARGET']==0,var].dropna()

            # calculate statistical significance between both populations
            if calcStat:
                if len(tgt_true) > 0 and len(tgt_false) > 0:
                    (_,ks_pval) = sp.stats.ks_2samp(tgt_true,tgt_false)
                else:
                    # The test is undefined for an empty group; keep the
                    # p-value list aligned with the features regardless.
                    ks_pval = float('nan')
                ks_pval_list.append(ks_pval)

            # summary statistics (median ignores NaN, so dropping them above
            # does not change these values)
            median_tgt_true = tgt_true.median()
            median_tgt_false = tgt_false.median()
            corrVal = df['TARGET'].corr(df[var])
            print('Median Value of {} when Target (True): {:.6f}'.format(var,median_tgt_true))
            print('Median Value of {} when Target (False): {:.6f}'.format(var,median_tgt_false))
            print('Pearson Correlation of {} with Target (True): {:.6f}'.format(var,corrVal))

            # drawing KDE distributions
            # NOTE(review): sns.distplot is deprecated in recent seaborn
            # releases - confirm against the installed version.
            plt.subplot(numVar,1,i+1)
            sns.distplot(tgt_true,rug=drawAll,kde=drawAll,label='Target: True')
            sns.distplot(tgt_false,rug=drawAll,kde=drawAll,label='Target: False')
            plt.legend()
    except TypeError as error:
        print(error)
        print('Features are objects.  Need ints/floats')

    # One flag per p-value, so the two returned lists line up element-wise.
    ks_hval_list = [bool(pval < 0.05) for pval in ks_pval_list]

    return ks_hval_list, ks_pval_list

'''
Given a dataframe and a list of feature variables, the histogram of
the feature variables is produced
'''
def plot_hist_var(df,varList):
    """Draw one histogram per feature, stacked vertically in a single figure.

    Parameters
    ----------
    df : DataFrame containing the columns named in ``varList``.
    varList : list of column names to plot.

    Returns
    -------
    None. An empty ``varList`` returns immediately without creating a figure.
    """
    numVar = len(varList)
    if numVar == 0:
        return

    # The figure height grows with the number of variables, so each variable
    # gets its own subplot row instead of all being drawn on the same axes.
    plt.figure(figsize=(10,numVar*4))

    for i,var in enumerate(varList):
        ax = plt.subplot(numVar,1,i+1)
        df[var].hist(ax=ax)
        ax.set_title(var)

    return

'''
Given a dataframe, information regarding the missing/null values
of the dataframe is produced.
'''
def print_tab_miss_val(df,miss_val_thresh=50,numColPrint=10,printData=False):
    """Summarise the missing (null) values of ``df``.

    Parameters
    ----------
    df : DataFrame to inspect.
    miss_val_thresh : a column is "critical" when its rounded missing
        percentage is strictly greater than this value.
    numColPrint : number of rows of the per-column table to print.
    printData : when True, print the summary and the top of the table.

    Returns
    -------
    (info_miss_val, tab_miss_val)
        info_miss_val : Series with the number and percentage of columns that
            have missing values, and the number and percentage of critical
            columns. Percentages are relative to the total column count and
            are reported as 0 when ``df`` has no columns.
        tab_miss_val : DataFrame indexed by column name with 'Missing Values'
            and 'Percentage' (rounded to 1 decimal), restricted to columns
            with at least one missing value, sorted by percentage descending.
    """
    numRow_total, numCol_total = df.shape

    # Evaluate missing values in the data
    num_miss_val = df.isnull().sum()
    pct_miss_val = num_miss_val/numRow_total*100

    tab_miss_val = pd.DataFrame({'Missing Values': num_miss_val,
                                 'Percentage': pct_miss_val})
    # .copy() so the edits below act on an independent frame rather than on a
    # filtered view (avoids pandas' chained-assignment warning).
    tab_miss_val = tab_miss_val[tab_miss_val['Missing Values']>0].copy()
    tab_miss_val['Percentage'] = tab_miss_val['Percentage'].round(1)
    tab_miss_val = tab_miss_val.sort_values(['Percentage'],ascending=False)

    def _pct_of_cols(count):
        # Share of all columns, as a whole percent; 0 for a column-less frame
        # instead of a ZeroDivisionError.
        if numCol_total == 0:
            return 0
        return round(count/numCol_total*100)

    numCol_miss_val = tab_miss_val.shape[0]
    numCol_crit_miss_val = tab_miss_val[tab_miss_val['Percentage'] > miss_val_thresh].shape[0]

    info_miss_val = pd.Series(
        data=[numCol_miss_val,_pct_of_cols(numCol_miss_val),
              numCol_crit_miss_val,_pct_of_cols(numCol_crit_miss_val)],
        index=['Cols Missing Values','Cols Missing Values (%)',
               'Cols Critical Missing Values', 'Cols Critical Missing Values (%)'])

    if printData==True:
        print(info_miss_val)
        print('\n Top {} columns with missing values is as follows:'.format(numColPrint))
        print(tab_miss_val['Percentage'].head(numColPrint))

    return info_miss_val, tab_miss_val

# Basic helper function that formats the index/value pairs of a pandas Series as one string
def convSeries2Str(seriesData):
    """Render a Series as a single ``'index(value), index(value), '`` string.

    Every entry contributes ``'{index}({value}), '``, so a non-empty result
    ends with a trailing comma and space; an empty Series gives ``''``.

    Parameters
    ----------
    seriesData : pandas Series (any object exposing ``.items()`` works).

    Returns
    -------
    str
    """
    # Series.iteritems() was removed in pandas 2.0; .items() is the
    # equivalent that exists in every pandas version.
    return ''.join('{}({}), '.format(idx,val) for idx,val in seriesData.items())

'''
prints basic information regarding the dataframe
'''
def print_basic_info_df(df,bal_thresh=30):
    """Collect basic descriptive information about ``df`` into a Series.

    Parameters
    ----------
    df : DataFrame to describe.
    bal_thresh : the frame is reported as balanced when the ``TARGET``
        percentage lies within ``[bal_thresh, 100 - bal_thresh]``.

    Returns
    -------
    Series indexed by 'Num rows', 'Num cols', 'Dtype', 'Memory (MB)',
    'True (%)', 'Is Balanced' and 'Categorical cols'.
        * 'Dtype' lists each dtype with the number of columns that have it.
        * 'Memory (MB)' is ``sys.getsizeof(df)`` divided by 10**6, truncated.
        * 'True (%)' is ``sum(TARGET) / rows * 100`` truncated to an int
          (assumes TARGET is coded 0/1 - confirm against the data).
        * 'Is Balanced' is 'True' or 'No'.
        * 'True (%)' and 'Is Balanced' are 'N/A' when there is no ``TARGET``
          column or the frame has no rows.
        * 'Categorical cols' lists each object column with its number of
          unique entries, NaN included.
    """
    (numRow,numCol) = df.shape
    memory = int(sys.getsizeof(df)/(10**6))

    dtypeStr = convSeries2Str(df.dtypes.value_counts())

    # Unique entries (NaN included) of every object/string column.
    catStr = convSeries2Str(df.select_dtypes('object').nunique(dropna=False))

    # Is the dataframe balanced?
    # The row-count guard keeps an empty frame from dividing by zero and then
    # failing on int(nan).
    if 'TARGET' in df and numRow > 0:
        pctTarget_true = int(df['TARGET'].sum()/numRow*100)
        if pctTarget_true > 100-bal_thresh or pctTarget_true < bal_thresh:
            isBalanced='No'
        else:
            isBalanced='True'
    else:
        isBalanced='N/A'
        pctTarget_true='N/A'

    series_data = [numRow, numCol, dtypeStr,memory,pctTarget_true,isBalanced,catStr]
    series_idx = ['Num rows','Num cols','Dtype','Memory (MB)','True (%)','Is Balanced','Categorical cols']

    return pd.Series(series_data,index = series_idx)

'''
Provides a comparison of two dataframes.

Used to compare characteristics between a test and training dataset.
'''
def print_compare_df(df1,df2,miss_val_thresh=50,bal_thresh=30,printCompareData=False):
    """Compare two DataFrames (e.g. a training and a test set) side by side.

    Parameters
    ----------
    df1, df2 : the DataFrames to compare.
    miss_val_thresh : forwarded to ``print_tab_miss_val`` for both frames.
    bal_thresh : forwarded to ``print_basic_info_df`` for both frames.
    printCompareData : when True, print and bar-plot the value counts of each
        column whose number of unique entries differs between the frames.

    Returns
    -------
    (comb_basic_info, comb_miss_val_info, miss_val_tab_df1, miss_val_tab_df2)
        comb_basic_info : basic info of both frames, one column per frame.
        comb_miss_val_info : missing-value summary of both frames, one column
            per frame.
        miss_val_tab_df1, miss_val_tab_df2 : per-column missing-value tables.
    """
    # Combined basic data of each dataframe. The thresholds are passed through
    # so that the caller's arguments actually take effect.
    df1_basicinfo = print_basic_info_df(df1,bal_thresh=bal_thresh)
    df2_basicinfo = print_basic_info_df(df2,bal_thresh=bal_thresh)
    comb_basic_info = pd.concat([df1_basicinfo,df2_basicinfo],axis=1)

    # Compare missing value data
    miss_val_info_df1, miss_val_tab_df1 =  print_tab_miss_val(df1,miss_val_thresh=miss_val_thresh)
    miss_val_info_df2, miss_val_tab_df2 =  print_tab_miss_val(df2,miss_val_thresh=miss_val_thresh)
    comb_miss_val_info = pd.concat([miss_val_info_df1,miss_val_info_df2],axis=1)

    s1 = set(df1.dtypes)
    s2 = set(df2.dtypes)

    # The two frames may be of different sizes, so one may contain values in a
    # column that the other does not. When both frames use the same set of
    # dtypes, look (dtype by dtype) for columns whose count of unique entries
    # differs between the frames by a non-zero amount strictly between -5 and 5,
    # and show their value counts side by side.
    if s1 == s2:
        for x in s1:

            df1_catCols = df1.select_dtypes(x).nunique(dropna=False)
            df2_catCols = df2.select_dtypes(x).nunique(dropna=False)
            # Columns present in only one frame give NaN here and are dropped
            # by the comparisons below.
            diff_catColsList = df1_catCols - df2_catCols
            diff_catCols = diff_catColsList[(diff_catColsList<5) & (diff_catColsList>-5) & (diff_catColsList!=0)]
            for y in diff_catCols.index:
                # Name the counts after the column itself: the name that
                # value_counts() assigns differs between pandas versions, and
                # string concatenation fails for non-string column labels.
                df1_valCnt = df1[y].value_counts()
                df1_valCnt.name = '{}_DF1'.format(y)
                df2_valCnt = df2[y].value_counts()
                df2_valCnt.name = '{}_DF2'.format(y)
                comb_valCnt = pd.concat([df1_valCnt,df2_valCnt],axis=1)

                if printCompareData==True:
                    print(comb_valCnt)
                    plt.figure()
                    comb_valCnt.plot.bar(rot=60,title=y)

    return comb_basic_info, comb_miss_val_info, miss_val_tab_df1, miss_val_tab_df2


'''
Returns the column name if a certain value occurs in any column of the dataframe.
Returns data on the frequency of that value in the column.

Used when dataframe contain certain types of values to denote NaNs.

Inputs:
df
val

Outputs:
df_errCol
errCol_list
'''
def chk_val_col(df,val):
    """Find the columns of ``df`` that contain ``val`` and how often it occurs.

    Parameters
    ----------
    df : DataFrame to scan.
    val : the value to look for (e.g. a sentinel used in place of NaN).

    Returns
    -------
    (df_errCol, errCol_list)
        df_errCol : DataFrame indexed by column name with a single column
            'Error val %' holding the percentage of rows equal to ``val``.
        errCol_list : names of the columns in which ``val`` occurs.
    """
    total_rows = df.shape[0]

    # Columns whose set of unique entries includes the value.
    errCol_list = []
    for col in df:
        if val in df[col].unique():
            errCol_list.append(col)

    # Percentage of rows holding the value, per affected column.
    pct_by_col = [df[df[col]==val].shape[0]/total_rows*100 for col in errCol_list]

    df_errCol = pd.DataFrame(data=pct_by_col,index=errCol_list,columns=['Error val %'])

    return df_errCol, errCol_list



        
    



'''
Replaces all error values in a specified list of columns in a dataframe with np.NaN

Inputs:
df: DataFrame
errCol_list: List of column names in the DataFrame where the error values are
errVal: The error value

Outputs:
df: Returns a dataframe with all the error values in each specified column in the dataframe with np.NaN
'''
def fill_errorVal_df(df,errCol_list,errVal):
    """Replace ``errVal`` with ``np.nan`` in the listed columns of ``df``.

    Parameters
    ----------
    df : DataFrame to clean; it is modified and also returned.
    errCol_list : names of the columns in which to replace the value.
    errVal : the value that stands in for a missing entry.

    Returns
    -------
    The same DataFrame object, with ``errVal`` replaced by NaN in every
    column of ``errCol_list``. Other columns are left untouched.
    """
    for errCol in errCol_list:
        # Assign the result back rather than calling replace(inplace=True) on
        # df[errCol]: the in-place form acts on a temporary Series and does
        # not reach df when pandas Copy-on-Write is active.
        df[errCol] = df[errCol].replace({errVal: np.nan})

    return df

'''
Plots a bar chart of the most/least important features in a dataset after Random Forest/GBT model fit.

Inputs:
df: DataFrame with a column named `Importance` that was extracted from the Random Forest/GBT feature importance
numFeat: Number of top/bottom features to produce in the plot

Outputs:
Produces the most important and least important features in the DataFrame.
'''
def plot_feat_importance(df,numFeat=10):
    """Bar-plot the most and least important features side by side.

    Parameters
    ----------
    df : DataFrame with an 'Importance' column. After ``reset_index()`` it
        must also expose a 'Feature' column (presumably the index is named
        'Feature' or the column already exists - verify against the caller).
    numFeat : number of features shown in each of the two panels.

    Returns
    -------
    None. Draws a 1x2 figure: top ``numFeat`` features on the left, bottom
    ``numFeat`` on the right, with x tick labels rotated by 90 degrees.
    """
    ranked = df.sort_values('Importance',ascending=False).reset_index()

    # (data, title) for the left and right panels, in drawing order.
    panels = [
        (ranked.head(numFeat), 'Top {} features'.format(numFeat)),
        (ranked.tail(numFeat), 'Bottom {} features'.format(numFeat)),
    ]

    _, axes = plt.subplots(1,2,figsize=(15,10))
    for axis, (subset, title) in zip(axes, panels):
        drawn = sns.barplot(x='Feature',y='Importance',data=subset, ax=axis)
        drawn.set_title(title)
        for label in drawn.get_xticklabels():
            label.set_rotation(90)

    return