View output EDA.html.

Setup

setwd("~/projects/Classes/FoundationsOfDataScience_sliderule/github/capstone/GiveMeSomeCredit")

suppressMessages(library(dplyr)) # summarise
library(mice)
library(caret)
# library(psych)  # includes describe()

library(data.table)
library(FSelector)
library(ggplot2)

library(gridExtra)

Start exploring the data set

cs <- read.csv("cs-training.csv")

str(cs)

## 'data.frame':    150000 obs. of  12 variables:
##  $ X                                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SeriousDlqin2yrs                    : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ RevolvingUtilizationOfUnsecuredLines: num  0.766 0.957 0.658 0.234 0.907 ...
##  $ age                                 : int  45 40 38 30 49 74 57 39 27 57 ...
##  $ NumberOfTime30.59DaysPastDueNotWorse: int  2 0 1 0 1 0 0 0 0 0 ...
##  $ DebtRatio                           : num  0.803 0.1219 0.0851 0.036 0.0249 ...
##  $ MonthlyIncome                       : int  9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
##  $ NumberOfOpenCreditLinesAndLoans     : int  13 4 2 5 7 3 8 8 2 9 ...
##  $ NumberOfTimes90DaysLate             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ NumberRealEstateLoansOrLines        : int  6 0 0 0 1 1 3 0 0 4 ...
##  $ NumberOfTime60.89DaysPastDueNotWorse: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NumberOfDependents                  : int  2 1 0 0 0 1 0 0 NA 2 ...

# can drop the 'X' column, as it is just the observation number in sequence

# Gets rid of the "X" column
cs <- rbind(cs)[ , -1]  

# convert SeriousDlqin2yrs to a factor
cs$SeriousDlqin2yrs <- factor(cs$SeriousDlqin2yrs, 
                              levels = c(0, 1), labels = c("ok", "delinquent"))

nb_samples <- nrow(cs)

Out of the 150,000 samples, the rate of delinquency is 6.68% (10,026 observations).

Demographics

One of the ages is 0 - impute? or drop?

Many are over age 90. Are these accurate?

table(cs$NumberOfDependents)

## 
##     0     1     2     3     4     5     6     7     8     9    10    13 
## 86902 26316 19522  9483  2862   746   158    51    24     5     5     1 
##    20 
##     1

There are a couple of suspect outliers (>10) but not too bad.

Finances

DebtRatio

#boxplot(cs$DebtRatio)
boxplot(log(cs$DebtRatio + 1))

There are many outliers in this variable, with a median of 0.37 and a mean of 353.01

MonthlyIncome

### boxplot(cs$MonthlyIncome)
boxplot(log(cs$MonthlyIncome + 1))

summary(log(cs$MonthlyIncome + 1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   8.132   8.594   8.411   9.018  14.920   29731

There are 29,731 NAs in MonthlyIncome.

Creating additional Finances variables

# https://www.kaggle.com/c/GiveMeSomeCredit/forums/t/1166/congratulations-to-the-winners/7231#post7231
# "DebtRatio was computed by substituting 1 to MonthlyIncome, where
# MonthlyIncome was not available"

# create a derived variable called MonthlyExpenses taking the above NA
# disposition into account
cs$MonthlyExpenses <- ifelse(is.na(cs$MonthlyIncome),
                             1 * cs$DebtRatio,
                             cs$MonthlyIncome * cs$DebtRatio)

# create another derived variable called NetMonthlySurplus
cs$NetMonthlySurplus <- ifelse(is.na(cs$MonthlyIncome),
                             0 - cs$MonthlyExpenses,
                             cs$MonthlyIncome - cs$MonthlyExpenses)

Some boxplots for these created variables:

boxplot(log(1 + cs$MonthlyExpenses))

boxplot(log(cs$NetMonthlySurplus))

32,783 have MonthlyExpenses equalling DebtRatio, and 37,138 have NetMonthlySurplus of less than/equal to 1

table(cs$SeriousDlqin2yrs, cs$NetMonthlySurplus <= 1)

##             
##               FALSE   TRUE
##   ok         105248  34726
##   delinquent   7614   2412

The delinquency rate (our baseline) in the sample is 6.68%. The accuracy for primitive predictor cs$NetMonthlySurplus <= 1 is 71.77%, which is already greater than a baseline accuracy of 50%.

The sensitivity for primitive predictor cs$NetMonthlySurplus <= 1 is 24.06%, and the specificity is 75.19%.

For our model, we want a higher sensitivity than that to predict a much higher portion of the delinquent borrowers, since these are the costly borrowers.

Deliquencies

#table(cs$NumberOfTime30.59DaysPastDueNotWorse)
#table(cs$NumberOfTime60.89DaysPastDueNotWorse)
#table(cs$NumberOfTimes90DaysLate)

# there are 5 times of 96 and 264 times of 98 in each of these
# these are nonsensical values, and will be converted to NA

cs$NumberOfTime30.59DaysPastDueNotWorse <- ifelse(cs$NumberOfTime30.59DaysPastDueNotWorse >= 96, NA,
                                                  cs$NumberOfTime30.59DaysPastDueNotWorse)

cs$NumberOfTime60.89DaysPastDueNotWorse <- ifelse(cs$NumberOfTime60.89DaysPastDueNotWorse >= 96, NA,
                                                  cs$NumberOfTime60.89DaysPastDueNotWorse)

cs$NumberOfTimes90DaysLate <- ifelse(cs$NumberOfTimes90DaysLate >= 96, NA,
                                     cs$NumberOfTimes90DaysLate)

# perhaps we'll impute these or perhaps we'll leave them

Creating additional Deliquencies variables

The Delinquencies variables are all related. Let’s try to consolidate to a single new variable.

cs$ConsolidatedNumberOfDaysPastDue <- (cs$NumberOfTime30.59DaysPastDueNotWorse * 30) +
  (cs$NumberOfTime60.89DaysPastDueNotWorse * 60) +
  (cs$NumberOfTimes90DaysLate * 90)

# the following is just to demonstrate that >90 days in the data is not a subset
# of >60, and the calculation above is valid
t1 <- subset(cs, ConsolidatedNumberOfDaysPastDue > 0)
t1[2,c("NumberOfTime30.59DaysPastDueNotWorse", "NumberOfTime60.89DaysPastDueNotWorse", "NumberOfTimes90DaysLate", "ConsolidatedNumberOfDaysPastDue")]

##   NumberOfTime30.59DaysPastDueNotWorse
## 3                                    1
##   NumberOfTime60.89DaysPastDueNotWorse NumberOfTimes90DaysLate
## 3                                    0                       1
##   ConsolidatedNumberOfDaysPastDue
## 3                             120

#table(cs$ConsolidatedNumberOfDaysPastDue)
boxplot(ConsolidatedNumberOfDaysPastDue ~ SeriousDlqin2yrs, data = cs, 
        main = "Consolidated number of days past due")

The boxplot shows a positive correlation for delinquencies.

Existing Credit Utilization

### Existing Credit Utilization

# a few of these seem unrealistically high. are they accurate?
#table(cs$NumberRealEstateLoansOrLines)
summary(cs$NumberRealEstateLoansOrLines)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.018   2.000  54.000

#table(cs$NumberOfOpenCreditLinesAndLoans)
summary(cs$NumberOfOpenCreditLinesAndLoans)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.000   8.000   8.453  11.000  58.000

# some serious outliers in this variable, perhaps cutoff / normalize
boxplot(log(cs$RevolvingUtilizationOfUnsecuredLines))

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn

Additional data cleanup

zero age and NAs in MonthlyIncome, NumberOfDependents

One of the age observations is 0. let’s impute this observation, along with MonthlyIncome and NumberOfDependents NAs.

cs$age <- ifelse(cs$age == 0, NA, cs$age)
sum(is.na(cs$age))

## [1] 1

set.seed(144)

# Multiple imputation
simplified = cs[c("age", "DebtRatio", "MonthlyIncome", "NumberOfDependents", "SeriousDlqin2yrs", 
  "NumberRealEstateLoansOrLines", "NumberOfOpenCreditLinesAndLoans"
  )]

### This takes a long time so save the results
##imputed = complete(mice(simplified))
##save(imputed,file="imputed_simplified.Rda")
load("imputed_simplified.Rda")  # load value in R object 'imputed'

Here is what the imputed values look like:

# age
imputed[rownames(subset(cs, is.na(cs$age))),]$age

## [1] 40

# MonthlyIncome and NumberOfDependents imputed value summaries
imputed.mi <- imputed[rownames(subset(cs, is.na(cs$MonthlyIncome))),]$MonthlyIncome
imputed.nd <- imputed[rownames(subset(cs, is.na(cs$NumberOfDependents))),]$NumberOfDependents
#describe(imputed.mi)
boxplot(imputed.mi)

#describe(imputed.nd)
hist(imputed.nd)

Save the imputed values back to main dataframe

cs$age <- imputed$age
cs$MonthlyIncome <- imputed$MonthlyIncome
cs$NumberOfDependents <- imputed$NumberOfDependents

Delinquency NAs

For logisitic regression, we would like all our data to have no NAs, so will convert NAs in the delinquency fields to their respective median value, which in all cases is zero.

m30 <- median(cs$NumberOfTime30.59DaysPastDueNotWorse, na.rm = TRUE)
m60 <- median(cs$NumberOfTime60.89DaysPastDueNotWorse, na.rm = TRUE)
m90 <- median(cs$NumberOfTimes90DaysLate, na.rm = TRUE)
mCons <- median(cs$ConsolidatedNumberOfDaysPastDue, na.rm = TRUE)

cs$NumberOfTime30.59DaysPastDueNotWorse <- ifelse(is.na(cs$NumberOfTime30.59DaysPastDueNotWorse),
                                                  m30,
                                                  cs$NumberOfTime30.59DaysPastDueNotWorse)

cs$NumberOfTime60.89DaysPastDueNotWorse <- ifelse(is.na(cs$NumberOfTime60.89DaysPastDueNotWorse),
                                                  m60,
                                                  cs$NumberOfTime60.89DaysPastDueNotWorse)

cs$NumberOfTimes90DaysLate <- ifelse(is.na(cs$NumberOfTimes90DaysLate),
                                     m90,
                                     cs$NumberOfTimes90DaysLate)

cs$ConsolidatedNumberOfDaysPastDue <- ifelse(is.na(cs$ConsolidatedNumberOfDaysPastDue),
                                             mCons,
                                             cs$ConsolidatedNumberOfDaysPastDue)

Save cleaned version of data

# summary(cs)

# convert back to integer before saving
seriousd.int <- as.integer(factor(cs$SeriousDlqin2yrs))
seriousd.int <- ifelse(seriousd.int == 2, 1, 0)
cs$SeriousDlqin2yrs <- (seriousd.int)

write.table(cs, "cs-training-cleaned.csv", quote=F, row.names=F, sep=",")

Feature selection

Added use of FSelector package based on mentor feedback.

Had to spend a few hours getting FSelector package installed on my system.

`information.gain()` in `FSelector`

original_feature_weights <- information.gain(SeriousDlqin2yrs ~ 
                                               NumberOfDependents +
                                               NumberRealEstateLoansOrLines + 
                                               MonthlyIncome + 
                                               DebtRatio + 
                                               NumberOfOpenCreditLinesAndLoans + 
                                               age + 
                                               NumberOfTime60.89DaysPastDueNotWorse + 
                                               NumberOfTime30.59DaysPastDueNotWorse +
                                               NumberOfTimes90DaysLate +    
                                               RevolvingUtilizationOfUnsecuredLines, cs)

# Scale and order features
scale_factor = sum(original_feature_weights$attr_importance)
scaled_feature_weights <- original_feature_weights * (100 / scale_factor)
setorder(scaled_feature_weights, attr_importance)  
feature_names <- row.names(scaled_feature_weights)

# Took me an hour of screwing around to figure out how to have labels not get
# truncated in the plot.  Turned out you must set the mar par param with par;
# cannot be set in barplot()!! and you must set par after png()

#par(mar = c(5, 4, 4, 2) + 0.1)
png("original_varImpFSelector.png", width=8, height=6, units="in", res=100)
par(mar = c(5, 20, 4, 2) + 0.1)
barplot(scaled_feature_weights$attr_importance, 
        names.arg = feature_names,
        horiz = TRUE, las = 1,
        xlab = "Scaled relative feature importance", 
        main = "Original Feature Importance from information.gain()            ."
        )
dev.off()

## quartz_off_screen 
##                 2

feature_weights <- information.gain(SeriousDlqin2yrs ~ 
                                      NumberOfDependents +
                                      NumberRealEstateLoansOrLines + 
                                      MonthlyIncome + 
                                      NumberOfOpenCreditLinesAndLoans + 
                                      age + 
                                      MonthlyExpenses + 
                                      NetMonthlySurplus + 
#                                       NumberOfTime60.89DaysPastDueNotWorse + 
#                                       NumberOfTime30.59DaysPastDueNotWorse +
#                                       NumberOfTimes90DaysLate +    
                                      ConsolidatedNumberOfDaysPastDue +
                                      RevolvingUtilizationOfUnsecuredLines, cs)
print(feature_weights)

##                                      attr_importance
## NumberOfDependents                      0.0002863713
## NumberRealEstateLoansOrLines            0.0006043086
## MonthlyIncome                           0.0005528109
## NumberOfOpenCreditLinesAndLoans         0.0010893387
## age                                     0.0019202191
## MonthlyExpenses                         0.0002244544
## NetMonthlySurplus                       0.0005476050
## ConsolidatedNumberOfDaysPastDue         0.0198172721
## RevolvingUtilizationOfUnsecuredLines    0.0111463670

# Scale and order features
scale_factor = sum(feature_weights$attr_importance)
scaled_feature_weights <- feature_weights * (100 / scale_factor)
setorder(scaled_feature_weights, attr_importance)  
feature_names <- row.names(scaled_feature_weights)

png("varImpFSelector.png", width=8, height=6, units="in", res=100)
par(mar = c(5, 20, 4, 2) + 0.1)
barplot(scaled_feature_weights$attr_importance, 
        names.arg = feature_names,
        horiz = TRUE, las = 1,
        xlab = "Scaled relative feature importance", 
        main = "2nd Iteration Feat. Import. from information.gain()"
        )
dev.off()

## quartz_off_screen 
##                 2

# get a formula for the top features
subset_features <- cutoff.k(feature_weights, 6)
f <- as.simple.formula(subset_features, "SeriousDlqin2yrs")
print(f)

## SeriousDlqin2yrs ~ RevolvingUtilizationOfUnsecuredLines + ConsolidatedNumberOfDaysPastDue + 
##     NumberRealEstateLoansOrLines + MonthlyIncome + NumberOfDependents + 
##     NetMonthlySurplus
## <environment: 0x7fa26bc1d0e0>

FSelector information.gain() on original features

FSelector information.gain() on second iteration features

Some feature EDA

sample_avg = mean(cs$SeriousDlqin2yrs)
########################################
#  age
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$age, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_age <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_age))
for (i in seq_along(deciles_age)){
  pct_delinquent[i] <- 100 * mean(deciles_age[[i]]$SeriousDlqin2yrs)
}
# results
D1 <- data.frame(pct_delinquent, bucket)
# str(D1)
p1 <- ggplot(data = D1, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("age buckets")
p1

########################################
#  NumberOfDependents
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NumberOfDependents, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NumberOfDependents <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NumberOfDependents))
for (i in seq_along(deciles_NumberOfDependents)){
  pct_delinquent[i] <- 100 * mean(deciles_NumberOfDependents[[i]]$SeriousDlqin2yrs)
}
# results
D2 <- data.frame(pct_delinquent, bucket)
# str(D2)
p2 <- ggplot(data = D2, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NumberOfDependents buckets")
p2

########################################
#  DebtRatio
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$DebtRatio, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_DebtRatio <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_DebtRatio))
for (i in seq_along(deciles_DebtRatio)){
  pct_delinquent[i] <- 100 * mean(deciles_DebtRatio[[i]]$SeriousDlqin2yrs)
}
# results
D3 <- data.frame(pct_delinquent, bucket)
# str(D3)
p3 <- ggplot(data = D3, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("DebtRatio buckets")
p3

########################################
#  MonthlyIncome
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$MonthlyIncome, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_MonthlyIncome <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_MonthlyIncome))
for (i in seq_along(deciles_MonthlyIncome)){
  pct_delinquent[i] <- 100 * mean(deciles_MonthlyIncome[[i]]$SeriousDlqin2yrs)
}
# results
D4 <- data.frame(pct_delinquent, bucket)
# str(D4)
p4 <- ggplot(data = D4, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("MonthlyIncome buckets")
p4

########################################
#  MonthlyExpenses
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$MonthlyExpenses, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_MonthlyExpenses <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_MonthlyExpenses))
for (i in seq_along(deciles_MonthlyExpenses)){
  pct_delinquent[i] <- 100 * mean(deciles_MonthlyExpenses[[i]]$SeriousDlqin2yrs)
}
# results
D5 <- data.frame(pct_delinquent, bucket)
# str(D5)
p5 <- ggplot(data = D5, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("MonthlyExpenses buckets")
p5

########################################
#  NetMonthlySurplus
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NetMonthlySurplus, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NetMonthlySurplus <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NetMonthlySurplus))
for (i in seq_along(deciles_NetMonthlySurplus)){
  pct_delinquent[i] <- 100 * mean(deciles_NetMonthlySurplus[[i]]$SeriousDlqin2yrs)
}
# results
D6 <- data.frame(pct_delinquent, bucket)
# str(D6)
p6 <- ggplot(data = D6, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NetMonthlySurplus buckets")
p6

########################################
#  NumberRealEstateLoansOrLines
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NumberRealEstateLoansOrLines, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NumberRealEstateLoansOrLines <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NumberRealEstateLoansOrLines))
for (i in seq_along(deciles_NumberRealEstateLoansOrLines)){
  pct_delinquent[i] <- 100 * mean(deciles_NumberRealEstateLoansOrLines[[i]]$SeriousDlqin2yrs)
}
# results
D7 <- data.frame(pct_delinquent, bucket)
# str(D7)
p7 <- ggplot(data = D7, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NumberRealEstateLoansOrLines buckets")
p7

########################################
#  NumberOfOpenCreditLinesAndLoans
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NumberOfOpenCreditLinesAndLoans, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NumberOfOpenCreditLinesAndLoans <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NumberOfOpenCreditLinesAndLoans))
for (i in seq_along(deciles_NumberOfOpenCreditLinesAndLoans)){
  pct_delinquent[i] <- 100 * mean(deciles_NumberOfOpenCreditLinesAndLoans[[i]]$SeriousDlqin2yrs)
}
# results
D8 <- data.frame(pct_delinquent, bucket)
# str(D8)
p8 <- ggplot(data = D8, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NumberOfOpenCreditLinesAndLoans buckets")
p8

########################################
#  RevolvingUtilizationOfUnsecuredLines
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$RevolvingUtilizationOfUnsecuredLines, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_RevolvingUtilizationOfUnsecuredLines <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_RevolvingUtilizationOfUnsecuredLines))
for (i in seq_along(deciles_RevolvingUtilizationOfUnsecuredLines)){
  pct_delinquent[i] <- 100 * mean(deciles_RevolvingUtilizationOfUnsecuredLines[[i]]$SeriousDlqin2yrs)
}
# results
D9 <- data.frame(pct_delinquent, bucket)
# str(D9)
p9 <- ggplot(data = D9, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("RevolvingUtilizationOfUnsecuredLines buckets")
p9

########################################
#  NumberOfTime60.89DaysPastDueNotWorse
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NumberOfTime60.89DaysPastDueNotWorse, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NumberOfTime60.89DaysPastDueNotWorse <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NumberOfTime60.89DaysPastDueNotWorse))
for (i in seq_along(deciles_NumberOfTime60.89DaysPastDueNotWorse)){
  pct_delinquent[i] <- 100 * mean(deciles_NumberOfTime60.89DaysPastDueNotWorse[[i]]$SeriousDlqin2yrs)
}
# results
D10 <- data.frame(pct_delinquent, bucket)
# str(D10)
p10 <- ggplot(data = D10, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NumberOfTime60.89DaysPastDueNotWorse buckets")
p10

########################################
#  NumberOfTime30.59DaysPastDueNotWorse
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NumberOfTime30.59DaysPastDueNotWorse, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NumberOfTime30.59DaysPastDueNotWorse <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NumberOfTime30.59DaysPastDueNotWorse))
for (i in seq_along(deciles_NumberOfTime30.59DaysPastDueNotWorse)){
  pct_delinquent[i] <- 100 * mean(deciles_NumberOfTime30.59DaysPastDueNotWorse[[i]]$SeriousDlqin2yrs)
}
# results
D11 <- data.frame(pct_delinquent, bucket)
# str(D11)
p11 <- ggplot(data = D11, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NumberOfTime30.59DaysPastDueNotWorse buckets")
p11

########################################
#  NumberOfTimes90DaysLate
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$NumberOfTimes90DaysLate, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_NumberOfTimes90DaysLate <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_NumberOfTimes90DaysLate))
for (i in seq_along(deciles_NumberOfTimes90DaysLate)){
  pct_delinquent[i] <- 100 * mean(deciles_NumberOfTimes90DaysLate[[i]]$SeriousDlqin2yrs)
}
# results
D12 <- data.frame(pct_delinquent, bucket)
# str(D12)
p12 <- ggplot(data = D12, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("NumberOfTimes90DaysLate buckets")
p12

########################################
#  ConsolidatedNumberOfDaysPastDue
########################################

# generated with gen_feature_bucket_r.py

# split into ten groups based on age
num_buckets <- 10
bucket <- seq(1, num_buckets)
ar <- rank(cs$ConsolidatedNumberOfDaysPastDue, ties.method = "first")
decile <- cut(ar, quantile(ar, probs=0:10/10), include.lowest=TRUE, labels=FALSE)
deciles_ConsolidatedNumberOfDaysPastDue <- split(cs, decile)
pct_delinquent <- vector("numeric", length(deciles_ConsolidatedNumberOfDaysPastDue))
for (i in seq_along(deciles_ConsolidatedNumberOfDaysPastDue)){
  pct_delinquent[i] <- 100 * mean(deciles_ConsolidatedNumberOfDaysPastDue[[i]]$SeriousDlqin2yrs)
}
# results
D13 <- data.frame(pct_delinquent, bucket)
# str(D13)
p13 <- ggplot(data = D13, aes(x = factor(bucket), y = pct_delinquent)) + 
  geom_bar(stat ="identity")  + 
  geom_hline(aes (yintercept = 100 * sample_avg), alpha = 6/10, color = 'red', linetype = 1, lwd = 2) +
  ggtitle("ConsolidatedNumberOfDaysPastDue buckets")
p13

# group charts into one
png("feature_bucketed_discr.png", width=12, height=16, units="in", res=100)
grid.arrange( p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13,  ncol = 3)
dev.off()

## quartz_off_screen 
##                 2

EDA and Data cleanup

David Crook

November 5, 2015

Setup

Start exploring the data set

Demographics

Finances

DebtRatio

MonthlyIncome

Creating additional Finances variables

Deliquencies

Creating additional Deliquencies variables

Existing Credit Utilization

Additional data cleanup

zero age and NAs in MonthlyIncome, NumberOfDependents

Delinquency NAs

Save cleaned version of data

Feature selection

`information.gain()` in `FSelector`

Some feature EDA

EDA and Data cleanup

David Crook

November 5, 2015

Setup

Start exploring the data set

Demographics

Finances

DebtRatio

MonthlyIncome

Creating additional Finances variables

Deliquencies

Creating additional Deliquencies variables

Existing Credit Utilization

Additional data cleanup

zero age and NAs in MonthlyIncome, NumberOfDependents

Delinquency NAs

Save cleaned version of data

Feature selection

information.gain() in FSelector

Some feature EDA

`information.gain()` in `FSelector`