Output: ExtraCredit.html

Extra credit: Kaggle competition entry

Setup

# Seed the random number generator so the model fits below are reproducible
set.seed(142)

Use a Random Forest model to make a prediction set.

Read in training data:

# Load the cleaned training set written out by EDA.Rmd
cs <- read.csv("cs-training-cleaned.csv")

# The outcome comes back as 0/1; re-encode it as a labeled factor so caret
# treats the problem as classification ("ok" = 0, "delinquent" = 1)
cs$SeriousDlqin2yrs <- factor(cs$SeriousDlqin2yrs,
                              levels = c(0, 1),
                              labels = c("ok", "delinquent"))
#str(cs)

build models

Classification Models

Three types of classification model will be built: a classification (CART) tree, a random forest, and a logistic regression.

# set up some tuning and cross-validation parameters that will be used across
# the classifiers
caret_optimized_metric <- 'logLoss'   # equivalent to 1 / 2 of Deviance

# Shared resampling scheme: 5-fold CV repeated twice, scoring mean log-loss.
# classProbs=TRUE is required so caret can compute the probabilities that
# mnLogLoss needs.
caret_train_control <- trainControl(
  classProbs=TRUE,             # compute class probabilities
  summaryFunction=mnLogLoss,   # equivalent to 1 / 2 of Deviance
  method='repeatedcv',         # repeated Cross Validation
  number=5,                    # 5 folds
  repeats=2,                   # 2 repeats
  allowParallel=FALSE)         # run resampling single-threaded

# Predictor columns shared by every model below; includes the engineered
# features (MonthlyExpenses, NetMonthlySurplus, ConsolidatedNumberOfDaysPastDue)
x_vars_features <- c(
  'RevolvingUtilizationOfUnsecuredLines',
  'age',
  'MonthlyIncome',
  'MonthlyExpenses',
  'NetMonthlySurplus',
  'NumberOfOpenCreditLinesAndLoans',
  'NumberRealEstateLoansOrLines',
  'NumberOfDependents',
  'ConsolidatedNumberOfDaysPastDue'
  )

CART

# Grid of complexity-parameter (cp) values to tune over
cp.gridxc <- expand.grid(.cp = seq(0.0001, 0.003, 0.0001))

# Fit the CART model on the whole training set with the constructed features,
# tuning cp by the repeated-CV logLoss defined above
cart_modelxc <- train(
  x=cs[, x_vars_features],
  y=cs$SeriousDlqin2yrs,
  method='rpart',     # CART
  metric=caret_optimized_metric,
  trControl=caret_train_control,
  tuneGrid = cp.gridxc
  )
cart_modelxc
## CART 
## 
## 150000 samples
##      9 predictor
##      2 classes: 'ok', 'delinquent' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 120000, 120000, 120000, 120000, 120000, 119999, ... 
## Resampling results across tuning parameters:
## 
##   cp      logLoss    logLoss SD  
##   0.0001  0.2388388  0.0129715460
##   0.0002  0.2202875  0.0104713701
##   0.0003  0.2119765  0.0047288191
##   0.0004  0.2077843  0.0024919028
##   0.0005  0.2053597  0.0018045256
##   0.0006  0.2050764  0.0019228830
##   0.0007  0.2044124  0.0015318330
##   0.0008  0.2044333  0.0016417188
##   0.0009  0.2044154  0.0016520509
##   0.0010  0.2040628  0.0008840367
##   0.0011  0.2040002  0.0008818325
##   0.0012  0.2039737  0.0008618920
##   0.0013  0.2039565  0.0008346703
##   0.0014  0.2039562  0.0008349199
##   0.0015  0.2040280  0.0010632222
##   0.0016  0.2040280  0.0010632222
##   0.0017  0.2040239  0.0010734946
##   0.0018  0.2040971  0.0010344408
##   0.0019  0.2040901  0.0010296605
##   0.0020  0.2042352  0.0010087886
##   0.0021  0.2042302  0.0010099040
##   0.0022  0.2042302  0.0010099040
##   0.0023  0.2042302  0.0010099040
##   0.0024  0.2042407  0.0010173197
##   0.0025  0.2042768  0.0010170439
##   0.0026  0.2042957  0.0010133730
##   0.0027  0.2043213  0.0010018062
##   0.0028  0.2043213  0.0010018062
##   0.0029  0.2043173  0.0010068727
##   0.0030  0.2043601  0.0009968838
## 
## logLoss was used to select the optimal model using  the smallest value.
## The final value used for the model was cp = 0.0014.
# show the tree for the tuned .cp value
cart_modelxc.best.tree <- cart_modelxc$finalModel
prp(cart_modelxc.best.tree)

Saved copy of tree:

CART Tree xc

Random Forest xc

# number of bagged trees to grow in the forest
B <- 600

# Fit the Random Forest on the full training set with the same predictors,
# cross-validation control, and logLoss metric as the CART model
rf_modelxc <- train(
  x=cs[, x_vars_features],
  y=cs$SeriousDlqin2yrs,
  method='rf',        # Random Forest
  metric=caret_optimized_metric,
  trControl=caret_train_control,
  tuneGrid=NULL,      # fall back to caret's default mtry grid
  ntree=B,            # trees per forest
  nodesize=100,       # floor on terminal-node size: large enough to keep
                      # per-tree variance in check, small enough for
                      # reasonably complex trees
  importance=TRUE,    # record predictor importance
  keep.inbag=TRUE)    # retain the in-bag matrix for later inspection

competition test data for predictions

Read in test data:

# Raw Kaggle test set; the outcome column is present but entirely NA
cs.test <- read.csv("cs-test.csv")
str(cs.test)
## 'data.frame':    101503 obs. of  12 variables:
##  $ X                                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SeriousDlqin2yrs                    : logi  NA NA NA NA NA NA ...
##  $ RevolvingUtilizationOfUnsecuredLines: num  0.8855 0.4633 0.0433 0.2803 1 ...
##  $ age                                 : int  43 57 59 38 27 63 50 79 68 23 ...
##  $ NumberOfTime30.59DaysPastDueNotWorse: int  0 0 0 1 0 0 0 1 0 98 ...
##  $ DebtRatio                           : num  0.1775 0.5272 0.6876 0.926 0.0199 ...
##  $ MonthlyIncome                       : int  5700 9141 5083 3200 3865 4140 0 3301 NA 0 ...
##  $ NumberOfOpenCreditLinesAndLoans     : int  4 15 12 7 4 4 5 8 4 0 ...
##  $ NumberOfTimes90DaysLate             : int  0 0 0 0 0 0 0 0 0 98 ...
##  $ NumberRealEstateLoansOrLines        : int  0 4 1 2 0 0 0 1 1 0 ...
##  $ NumberOfTime60.89DaysPastDueNotWorse: int  0 0 0 0 0 0 0 0 0 98 ...
##  $ NumberOfDependents                  : int  0 2 2 0 1 1 3 1 0 0 ...
# Gets rid of the "X" column (the CSV's row-index column)
cs.test <- cs.test[ , -1]

# SeriousDlqin2yrs is read as an all-NA logical column; convert it to numeric
# so it matches the training data's type
cs.test$SeriousDlqin2yrs <- as.numeric(cs.test$SeriousDlqin2yrs)

# number of test observations (rows expected in the Kaggle submission)
nbt_samples <- nrow(cs.test)
nbt_samples
## [1] 101503
str(cs.test)
## 'data.frame':    101503 obs. of  11 variables:
##  $ SeriousDlqin2yrs                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RevolvingUtilizationOfUnsecuredLines: num  0.8855 0.4633 0.0433 0.2803 1 ...
##  $ age                                 : int  43 57 59 38 27 63 50 79 68 23 ...
##  $ NumberOfTime30.59DaysPastDueNotWorse: int  0 0 0 1 0 0 0 1 0 98 ...
##  $ DebtRatio                           : num  0.1775 0.5272 0.6876 0.926 0.0199 ...
##  $ MonthlyIncome                       : int  5700 9141 5083 3200 3865 4140 0 3301 NA 0 ...
##  $ NumberOfOpenCreditLinesAndLoans     : int  4 15 12 7 4 4 5 8 4 0 ...
##  $ NumberOfTimes90DaysLate             : int  0 0 0 0 0 0 0 0 0 98 ...
##  $ NumberRealEstateLoansOrLines        : int  0 4 1 2 0 0 0 1 1 0 ...
##  $ NumberOfTime60.89DaysPastDueNotWorse: int  0 0 0 0 0 0 0 0 0 98 ...
##  $ NumberOfDependents                  : int  0 2 2 0 1 1 3 1 0 0 ...
# Overview of the test set; MonthlyIncome and NumberOfDependents contain NAs
summary(cs.test)
##  SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines      age        
##  Min.   : NA      Min.   :    0.000                    Min.   : 21.00  
##  1st Qu.: NA      1st Qu.:    0.030                    1st Qu.: 41.00  
##  Median : NA      Median :    0.153                    Median : 52.00  
##  Mean   :NaN      Mean   :    5.310                    Mean   : 52.41  
##  3rd Qu.: NA      3rd Qu.:    0.564                    3rd Qu.: 63.00  
##  Max.   : NA      Max.   :21821.000                    Max.   :104.00  
##  NA's   :101503                                                        
##  NumberOfTime30.59DaysPastDueNotWorse   DebtRatio        
##  Min.   : 0.0000                      Min.   :     0.00  
##  1st Qu.: 0.0000                      1st Qu.:     0.17  
##  Median : 0.0000                      Median :     0.36  
##  Mean   : 0.4538                      Mean   :   344.48  
##  3rd Qu.: 0.0000                      3rd Qu.:     0.85  
##  Max.   :98.0000                      Max.   :268326.00  
##                                                          
##  MonthlyIncome     NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
##  Min.   :      0   Min.   : 0.000                  Min.   : 0.0000        
##  1st Qu.:   3408   1st Qu.: 5.000                  1st Qu.: 0.0000        
##  Median :   5400   Median : 8.000                  Median : 0.0000        
##  Mean   :   6855   Mean   : 8.454                  Mean   : 0.2967        
##  3rd Qu.:   8200   3rd Qu.:11.000                  3rd Qu.: 0.0000        
##  Max.   :7727000   Max.   :85.000                  Max.   :98.0000        
##  NA's   :20103                                                            
##  NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
##  Min.   : 0.000               Min.   : 0.0000                     
##  1st Qu.: 0.000               1st Qu.: 0.0000                     
##  Median : 1.000               Median : 0.0000                     
##  Mean   : 1.013               Mean   : 0.2703                     
##  3rd Qu.: 2.000               3rd Qu.: 0.0000                     
##  Max.   :37.000               Max.   :98.0000                     
##                                                                   
##  NumberOfDependents
##  Min.   : 0.000    
##  1st Qu.: 0.000    
##  Median : 0.000    
##  Mean   : 0.769    
##  3rd Qu.: 1.000    
##  Max.   :43.000    
##  NA's   :2626

clean up the data, and impute missing values

# Derived variable MonthlyExpenses. DebtRatio is relative to MonthlyIncome
# when the income is known; when income is NA the ratio is treated as an
# absolute amount (multiplier of 1), matching the training-set cleaning.
income_known <- !is.na(cs.test$MonthlyIncome)
cs.test$MonthlyExpenses <- ifelse(income_known,
                             cs.test$MonthlyIncome * cs.test$DebtRatio,
                             1 * cs.test$DebtRatio)

# Derived variable NetMonthlySurplus: income minus expenses, with a missing
# income treated as zero
cs.test$NetMonthlySurplus <- ifelse(income_known,
                             cs.test$MonthlyIncome - cs.test$MonthlyExpenses,
                             0 - cs.test$MonthlyExpenses)


# Values >= 96 are sentinel codes in the past-due counters; convert them to
# NA and make the columns numeric
blank_sentinels <- function(x) {
  x <- as.numeric(x)
  x[x >= 96] <- NA
  x
}

cs.test$NumberOfTime30.59DaysPastDueNotWorse <-
  blank_sentinels(cs.test$NumberOfTime30.59DaysPastDueNotWorse)

cs.test$NumberOfTime60.89DaysPastDueNotWorse <-
  blank_sentinels(cs.test$NumberOfTime60.89DaysPastDueNotWorse)

cs.test$NumberOfTimes90DaysLate <-
  blank_sentinels(cs.test$NumberOfTimes90DaysLate)





# Mean-impute the NAs introduced above: compute each column's mean over the
# non-missing entries, then overwrite the missing entries in place
m30 <- mean(cs.test$NumberOfTime30.59DaysPastDueNotWorse, na.rm = TRUE)
m60 <- mean(cs.test$NumberOfTime60.89DaysPastDueNotWorse, na.rm = TRUE)
m90 <- mean(cs.test$NumberOfTimes90DaysLate, na.rm = TRUE)

na30 <- is.na(cs.test$NumberOfTime30.59DaysPastDueNotWorse)
cs.test$NumberOfTime30.59DaysPastDueNotWorse[na30] <- m30

na60 <- is.na(cs.test$NumberOfTime60.89DaysPastDueNotWorse)
cs.test$NumberOfTime60.89DaysPastDueNotWorse[na60] <- m60

na90 <- is.na(cs.test$NumberOfTimes90DaysLate)
cs.test$NumberOfTimes90DaysLate[na90] <- m90

# Consolidate the three past-due counters into one approximate
# days-past-due figure, weighting each count by its bucket's day value
cs.test$ConsolidatedNumberOfDaysPastDue <- with(cs.test,
  NumberOfTime30.59DaysPastDueNotWorse * 30 +
  NumberOfTime60.89DaysPastDueNotWorse * 60 +
  NumberOfTimes90DaysLate * 90)

# summary(cs.test)

run imputation

MonthlyIncome and NumberOfDependents contain NAs

set.seed(144)

# Columns fed to the multiple-imputation model (age listed first, matching
# the run that produced the saved imputed_testxc.Rda)
simplifiedxc <- cs.test[c("age", 
  'RevolvingUtilizationOfUnsecuredLines',
  'MonthlyIncome',
  'MonthlyExpenses',
  'NetMonthlySurplus',
  'NumberOfOpenCreditLinesAndLoans',
  'NumberRealEstateLoansOrLines',
  'NumberOfDependents',
  'ConsolidatedNumberOfDaysPastDue'
  )]

### This takes a long time so save the results
#imputedxc = complete(mice(simplifiedxc))
#save(imputedxc,file="imputed_testxc.Rda")
load("imputed_testxc.Rda")  # load value in R object 'imputedxc'

# copy the imputed columns back into the test set
cs.test$MonthlyIncome <- imputedxc$MonthlyIncome
cs.test$NumberOfDependents <- imputedxc$NumberOfDependents

generate predictions

CART

# CART: predicted class probabilities ("ok", "delinquent") on the test set
cartxc_test_pred_probs <- predict(
  cart_modelxc, newdata=cs.test[, x_vars_features], type='prob')
summary(cartxc_test_pred_probs)
##        ok           delinquent     
##  Min.   :0.3239   Min.   :0.03968  
##  1st Qu.:0.9603   1st Qu.:0.03968  
##  Median :0.9603   Median :0.03968  
##  Mean   :0.9333   Mean   :0.06666  
##  3rd Qu.:0.9603   3rd Qu.:0.03968  
##  Max.   :0.9603   Max.   :0.67613
# Build the Kaggle submission: Id plus P(delinquent), selected by column
# name rather than position for robustness
results <- data.frame(Id = seq_len(nrow(cs.test)),
                      Probability = cartxc_test_pred_probs[, "delinquent"])
write.table(results, "cartEntry.csv", quote=FALSE, row.names=FALSE, sep=",")

Random Forest

# Random Forest: predicted class probabilities on the test set
rfxc_test_pred_probs <- predict(
  rf_modelxc, newdata=cs.test[, x_vars_features], type='prob')
summary(rfxc_test_pred_probs)
##        ok           delinquent      
##  Min.   :0.0050   Min.   :0.000000  
##  1st Qu.:0.9900   1st Qu.:0.000000  
##  Median :0.9983   Median :0.001667  
##  Mean   :0.9690   Mean   :0.030971  
##  3rd Qu.:1.0000   3rd Qu.:0.010000  
##  Max.   :1.0000   Max.   :0.995000
# Build the Kaggle submission: Id plus P(delinquent), selected by column
# name rather than position for robustness
results <- data.frame(Id = seq_len(nrow(cs.test)),
                      Probability = rfxc_test_pred_probs[, "delinquent"])
str(results)
## 'data.frame':    101503 obs. of  2 variables:
##  $ Id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Probability: num  0.005 0.005 0 0.005 0.015 ...
# Write the submission file without quoting or row names
write.table(results, "rfEntry.csv", quote=FALSE, row.names=FALSE, sep=",")

Submit predictions

On the final leaderboard at Kaggle, there were a total of 925 team submissions. The winning entry from team “Perfect Storm” had an AUC of 0.869558.

The CART submission

CART Ranking

Its AUC was scored at 0.702828. It would have ranked at 820.

The Random Forest submission

RF Ranking

Its AUC was scored at 0.794789. It would have ranked at 772. This placement is only better than 16.54% of competitor team entries, or about one-sixth of the entries.

Getting an increase in AUC of 0.074769 would have been required to win the competition.