Output: ExtraCredit.html
Setup
# Set the random seed so the resampling and model fits are reproducible.
set.seed(142)
Use a Random Forest model to make a prediction set.
Read in training data:
# Read in the cleaned training data saved by EDA.Rmd.
cs <- read.csv("cs-training-cleaned.csv")
# Restore the response to a factor: 0 -> "ok", 1 -> "delinquent".
# caret's classProbs option needs syntactically valid R names as factor
# levels, hence text labels instead of the raw 0/1 codes.
cs$SeriousDlqin2yrs <- factor(cs$SeriousDlqin2yrs,
levels = c(0, 1), labels = c("ok", "delinquent"))
#str(cs)
Three types of classification models will be built: a Classification Tree (CART), a Random Forest, and a Logistic Regression.
# Set up the tuning and cross-validation parameters that will be shared
# across all of the classifiers.
caret_optimized_metric <- 'logLoss' # equivalent to 1 / 2 of Deviance
caret_train_control <- trainControl(
classProbs=TRUE, # compute class probabilities
summaryFunction=mnLogLoss, # equivalent to 1 / 2 of Deviance
method='repeatedcv', # repeated Cross Validation
number=5, # 5 folds
repeats=2, # 2 repeats
allowParallel=FALSE)
# Predictor columns (original plus engineered features) used by every model.
# Note: uses `<-` for top-level assignment per the tidyverse style guide.
x_vars_features <- c(
  'RevolvingUtilizationOfUnsecuredLines',
  'age',
  'MonthlyIncome',
  'MonthlyExpenses',
  'NetMonthlySurplus',
  'NumberOfOpenCreditLinesAndLoans',
  'NumberRealEstateLoansOrLines',
  'NumberOfDependents',
  'ConsolidatedNumberOfDaysPastDue'
)
# Grid of complexity-parameter (cp) values to tune the CART model over.
cp.gridxc <- expand.grid(.cp = seq(0.0001, 0.003, 0.0001))
# Fit a CART model on the full training set using the engineered
# features, tuning cp over cp.gridxc with repeated cross-validation.
cart_modelxc = train(
x=cs[, x_vars_features],
y=cs$SeriousDlqin2yrs,
method='rpart', # CART
metric=caret_optimized_metric,
trControl=caret_train_control,
tuneGrid = cp.gridxc
)
# Print the resampled logLoss for each candidate cp value.
cart_modelxc
## CART
##
## 150000 samples
## 9 predictor
## 2 classes: 'ok', 'delinquent'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 120000, 120000, 120000, 120000, 120000, 119999, ...
## Resampling results across tuning parameters:
##
## cp logLoss logLoss SD
## 0.0001 0.2388388 0.0129715460
## 0.0002 0.2202875 0.0104713701
## 0.0003 0.2119765 0.0047288191
## 0.0004 0.2077843 0.0024919028
## 0.0005 0.2053597 0.0018045256
## 0.0006 0.2050764 0.0019228830
## 0.0007 0.2044124 0.0015318330
## 0.0008 0.2044333 0.0016417188
## 0.0009 0.2044154 0.0016520509
## 0.0010 0.2040628 0.0008840367
## 0.0011 0.2040002 0.0008818325
## 0.0012 0.2039737 0.0008618920
## 0.0013 0.2039565 0.0008346703
## 0.0014 0.2039562 0.0008349199
## 0.0015 0.2040280 0.0010632222
## 0.0016 0.2040280 0.0010632222
## 0.0017 0.2040239 0.0010734946
## 0.0018 0.2040971 0.0010344408
## 0.0019 0.2040901 0.0010296605
## 0.0020 0.2042352 0.0010087886
## 0.0021 0.2042302 0.0010099040
## 0.0022 0.2042302 0.0010099040
## 0.0023 0.2042302 0.0010099040
## 0.0024 0.2042407 0.0010173197
## 0.0025 0.2042768 0.0010170439
## 0.0026 0.2042957 0.0010133730
## 0.0027 0.2043213 0.0010018062
## 0.0028 0.2043213 0.0010018062
## 0.0029 0.2043173 0.0010068727
## 0.0030 0.2043601 0.0009968838
##
## logLoss was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.0014.
# Extract the final tree for the tuned .cp value and plot it with
# rpart.plot::prp.  (Uses `<-` instead of `=` for assignment.)
cart_modelxc.best.tree <- cart_modelxc$finalModel
prp(cart_modelxc.best.tree)
Saved copy of tree:
# Number of trees in the Random Forest.
B <- 600
# Fit the Random Forest on the full training set with the same features
# and cross-validation setup as the CART model.
rf_modelxc <- train(
x=cs[, x_vars_features],
y=cs$SeriousDlqin2yrs,
method='rf', # Random Forest
metric=caret_optimized_metric,
ntree=B, # number of trees in the Random Forest
nodesize=100, # minimum node size set small enough to allow for complex trees,
# but not so small as to require too large B to eliminate high variance
importance=TRUE, # evaluate importance of predictors
keep.inbag=TRUE,
trControl=caret_train_control,
tuneGrid=NULL)
Read in test data:
# Read in the raw Kaggle test data and inspect its structure.
cs.test <- read.csv("cs-test.csv")
str(cs.test)
## 'data.frame': 101503 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SeriousDlqin2yrs : logi NA NA NA NA NA NA ...
## $ RevolvingUtilizationOfUnsecuredLines: num 0.8855 0.4633 0.0433 0.2803 1 ...
## $ age : int 43 57 59 38 27 63 50 79 68 23 ...
## $ NumberOfTime30.59DaysPastDueNotWorse: int 0 0 0 1 0 0 0 1 0 98 ...
## $ DebtRatio : num 0.1775 0.5272 0.6876 0.926 0.0199 ...
## $ MonthlyIncome : int 5700 9141 5083 3200 3865 4140 0 3301 NA 0 ...
## $ NumberOfOpenCreditLinesAndLoans : int 4 15 12 7 4 4 5 8 4 0 ...
## $ NumberOfTimes90DaysLate : int 0 0 0 0 0 0 0 0 0 98 ...
## $ NumberRealEstateLoansOrLines : int 0 4 1 2 0 0 0 1 1 0 ...
## $ NumberOfTime60.89DaysPastDueNotWorse: int 0 0 0 0 0 0 0 0 0 98 ...
## $ NumberOfDependents : int 0 2 2 0 1 1 3 1 0 0 ...
# Drop the "X" row-index column.  (The original wrapped this in
# rbind(cs.test), which is a no-op on a single data frame; subset the
# data frame directly instead.)
cs.test <- cs.test[, -1]
# SeriousDlqin2yrs is an all-NA logical column in the test data;
# convert it to numeric to match the training data's type.
cs.test$SeriousDlqin2yrs <- as.numeric(cs.test$SeriousDlqin2yrs)
# Number of test observations.
nbt_samples <- nrow(cs.test)
nbt_samples
## [1] 101503
str(cs.test)
## 'data.frame': 101503 obs. of 11 variables:
## $ SeriousDlqin2yrs : num NA NA NA NA NA NA NA NA NA NA ...
## $ RevolvingUtilizationOfUnsecuredLines: num 0.8855 0.4633 0.0433 0.2803 1 ...
## $ age : int 43 57 59 38 27 63 50 79 68 23 ...
## $ NumberOfTime30.59DaysPastDueNotWorse: int 0 0 0 1 0 0 0 1 0 98 ...
## $ DebtRatio : num 0.1775 0.5272 0.6876 0.926 0.0199 ...
## $ MonthlyIncome : int 5700 9141 5083 3200 3865 4140 0 3301 NA 0 ...
## $ NumberOfOpenCreditLinesAndLoans : int 4 15 12 7 4 4 5 8 4 0 ...
## $ NumberOfTimes90DaysLate : int 0 0 0 0 0 0 0 0 0 98 ...
## $ NumberRealEstateLoansOrLines : int 0 4 1 2 0 0 0 1 1 0 ...
## $ NumberOfTime60.89DaysPastDueNotWorse: int 0 0 0 0 0 0 0 0 0 98 ...
## $ NumberOfDependents : int 0 2 2 0 1 1 3 1 0 0 ...
summary(cs.test)
## SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age
## Min. : NA Min. : 0.000 Min. : 21.00
## 1st Qu.: NA 1st Qu.: 0.030 1st Qu.: 41.00
## Median : NA Median : 0.153 Median : 52.00
## Mean :NaN Mean : 5.310 Mean : 52.41
## 3rd Qu.: NA 3rd Qu.: 0.564 3rd Qu.: 63.00
## Max. : NA Max. :21821.000 Max. :104.00
## NA's :101503
## NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.17
## Median : 0.0000 Median : 0.36
## Mean : 0.4538 Mean : 344.48
## 3rd Qu.: 0.0000 3rd Qu.: 0.85
## Max. :98.0000 Max. :268326.00
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 3408 1st Qu.: 5.000 1st Qu.: 0.0000
## Median : 5400 Median : 8.000 Median : 0.0000
## Mean : 6855 Mean : 8.454 Mean : 0.2967
## 3rd Qu.: 8200 3rd Qu.:11.000 3rd Qu.: 0.0000
## Max. :7727000 Max. :85.000 Max. :98.0000
## NA's :20103
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 1.013 Mean : 0.2703
## 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :37.000 Max. :98.0000
##
## NumberOfDependents
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 0.769
## 3rd Qu.: 1.000
## Max. :43.000
## NA's :2626
Clean up the data and impute missing values.
# Create a derived variable MonthlyExpenses.  When MonthlyIncome is NA,
# DebtRatio is treated as an absolute expense figure (income taken as 1);
# otherwise expenses = income * DebtRatio.
cs.test$MonthlyExpenses <- ifelse(is.na(cs.test$MonthlyIncome),
                                  1 * cs.test$DebtRatio,
                                  cs.test$MonthlyIncome * cs.test$DebtRatio)
# Create another derived variable, NetMonthlySurplus = income - expenses
# (income treated as 0 when missing).
cs.test$NetMonthlySurplus <- ifelse(is.na(cs.test$MonthlyIncome),
                                    0 - cs.test$MonthlyExpenses,
                                    cs.test$MonthlyIncome - cs.test$MonthlyExpenses)
# The past-due counts use 96/98 as sentinel codes; recode values >= 96
# to NA and make sure the column is numeric.
recode_sentinel_na <- function(v) as.numeric(ifelse(v >= 96, NA, v))
# Replace NAs with the mean of the observed values in the same column.
impute_with_mean <- function(v) ifelse(is.na(v), mean(v, na.rm = TRUE), v)
# Apply sentinel recoding followed by mean imputation to each of the
# three past-due columns (this replaces three hand-copied blocks).
past_due_cols <- c("NumberOfTime30.59DaysPastDueNotWorse",
                   "NumberOfTime60.89DaysPastDueNotWorse",
                   "NumberOfTimes90DaysLate")
for (col in past_due_cols) {
  cs.test[[col]] <- impute_with_mean(recode_sentinel_na(cs.test[[col]]))
}
# Consolidate the three past-due counts into an approximate total number
# of days past due (30-, 60- and 90-day buckets weighted accordingly).
cs.test$ConsolidatedNumberOfDaysPastDue <-
  cs.test$NumberOfTime30.59DaysPastDueNotWorse * 30 +
  cs.test$NumberOfTime60.89DaysPastDueNotWorse * 60 +
  cs.test$NumberOfTimes90DaysLate * 90
# summary(cs.test)
`MonthlyIncome` and `NumberOfDependents` contain NAs.
# Seed the imputation step separately for reproducibility.
set.seed(144)
# Multiple imputation: restrict to the model features so mice() only
# works on the columns the classifiers need.  (Uses `<-` and consistent
# double-quoted names.)
simplifiedxc <- cs.test[c("age",
                          "RevolvingUtilizationOfUnsecuredLines",
                          "MonthlyIncome",
                          "MonthlyExpenses",
                          "NetMonthlySurplus",
                          "NumberOfOpenCreditLinesAndLoans",
                          "NumberRealEstateLoansOrLines",
                          "NumberOfDependents",
                          "ConsolidatedNumberOfDaysPastDue")]
### mice() takes a long time, so the imputation was run once and the
### result cached on disk:
#imputedxc = complete(mice(simplifiedxc))
#save(imputedxc,file="imputed_testxc.Rda")
load("imputed_testxc.Rda") # load value in R object 'imputedxc'
# Copy the imputed columns back into the test data.
cs.test$MonthlyIncome <- imputedxc$MonthlyIncome
cs.test$NumberOfDependents <- imputedxc$NumberOfDependents
CART
# Predict class probabilities on the test set with the tuned CART model;
# columns are "ok" and "delinquent".
cartxc_test_pred_probs <- predict(
cart_modelxc, newdata=cs.test[, x_vars_features], type='prob')
summary(cartxc_test_pred_probs)
## ok delinquent
## Min. :0.3239 Min. :0.03968
## 1st Qu.:0.9603 1st Qu.:0.03968
## Median :0.9603 Median :0.03968
## Mean :0.9333 Mean :0.06666
## 3rd Qu.:0.9603 3rd Qu.:0.03968
## Max. :0.9603 Max. :0.67613
# Build the Kaggle submission: row Id plus P(delinquent) (column 2 of
# the probability matrix).  seq_len() replaces 1:nrow(), and the
# reassignable T/F shorthands are spelled out as TRUE/FALSE.
results <- data.frame(Id = seq_len(nrow(cs.test)), Probability = cartxc_test_pred_probs[, 2])
write.table(results, "cartEntry.csv", quote=FALSE, row.names=FALSE, sep=",")
Random Forest
# Predict class probabilities on the test set with the Random Forest;
# columns are "ok" and "delinquent".
rfxc_test_pred_probs <- predict(
rf_modelxc, newdata=cs.test[, x_vars_features], type='prob')
summary(rfxc_test_pred_probs)
## ok delinquent
## Min. :0.0050 Min. :0.000000
## 1st Qu.:0.9900 1st Qu.:0.000000
## Median :0.9983 Median :0.001667
## Mean :0.9690 Mean :0.030971
## 3rd Qu.:1.0000 3rd Qu.:0.010000
## Max. :1.0000 Max. :0.995000
# Build the Kaggle submission: row Id plus P(delinquent).  seq_len()
# replaces 1:nrow(), and T/F are spelled out as TRUE/FALSE.
results <- data.frame(Id = seq_len(nrow(cs.test)), Probability = rfxc_test_pred_probs[, 2])
str(results)
## 'data.frame': 101503 obs. of 2 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Probability: num 0.005 0.005 0 0.005 0.015 ...
write.table(results, "rfEntry.csv", quote=FALSE, row.names=FALSE, sep=",")
On the final leaderboard at Kaggle, there were a total of 925 team submissions. The winning entry, from team “Perfect Storm”, had an AUC of 0.869558.
Its AUC was scored at 0.702828. It would have ranked at 820.
Its AUC was scored at 0.794789. It would have ranked at 772. This placement is only better than 16.54% of competitor team entries, or about one-sixth of the entries.
Getting an increase in AUC of 0.074769 would have been required to win the competition.