View output Modeling.html.
# set randomizer's seed
set.seed(142)
Out of the 150,000 samples, the incidence of loan delinquency is 6.68%.
Since the “test” dataset provided in the competition did not include a value for the dependent variable (SeriousDlqin2yrs
), the models created will be tested and validated with a subset of the training data.
First, the competition training data is split into a Training set and a Test set:
# lower the proportion of training to have models build faster. ideally would
# set this higher to ~0.8, but that takes hours to train the random forest model
train_proportion <- .2
train_indices <- createDataPartition(
y=cs$SeriousDlqin2yrs,
p=train_proportion,
list=FALSE)
cs_train <- cs[train_indices, ]
cs_test <- cs[-train_indices, ]
Then the training set is further split from the Training set as a Validation set for the purpose of estimating Out-Of-Sample performance metrics:
valid_proportion_of_train <- 1 / 3
valid_indices <- createDataPartition(
y=cs_train$SeriousDlqin2yrs,
p=valid_proportion_of_train,
list=FALSE)
cs_valid <- cs_train[valid_indices, ]
cs_train <- cs_train[-valid_indices, ]
Just to demonstrate that the data was split representatively by createDataPartition
from caret
: the delinquency incidences in the Training, Validation and Test sets are 6.68%, 6.69% and 6.68% respectively.
Sample name | No. of Observations |
---|---|
cs_train | 20000 |
cs_test | 119999 |
cs_valid | 10001 |
There will be built three types of classification models: a Classification (CART) Tree, a Random Forest, and a Logistic Regression.
# set up some turning and cross-validation parameters that will be used across
# the classifiers
caret_optimized_metric <- 'logLoss' # equivalent to 1 / 2 of Deviance
caret_train_control <- trainControl(
classProbs=TRUE, # compute class probabilities
summaryFunction=mnLogLoss, # equivalent to 1 / 2 of Deviance
method='repeatedcv', # repeated Cross Validation
number=5, # 5 folds
repeats=2, # 2 repeats
allowParallel=FALSE)
# cp values
cp.grid = expand.grid( .cp = seq(0.001, 0.05, 0.001))
cart_model = train(
x=cs_train[, x_vars],
y=cs_train$SeriousDlqin2yrs,
method='rpart', # CART
metric=caret_optimized_metric,
trControl=caret_train_control,
tuneGrid = cp.grid
)
# cart_model
# show the tree for the tuned .cp value
cart_model.best.tree = cart_model$finalModel
prp(cart_model.best.tree)
cart_varImp <- varImp(cart_model)
plot(cart_varImp, main = "Variable importance in CART, 1st Iter.")
B <- 600
# http://topepo.github.io/caret/Random_Forest.html
rf_model <- train(
x=cs_train[, x_vars],
y=cs_train$SeriousDlqin2yrs,
method='rf', # Random Forest
metric=caret_optimized_metric,
ntree=B, # number of trees in the Random Forest
nodesize=100, # minimum node size set small enough to allow for complex trees,
# but not so small as to require too large B to eliminate high variance
importance=TRUE, # evaluate importance of predictors
keep.inbag=TRUE,
trControl=caret_train_control,
tuneGrid=NULL)
rf_varImp <- varImp(rf_model)
plot(rf_varImp, main = "Variable importance in RF, 1st Iter.")
log_reg_model <- train(
x=cs_train[, x_vars],
y=cs_train$SeriousDlqin2yrs,
preProcess=c('center', 'scale'),
method='plr', # Penalized Logistic Regression
metric=caret_optimized_metric,
trControl=caret_train_control,
tuneGrid=expand.grid(
lambda=0, # weight penalty parameter
cp='aic')) # complexity parameter (AIC / BIC)
log_reg_varImp <- varImp(log_reg_model)
plot(log_reg_varImp, main = "Variable importance in LogReg, 1st Iter.")
In order to see which model seems to have the best prediction characteristics, create and chart ROC curves for each model based on the sample test data we kept in reserve for validation.
## quartz_off_screen
## 3
## quartz_off_screen
## 2
# impVars <- row.names(log_reg_varImp$importance)
png("varImp1a.png", width=12, height=4, units="in", res=300)
p1 <- plot(log_reg_varImp, main = "LogReg, 1st Iter.")
p2 <- plot(cart_varImp, main = "CART, 1st Iter.")
p3 <- plot(rf_varImp, main = "RF, 1st Iter.")
grid.arrange(p1, p2, p3, nrow = 1)
dev.off()
## quartz_off_screen
## 2
A sensistivity threshold of 75
% was chosen, subjectively, meaning it is desired of the model parameters to catch 75% of the actual delinquency cases.
log_reg_sensitivity_threshold <- .75
log_reg_i <- min(which(log_reg_oos_performance$sensitivity < log_reg_sensitivity_threshold)) - 1
log_reg_selected_prob_threshold <- prob_thresholds[log_reg_i]
The selected decision threshold is 0.051 – meaning when the logistic regression model is used to predict on new data, it will predict “Delinquent” when the predicted probability exceeds that threshold.
rf_sensitivity_threshold <- .75
rf_i <- min(which(rf_oos_performance$sensitivity < rf_sensitivity_threshold)) - 1
rf_selected_prob_threshold <- prob_thresholds[rf_i]
The selected decision threshold for the Random Forest model is 0.007.
cart_sensitivity_threshold <- .30
cart_i <- min(which(cart_oos_performance$sensitivity < cart_sensitivity_threshold)) - 1
cart_selected_prob_threshold <- prob_thresholds[cart_i]
# cart_selected_prob_threshold
The selected decision threshold is 0.261 – meaning when the CART model is used to predict on new data, it will predict “Delinquent” when the predicted probability exceeds that threshold.
Calculate the performance on the test split of data from the original dataset on the respective model. First a helper function to display Accuracy, Sensitivity, and Specificity is created.
Evaluating the performance of the selected logit model, with a decision threshold at 0.261:
cart_test_pred_probs <- predict(
cart_model, newdata=cs_test[, x_vars], type='prob')
cart_test_performance <- bin_classif_eval(
cart_test_pred_probs$delinquent, cs_test$SeriousDlqin2yrs, thresholds=cart_selected_prob_threshold)
# expected predictive performance
display_accu_sens_spec(cart_oos_performance[cart_i, ])
## [1] "Accuracy:" "0.9211" "Sensitivity:" "0.3004"
## [5] "specificity:" "0.9656"
# tested performance
display_accu_sens_spec(cart_test_performance)
## accuracy sensitivity specificity
## "0.9237" "0.3318" "0.9661"
showCM(cart_test_pred_probs, cart_selected_prob_threshold, "cart")
## [1] "Confusion matrix for cart predictions at prob threshold of 0.261"
## Reference
## Prediction ok delinquent
## ok 108182 5359
## delinquent 3797 2661
The accuracy is not great, but the test split data performance is similar to its expected performance.
Evaluating the performance of the selected logit model, with a decision threshold at 0.051:
log_reg_test_pred_probs <- predict(
log_reg_model, newdata=cs_test[, x_vars], type='prob')
log_reg_test_performance <- bin_classif_eval(
log_reg_test_pred_probs$delinquent, cs_test$SeriousDlqin2yrs, thresholds=log_reg_selected_prob_threshold)
# expected predictive performance
display_accu_sens_spec(log_reg_oos_performance[log_reg_i, ])
## [1] "Accuracy:" "0.6782" "Sensitivity:" "0.7549"
## [5] "specificity:" "0.6727"
# tested performance
display_accu_sens_spec(log_reg_test_performance)
## accuracy sensitivity specificity
## "0.6766" "0.7703" "0.6699"
showCM(log_reg_test_pred_probs, log_reg_selected_prob_threshold, "log_reg")
## [1] "Confusion matrix for log_reg predictions at prob threshold of 0.051"
## Reference
## Prediction ok delinquent
## ok 75016 1842
## delinquent 36963 6178
The accuracy is not great, but the test split data performance is similar to its expected performance.
Evaluating the performance of the selected Random Forest model, with a decision threshold at 0.007:
rf_test_pred_probs <- predict(
rf_model, newdata=cs_test[, x_vars], type='prob')
rf_test_performance <- bin_classif_eval(
rf_test_pred_probs$delinquent, cs_test$SeriousDlqin2yrs, thresholds=rf_selected_prob_threshold)
# expected predicted performance
display_accu_sens_spec(rf_oos_performance[rf_i, ])
## [1] "Accuracy:" "0.7284" "Sensitivity:" "0.7504"
## [5] "specificity:" "0.7269"
# tested performance
display_accu_sens_spec(rf_test_performance)
## accuracy sensitivity specificity
## "0.7326" "0.7465" "0.7316"
showCM(rf_test_pred_probs, rf_selected_prob_threshold, "rf")
## [1] "Confusion matrix for rf predictions at prob threshold of 0.007"
## Reference
## Prediction ok delinquent
## ok 81927 2033
## delinquent 30052 5987
# prediction() needs this to be numeric (not factor SeriousDlqin2yrs)
cs_test_y_numeric <- as.integer(cs_test$SeriousDlqin2yrs)
cs_test_y_numeric <- ifelse(cs_test_y_numeric == 2, 1, 0)
png("cart_gain.png", width=6, height=4.5, units="in", res=100)
cart_rocr <- prediction(cart_test_pred_probs[,2], cs_test_y_numeric)
cart_gain <- performance(cart_rocr, "tpr", "rpp")
p1 <- plot(cart_gain, colorize=TRUE, main = "Gain chart for CART 1st Iter") + abline(a=0,b=1,lty=2,col=8)
dev.off()
## quartz_off_screen
## 2
png("cart_lift.png", width=6, height=4.5, units="in", res=100)
cart_lift <- performance(cart_rocr, "lift", "rpp")
plot(cart_lift, main="Lift curve for CART 1st Iter", colorize=T)
dev.off()
## quartz_off_screen
## 2
png("rf_gain.png", width=6, height=4.5, units="in", res=100)
rf_rocr <- prediction(rf_test_pred_probs[,2], cs_test_y_numeric)
rf_gain <- performance(rf_rocr, "tpr", "rpp")
p2 <- plot(rf_gain, colorize=TRUE, main = "Gain chart for RF 1st Iter") + abline(a=0,b=1,lty=2,col=8)
dev.off()
## quartz_off_screen
## 2
png("rf_lift.png", width=6, height=4.5, units="in", res=100)
rf_lift <- performance(rf_rocr, "lift", "rpp")
plot(rf_lift, main="Lift curve for RF 1st Iter", colorize=T)
dev.off()
## quartz_off_screen
## 2
png("logreg_gain.png", width=6, height=4.5, units="in", res=100)
log_reg_rocr <- prediction(log_reg_test_pred_probs[,2], cs_test_y_numeric)
log_reg_gain <- performance(log_reg_rocr, "tpr", "rpp")
p3 <- plot(log_reg_gain, colorize=TRUE, main = "Gain chart for LogReg 1st Iter") + abline(a=0,b=1,lty=2,col=8)
dev.off()
## quartz_off_screen
## 2
png("logreg_lift.png", width=6, height=4.5, units="in", res=100)
log_reg_lift <- performance(log_reg_rocr, "lift", "rpp")
plot(rf_lift, main="Lift curve for LogReg 1st Iter", colorize=T)
dev.off()
## quartz_off_screen
## 2
cart_auc <- performance(cart_rocr, "auc")@y.values
cart_auc
## [[1]]
## [1] 0.6503274
rf_auc <- performance(rf_rocr, "auc")@y.values
rf_auc
## [[1]]
## [1] 0.8132474
log_reg_auc <- performance(log_reg_rocr, "auc")@y.values
log_reg_auc
## [[1]]
## [1] 0.8102036
The Random Forest model has a higher accuracy at the chosen sensitivity threshold. The performance comparison on the test slice is in the expected performance ballpark, within 1% or so.
So the models seem to be able to predict reasonably for “new” data and are not overtrained.
Using some constructed feature variables, the classifier models are rebuilt.
# Remove the non-linear DebtRatio and add MonthlyExpenses and NetMonthlySurplus.
# Replace the three deliquency variables with the single
# ConsolidatedNumberOfDaysPastDue
x_vars_features = c(
'RevolvingUtilizationOfUnsecuredLines',
'age',
# 'NumberOfTime30.59DaysPastDueNotWorse',
# 'NumberOfTime60.89DaysPastDueNotWorse',
# 'NumberOfTimes90DaysLate',
'MonthlyIncome',
'MonthlyExpenses',
'NetMonthlySurplus',
'NumberOfOpenCreditLinesAndLoans',
'NumberRealEstateLoansOrLines',
'NumberOfDependents',
'ConsolidatedNumberOfDaysPastDue'
)
# cp values
### cp.grid = expand.grid( .cp = seq(0.001, 0.010, 0.001))
# changed to a fixed value since randomness was changing this to a worse
# predicting model
cp.grid = expand.grid( .cp = c(0.003))
cart_model2 = train(
x=cs_train[, x_vars_features],
y=cs_train$SeriousDlqin2yrs,
method='rpart', # CART
metric=caret_optimized_metric,
trControl=caret_train_control,
tuneGrid = cp.grid
)
cart_model2
## CART
##
## 20000 samples
## 9 predictor
## 2 classes: 'ok', 'delinquent'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 16000, 15999, 16000, 16000, 16001, 16000, ...
## Resampling results
##
## logLoss logLoss SD
## 0.2056158 0.006907112
##
## Tuning parameter 'cp' was held constant at a value of 0.003
##
#str(cart_model2)
# show the tree for the tuned .cp value
cart_model2.best.tree = cart_model2$finalModel
prp(cart_model2.best.tree)
cart2_varImp <- varImp(cart_model2)
plot(cart2_varImp, main = "Variable importance in CART, 2nd Iter.")
B <- 600
# http://topepo.github.io/caret/Random_Forest.html
rf_model2 <- train(
x=cs_train[, x_vars_features],
y=cs_train$SeriousDlqin2yrs,
method='rf', # Random Forest
metric=caret_optimized_metric,
ntree=B, # number of trees in the Random Forest
nodesize=100, # minimum node size set small enough to allow for complex trees,
# but not so small as to require too large B to eliminate high variance
importance=TRUE, # evaluate importance of predictors
keep.inbag=TRUE,
trControl=caret_train_control,
tuneGrid=NULL)
rf2_varImp <- varImp(rf_model2)
plot(rf2_varImp, main = "Variable importance in RF, 2nd Iter.")
log_reg_model2 <- train(
x=cs_train[, x_vars_features],
y=cs_train$SeriousDlqin2yrs,
preProcess=c('center', 'scale'),
method='plr', # Penalized Logistic Regression
metric=caret_optimized_metric,
trControl=caret_train_control,
tuneGrid=expand.grid(
lambda=0, # weight penalty parameter
cp='aic')) # complexity parameter (AIC / BIC)
log_reg2_varImp <- varImp(log_reg_model2)
plot(log_reg2_varImp, main = "Variable importance in LogReg, 2nd Iter.")
# impVars <- row.names(log_reg2_varImp$importance)
png("varImp2a.png", width=12, height=4, units="in", res=300)
p1 <- plot(log_reg2_varImp, main = "LogReg, 2nd Iter.")
p2 <- plot(cart2_varImp, main = "CART, 2nd Iter.")
p3 <- plot(rf2_varImp, main = "RF, 2nd Iter.")
grid.arrange(p1, p2, p3, nrow = 1)
dev.off()
## quartz_off_screen
## 2
In order to see which model seems to have the best prediction characteristics, create and chart ROC curves for each model based on the sample test data we kept in reserve for validation.
## quartz_off_screen
## 3
## quartz_off_screen
## 2
The CART model does significantly better this time using the engineered features than it did with the original variables, even matching or exceeding the Random Forest at higher specificity. The logisitic regression eventual catches up to the CART model at higher sensitivity. Again, however, the Random Forest model has the highest AUC among the three models.
Again, a sensistivity threshold of 75
% is chosen.
The selected decision threshold is 0.053.
The selected decision threshold for the Random Forest model is 0.007.
The selected decision threshold is 0.088.
Evaluating the performance of the selected logit model, with a decision threshold at 0.053:
log_reg2_test_pred_probs <- predict(
log_reg_model2, newdata=cs_test[, x_vars_features], type='prob')
log_reg2_test_performance <- bin_classif_eval(
log_reg2_test_pred_probs$delinquent, cs_test$SeriousDlqin2yrs, thresholds=log_reg2_selected_prob_threshold)
# expected predictive performance
display_accu_sens_spec(log_reg2_oos_performance[log_reg2_i, ])
## [1] "Accuracy:" "0.6674" "Sensitivity:" "0.7549"
## [5] "specificity:" "0.6612"
# tested performance
display_accu_sens_spec(log_reg2_test_performance)
## accuracy sensitivity specificity
## "0.6651" "0.7731" "0.6574"
showCM(log_reg2_test_pred_probs, log_reg2_selected_prob_threshold, "log_reg2")
## [1] "Confusion matrix for log_reg2 predictions at prob threshold of 0.053"
## Reference
## Prediction ok delinquent
## ok 73612 1820
## delinquent 38367 6200
The accuracy degrades slightly over the first iteration model, and the test split data performance is similar to its expected performance.
Evaluating the performance of the selected Random Forest model, with a decision threshold at 0.007:
rf2_test_pred_probs <- predict(
rf_model2, newdata=cs_test[, x_vars_features], type='prob')
rf2_test_performance <- bin_classif_eval(
rf2_test_pred_probs$delinquent, cs_test$SeriousDlqin2yrs, thresholds=rf2_selected_prob_threshold)
# expected predicted performance
display_accu_sens_spec(rf2_oos_performance[rf2_i, ])
## [1] "Accuracy:" "0.7318" "Sensitivity:" "0.7758"
## [5] "specificity:" "0.7287"
# tested performance
display_accu_sens_spec(rf2_test_performance)
## accuracy sensitivity specificity
## "0.7355" "0.7683" "0.7331"
showCM(rf2_test_pred_probs, rf2_selected_prob_threshold, "rf2")
## [1] "Confusion matrix for rf2 predictions at prob threshold of 0.007"
## Reference
## Prediction ok delinquent
## ok 82096 1858
## delinquent 29883 6162
The Random Forest model has a higher accuracy at the chosen sensitivity threshold. The performance comparison on the test slice is in the expected performance ballpark, within 1% or so.
Evaluating the performance of the selected CART model, with a decision threshold at 0.088:
cart2_test_pred_probs <- predict(
cart_model2, newdata=cs_test[, x_vars_features], type='prob')
cart2_test_performance <- bin_classif_eval(
cart2_test_pred_probs$delinquent, cs_test$SeriousDlqin2yrs, thresholds=cart2_selected_prob_threshold)
# expected predicted performance
display_accu_sens_spec(cart2_oos_performance[cart2_i, ])
## [1] "Accuracy:" "0.7532" "Sensitivity:" "0.7728"
## [5] "specificity:" "0.7518"
# tested performance
display_accu_sens_spec(cart2_test_performance)
## accuracy sensitivity specificity
## "0.7543" "0.7527" "0.7544"
showCM(cart2_test_pred_probs, cart2_selected_prob_threshold, "cart2")
## [1] "Confusion matrix for cart2 predictions at prob threshold of 0.088"
## Reference
## Prediction ok delinquent
## ok 84473 1983
## delinquent 27506 6037
The CART model is in the running this time. The prediction performance comparison on the test slice is reasonable.
# prediction() needs this to be numeric (not factor SeriousDlqin2yrs)
cs_test_y_numeric <- as.integer(cs_test$SeriousDlqin2yrs)
cs_test_y_numeric <- ifelse(cs_test_y_numeric == 2, 1, 0)
png("cart2_gain.png", width=6, height=4.5, units="in", res=100)
cart2_rocr <- prediction(cart2_test_pred_probs[,2], cs_test_y_numeric)
cart2_gain <- performance(cart2_rocr, "tpr", "rpp")
p1 <- plot(cart2_gain, colorize=TRUE, main = "Gain chart for CART 2nd Iter") + abline(a=0,b=1,lty=2,col=8)
dev.off()
## quartz_off_screen
## 2
png("cart2_lift.png", width=6, height=4.5, units="in", res=100)
cart2_lift <- performance(cart2_rocr, "lift", "rpp")
plot(cart2_lift, main="Lift curve for CART 2nd Iter", colorize=T)
dev.off()
## quartz_off_screen
## 2
png("rf2_gain.png", width=6, height=4.5, units="in", res=100)
rf2_rocr <- prediction(rf2_test_pred_probs[,2], cs_test_y_numeric)
rf2_gain <- performance(rf2_rocr, "tpr", "rpp")
p2 <- plot(rf2_gain, colorize=TRUE, main = "Gain chart for RF 2nd Iter") + abline(a=0,b=1,lty=2,col=8)
dev.off()
## quartz_off_screen
## 2
png("rf2_lift.png", width=6, height=4.5, units="in", res=100)
rf2_lift <- performance(rf2_rocr, "lift", "rpp")
plot(rf2_lift, main="Lift curve for RF 2nd Iter", colorize=T)
dev.off()
## quartz_off_screen
## 2
png("logreg2_gain.png", width=6, height=4.5, units="in", res=100)
log_reg2_rocr <- prediction(log_reg2_test_pred_probs[,2], cs_test_y_numeric)
log_reg2_gain <- performance(log_reg2_rocr, "tpr", "rpp")
p3 <- plot(log_reg2_gain, colorize=TRUE, main = "Gain chart for LogReg 2nd Iter") + abline(a=0,b=1,lty=2,col=8)
dev.off()
## quartz_off_screen
## 2
png("logreg2_lift.png", width=6, height=4.5, units="in", res=100)
log_reg2_lift <- performance(log_reg2_rocr, "lift", "rpp")
plot(log_reg2_lift, main="Lift curve for LogReg 2nd Iter", colorize=T)
dev.off()
## quartz_off_screen
## 2
cart2_auc <- performance(cart2_rocr, "auc")@y.values
cart2_auc
## [[1]]
## [1] 0.7953715
rf2_auc <- performance(rf2_rocr, "auc")@y.values
rf2_auc
## [[1]]
## [1] 0.8263005
log_reg2_auc <- performance(log_reg2_rocr, "auc")@y.values
log_reg2_auc
## [[1]]
## [1] 0.809895