Ref:
The caret package (short for Classification And REgression Training) is a set of functions that attempt to streamline the process for creating predictive models. The package contains tools for: data splitting pre-processing feature selection model tuning using resampling variable importance estimation as well as other functionality. ref
# to install package with all dependencies:
# install.packages("caret", dependencies = c("Depends", "Suggests"))
library(caret)
The next functions help prepare the dataset for learning.
# calls caret::preProcess
caret_preprocess <- function(df, ...) {
predict(preProcess(df, ...), df)
}
# pre: to perform classification
# pre: assumes dependent variable is the last column
# convert factors with a one hot encoding (make dummy vars with 0/1s)
# “fullrank=T” create only (n-1) columns for a factor with n different levels
caret_oneHotEncoding <- function(df) {
dmy <- dummyVars(" ~ .", data=df, fullRank=TRUE)
res <- data.frame(predict(dmy, newdata = df))
res[,ncol(res)] <- as.factor(res[,ncol(res)]) # make dependent variable a factor
names(res)[ncol(res)] <- names(df)[ncol(df)] # and keep its original name
res
}
# pre: assumes dependent variable is the last column
caret_featureSelection <- function(train, test, n.pred) {
if (missing(n.pred)) n.pred=ncol(train)-1 # use all as predictors
control <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 3,
verbose = FALSE)
dependent.col <- ncol(train)
predictors <- names(train)[-dependent.col]
rfe.profile <- rfe(train[,predictors], train[,dependent.col], rfeControl = control)
rfe.profile$variables$var[1:n.pred] # return the best predictors
}
Now we are ready to read and preprocess the dataset:
library(dplyr)
raw.data <- read.csv("train_u6lujuX_CVtuZ9i.csv", stringsAsFactors = T)
glimpse(raw.data)
## Observations: 614
## Variables: 13
## $ Loan_ID <fctr> LP001002, LP001003, LP001005, LP001006, LP0...
## $ Gender <fctr> Male, Male, Male, Male, Male, Male, Male, M...
## $ Married <fctr> No, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, ...
## $ Dependents <fctr> 0, 1, 0, 0, 0, 2, 0, 3+, 2, 1, 2, 2, 2, 0, ...
## $ Education <fctr> Graduate, Graduate, Graduate, Not Graduate,...
## $ Self_Employed <fctr> No, No, Yes, No, No, Yes, No, No, No, No, N...
## $ ApplicantIncome <int> 5849, 4583, 3000, 2583, 6000, 5417, 2333, 30...
## $ CoapplicantIncome <dbl> 0, 1508, 0, 2358, 0, 4196, 1516, 2504, 1526,...
## $ LoanAmount <int> NA, 128, 66, 120, 141, 267, 95, 158, 168, 34...
## $ Loan_Amount_Term <int> 360, 360, 360, 360, 360, 360, 360, 360, 360,...
## $ Credit_History <int> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,...
## $ Property_Area <fctr> Urban, Rural, Urban, Urban, Urban, Urban, U...
## $ Loan_Status <fctr> Y, N, Y, Y, Y, Y, Y, N, Y, N, Y, Y, Y, N, Y...
dependentVar <- names(raw.data)[ncol(raw.data)] # must be the last column
# process data
raw.data %>%
dplyr::select(-Loan_ID) %>% # assumption: irrelevant data
caret_preprocess(method = c("knnImpute", "center", "scale")) %>%
caret_oneHotEncoding -> my.data
glimpse(my.data)
## Observations: 614
## Variables: 19
## $ Gender.Female <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Gender.Male <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Married.No <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Married.Yes <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Dependents.0 <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Dependents.1 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ Dependents.2 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,...
## $ Dependents.3. <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ Education.Not.Graduate <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Self_Employed.No <dbl> 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,...
## $ Self_Employed.Yes <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ ApplicantIncome <dbl> 0.072931360, -0.134302453, -0.39342656...
## $ CoapplicantIncome <dbl> -0.55403561, -0.03870000, -0.55403561,...
## $ LoanAmount <dbl> 0.01621546, -0.21512721, -0.93953353, ...
## $ Loan_Amount_Term <dbl> 0.276411, 0.276411, 0.276411, 0.276411...
## $ Credit_History <dbl> 0.4324768, 0.4324768, 0.4324768, 0.432...
## $ Property_Area.Semiurban <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,...
## $ Property_Area.Urban <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,...
## $ Loan_Status <fctr> 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1...
Next we partition the dataset into training and testing sets:
set.seed(123)
inTrain <- createDataPartition(my.data[,ncol(my.data)], p=0.75, list=FALSE) # train gets 75%
train <- my.data[ inTrain,]
test <- my.data[-inTrain,]
We can optionally perform feature selection:
predictors <- caret_featureSelection(train, test, n.pred=5)
predictors
## [1] "Credit_History" "Loan_Amount_Term"
## [3] "ApplicantIncome" "CoapplicantIncome"
## [5] "Property_Area.Semiurban"
And can choose some of the many models available on caret to perform classification:
model_gbm <- train(train[,predictors], train[,dependentVar], method='gbm' , verbose=F) # Boosted Regression
model_rf <- train(train[,predictors], train[,dependentVar], method='rf' , verbose=F) # Random Forest
model_nnet <- train(train[,predictors], train[,dependentVar], method='nnet', verbose=F) # Neural Nets
model_svm <- train(train[,predictors], train[,dependentVar], method='svmRadial' , verbose=F) # Support Vector Machines
We can check variable importance:
par(mfrow=c(2,2))
plot(varImp(object=model_gbm), main="GBM - Variable Importance")
plot(varImp(object=model_rf), main="RF - Variable Importance")
plot(varImp(object=model_nnet), main="GBM - Variable Importance")
plot(varImp(object=model_svm), main="SVM - Variable Importance")
par(mfrow=c(1,1))
and make predictions:
library(pROC)
predictions <- predict.train(object=model_gbm, test[,predictors], type="raw")
confusionMatrix(predictions, test[,dependentVar])$table
## Reference
## Prediction 0 1
## 0 23 1
## 1 25 104
roc(as.integer(test[,dependentVar]), as.integer(predictions))$auc
## Area under the curve: 0.7348
Caret also allows for parameter tuning:
modelLookup(model='gbm') # which parameters can be tuned?
## model parameter label forReg forClass
## 1 gbm n.trees # Boosting Iterations TRUE TRUE
## 2 gbm interaction.depth Max Tree Depth TRUE TRUE
## 3 gbm shrinkage Shrinkage TRUE TRUE
## 4 gbm n.minobsinnode Min. Terminal Node Size TRUE TRUE
## probModel
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
# making a grid of values
grid <- expand.grid(n.trees = c(10,20),
shrinkage = c(0.01,0.05,0.5),
n.minobsinnode = c(3,5,10),
interaction.depth = c(1,5,10))
# let's use, say, 5-Fold cross-validation repeated 3 times
fitControl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 3)
# tune the model
model_gbm2 <- train(train[,predictors], train[,dependentVar], method='gbm',
# "RMSE" or "Rsquared" for regression;
# "Accuracy" or "Kappa" for classification
metric= "Accuracy",
trControl=fitControl,
tuneGrid=grid,
verbose=FALSE)
model_gbm2$results %>%
dplyr::arrange(desc(Accuracy)) %>%
head(6)
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 1 0.05 1 3 10 0.8047626 0.4653741
## 2 0.05 1 5 10 0.8047626 0.4653741
## 3 0.05 1 10 10 0.8047626 0.4653741
## 4 0.05 5 10 10 0.8047626 0.4653741
## 5 0.05 10 10 10 0.8047626 0.4653741
## 6 0.05 1 3 20 0.8047626 0.4653741
## AccuracySD KappaSD
## 1 0.0228608 0.07111885
## 2 0.0228608 0.07111885
## 3 0.0228608 0.07111885
## 4 0.0228608 0.07111885
## 5 0.0228608 0.07111885
## 6 0.0228608 0.07111885
plot(model_gbm2)
model_gbm2$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 19 10 1 0.05 3
predictions <- predict.train(object=model_gbm2, test[,predictors], type="raw")
confusionMatrix(predictions, test[,dependentVar])$table
## Reference
## Prediction 0 1
## 0 22 1
## 1 26 104
roc(as.integer(test[,dependentVar]), as.integer(predictions))$auc
## Area under the curve: 0.7244
To compare different models:
resamps <- resamples(list(GBM = model_gbm,
RF = model_rf,
NN = model_nnet,
SVM = model_svm))
resamps
##
## Call:
## resamples.default(x = list(GBM = model_gbm, RF = model_rf, NN
## = model_nnet, SVM = model_svm))
##
## Models: GBM, RF, NN, SVM
## Number of resamples: 25
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: GBM, RF, NN, SVM
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GBM 0.7588 0.7840 0.7955 0.7969 0.8098 0.8383 0
## RF 0.7396 0.7818 0.7919 0.7935 0.8110 0.8409 0
## NN 0.7471 0.7844 0.8023 0.8001 0.8171 0.8494 0
## SVM 0.7011 0.7378 0.7722 0.7637 0.7892 0.8160 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GBM 0.3509 0.4216 0.4476 0.4508 0.4757 0.5873 0
## RF 0.3124 0.4404 0.4880 0.4721 0.5148 0.5739 0
## NN 0.2706 0.4531 0.4769 0.4634 0.4962 0.5461 0
## SVM 0.2437 0.2970 0.3764 0.3536 0.3995 0.4550 0
bwplot(resamps, layout = c(2, 1))
dotplot(resamps, metric = "Kappa")
xyplot(resamps, what = "BlandAltman")
splom(resamps)
A company package caretEmsemble
deals with ensembles of models, and a vignette can be read here. Check also How to Build an Ensemble Of Machine Learning Algorithms in R.
library(caretEnsemble)
control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
models.names <- c('gbm', 'rf', 'nnet', 'svmRadial')
# necessary to prevent invalid R variables names in caretList
for (f in names(my.data))
if (class(my.data[[f]])=="factor") {
levels <- unique(c(my.data[[f]]))
my.data[[f]] <- factor(my.data[[f]], labels=make.names(levels))
}
models <- caretList(Loan_Status ~ .,
data=my.data[,c(predictors[-2],dependentVar)],
trControl=control,
methodList=models.names,
verbose=FALSE)
Let’s see some comparisons:
results <- resamples(models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: gbm, rf, nnet, svmRadial
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## gbm 0.7377 0.7774 0.8033 0.8067 0.8387 0.8689 0
## rf 0.7541 0.7714 0.7903 0.8002 0.8226 0.8548 0
## nnet 0.7541 0.7774 0.8033 0.8051 0.8361 0.8689 0
## svmRadial 0.7258 0.7774 0.8033 0.8013 0.8197 0.8548 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## gbm 0.2945 0.4009 0.4641 0.4773 0.5663 0.6544 0
## rf 0.2945 0.3959 0.4532 0.4651 0.5342 0.6174 0
## nnet 0.2945 0.4115 0.4641 0.4723 0.5534 0.6544 0
## svmRadial 0.2670 0.3801 0.4670 0.4563 0.5169 0.6235 0
dotplot(results)
The models should have low correlation so that the ensemble works best:
modelCor(results)
## gbm rf nnet svmRadial
## gbm 1.0000000 0.8705049 0.9310803 0.7941055
## rf 0.8705049 1.0000000 0.8533515 0.7818630
## nnet 0.9310803 0.8533515 1.0000000 0.7887679
## svmRadial 0.7941055 0.7818630 0.7887679 1.0000000
We see that there is high correlation, so the ensemble results will have little benefits.
Function caretStack
find a linear combination of several models:
stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=control)
print(stack.glm)
## A glm ensemble of 2 base models: gbm, rf, nnet, svmRadial
##
## Ensemble results:
## Generalized Linear Model
##
## 1842 samples
## 4 predictor
## 2 classes: 'X2', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1657, 1658, 1659, 1658, 1659, 1658, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8036486 0.4705906
##
##
We see that the results were, unsurprisingly, not that better…
Let’s try a random forest to ensemble the models:
stack.rf <- caretStack(models, method="rf", metric="Accuracy", trControl=control)
print(stack.rf)
## A rf ensemble of 2 base models: gbm, rf, nnet, svmRadial
##
## Ensemble results:
## Random Forest
##
## 1842 samples
## 4 predictor
## 2 classes: 'X2', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1658, 1657, 1659, 1658, 1657, 1658, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8002355 0.4873520
## 3 0.7991484 0.4855304
## 4 0.7962508 0.4799460
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
To predict we need also to rename the dependent variable:
predictions <- ifelse(predict(object=stack.glm, test[,predictors]) ==
levels(my.data$Loan_Status)[[1]], "0" , "1")
confusionMatrix(predictions, test[,dependentVar])$table
## Reference
## Prediction 0 1
## 0 26 104
## 1 22 1
roc(as.integer(test[,dependentVar]), as.integer(predictions))$auc
## Area under the curve: 0.2756