Ensemble Learning

Ref:

The caret package (short for Classification And REgression Training) is a set of functions that attempt to streamline the process for creating predictive models. The package contains tools for: data splitting pre-processing feature selection model tuning using resampling variable importance estimation as well as other functionality. ref

# to install package with all dependencies:
# install.packages("caret", dependencies = c("Depends", "Suggests"))
library(caret)

The next functions help prepare the dataset for learning.

# calls caret::preProcess
caret_preprocess <- function(df, ...) {
  predict(preProcess(df, ...), df)
}

# pre: to perform classification
# pre: assumes dependent variable is the last column
# convert factors with a one hot encoding (make dummy vars with 0/1s)
#  “fullrank=T” create only (n-1) columns for a factor with n different levels
caret_oneHotEncoding <- function(df) {
  dmy <- dummyVars(" ~ .", data=df, fullRank=TRUE)
  res <- data.frame(predict(dmy, newdata = df))
  res[,ncol(res)] <- as.factor(res[,ncol(res)]) # make dependent variable a factor
  names(res)[ncol(res)] <- names(df)[ncol(df)]  # and keep its original name
  res
}

# pre: assumes dependent variable is the last column
caret_featureSelection <- function(train, test, n.pred) {
  if (missing(n.pred)) n.pred=ncol(train)-1  # use all as predictors
  control <- rfeControl(functions = rfFuncs,
                        method = "repeatedcv",
                        repeats = 3,
                        verbose = FALSE)
  
  dependent.col <- ncol(train)
  predictors    <- names(train)[-dependent.col]
  rfe.profile   <- rfe(train[,predictors], train[,dependent.col], rfeControl = control)
  rfe.profile$variables$var[1:n.pred]  # return the best predictors
}

Now we are ready to read and preprocess the dataset:

library(dplyr)

raw.data <- read.csv("train_u6lujuX_CVtuZ9i.csv", stringsAsFactors = T)
glimpse(raw.data)

## Observations: 614
## Variables: 13
## $ Loan_ID           <fctr> LP001002, LP001003, LP001005, LP001006, LP0...
## $ Gender            <fctr> Male, Male, Male, Male, Male, Male, Male, M...
## $ Married           <fctr> No, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, ...
## $ Dependents        <fctr> 0, 1, 0, 0, 0, 2, 0, 3+, 2, 1, 2, 2, 2, 0, ...
## $ Education         <fctr> Graduate, Graduate, Graduate, Not Graduate,...
## $ Self_Employed     <fctr> No, No, Yes, No, No, Yes, No, No, No, No, N...
## $ ApplicantIncome   <int> 5849, 4583, 3000, 2583, 6000, 5417, 2333, 30...
## $ CoapplicantIncome <dbl> 0, 1508, 0, 2358, 0, 4196, 1516, 2504, 1526,...
## $ LoanAmount        <int> NA, 128, 66, 120, 141, 267, 95, 158, 168, 34...
## $ Loan_Amount_Term  <int> 360, 360, 360, 360, 360, 360, 360, 360, 360,...
## $ Credit_History    <int> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,...
## $ Property_Area     <fctr> Urban, Rural, Urban, Urban, Urban, Urban, U...
## $ Loan_Status       <fctr> Y, N, Y, Y, Y, Y, Y, N, Y, N, Y, Y, Y, N, Y...

dependentVar <- names(raw.data)[ncol(raw.data)]  # must be the last column

# process data
raw.data %>% 
  dplyr::select(-Loan_ID) %>%                          # assumption: irrelevant data
  caret_preprocess(method = c("knnImpute", "center", "scale")) %>% 
  caret_oneHotEncoding -> my.data
glimpse(my.data)

## Observations: 614
## Variables: 19
## $ Gender.Female           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Gender.Male             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Married.No              <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Married.Yes             <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Dependents.0            <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Dependents.1            <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ Dependents.2            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,...
## $ Dependents.3.           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ Education.Not.Graduate  <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Self_Employed.No        <dbl> 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,...
## $ Self_Employed.Yes       <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ ApplicantIncome         <dbl> 0.072931360, -0.134302453, -0.39342656...
## $ CoapplicantIncome       <dbl> -0.55403561, -0.03870000, -0.55403561,...
## $ LoanAmount              <dbl> 0.01621546, -0.21512721, -0.93953353, ...
## $ Loan_Amount_Term        <dbl> 0.276411, 0.276411, 0.276411, 0.276411...
## $ Credit_History          <dbl> 0.4324768, 0.4324768, 0.4324768, 0.432...
## $ Property_Area.Semiurban <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,...
## $ Property_Area.Urban     <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,...
## $ Loan_Status             <fctr> 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1...

Next we partition the dataset into training and testing sets:

set.seed(123)
inTrain <- createDataPartition(my.data[,ncol(my.data)], p=0.75, list=FALSE)  # train gets 75%
train   <- my.data[ inTrain,]
test    <- my.data[-inTrain,]

We can optionally perform feature selection:

predictors <- caret_featureSelection(train, test, n.pred=5)
predictors

## [1] "Credit_History"          "Loan_Amount_Term"       
## [3] "ApplicantIncome"         "CoapplicantIncome"      
## [5] "Property_Area.Semiurban"

And can choose some of the many models available on caret to perform classification:

model_gbm  <- train(train[,predictors], train[,dependentVar], method='gbm' , verbose=F) # Boosted Regression
model_rf   <- train(train[,predictors], train[,dependentVar], method='rf'  , verbose=F) # Random Forest
model_nnet <- train(train[,predictors], train[,dependentVar], method='nnet', verbose=F) # Neural Nets
model_svm  <- train(train[,predictors], train[,dependentVar], method='svmRadial' , verbose=F) # Support Vector Machines

We can check variable importance:

par(mfrow=c(2,2))
plot(varImp(object=model_gbm),   main="GBM - Variable Importance")

plot(varImp(object=model_rf),    main="RF  - Variable Importance")

plot(varImp(object=model_nnet),  main="GBM - Variable Importance")

plot(varImp(object=model_svm),   main="SVM - Variable Importance")

par(mfrow=c(1,1))

and make predictions:

library(pROC)

predictions <- predict.train(object=model_gbm, test[,predictors], type="raw")
confusionMatrix(predictions, test[,dependentVar])$table

##           Reference
## Prediction   0   1
##          0  23   1
##          1  25 104

roc(as.integer(test[,dependentVar]), as.integer(predictions))$auc

## Area under the curve: 0.7348

Caret also allows for parameter tuning:

modelLookup(model='gbm') # which parameters can be tuned?

##   model         parameter                   label forReg forClass
## 1   gbm           n.trees   # Boosting Iterations   TRUE     TRUE
## 2   gbm interaction.depth          Max Tree Depth   TRUE     TRUE
## 3   gbm         shrinkage               Shrinkage   TRUE     TRUE
## 4   gbm    n.minobsinnode Min. Terminal Node Size   TRUE     TRUE
##   probModel
## 1      TRUE
## 2      TRUE
## 3      TRUE
## 4      TRUE

# making a grid of values
grid <- expand.grid(n.trees           = c(10,20),
                    shrinkage         = c(0.01,0.05,0.5),
                    n.minobsinnode    = c(3,5,10),
                    interaction.depth = c(1,5,10))

# let's use, say, 5-Fold cross-validation repeated 3 times
fitControl <- trainControl(method = "repeatedcv", 
                           number = 5, 
                           repeats = 3)

# tune the model
model_gbm2 <- train(train[,predictors], train[,dependentVar], method='gbm',
                   # "RMSE" or "Rsquared" for regression; 
                   # "Accuracy" or "Kappa" for classification
                   metric= "Accuracy", 
                   trControl=fitControl,
                   tuneGrid=grid,
                   verbose=FALSE)

model_gbm2$results %>% 
  dplyr::arrange(desc(Accuracy)) %>% 
  head(6)

##   shrinkage interaction.depth n.minobsinnode n.trees  Accuracy     Kappa
## 1      0.05                 1              3      10 0.8047626 0.4653741
## 2      0.05                 1              5      10 0.8047626 0.4653741
## 3      0.05                 1             10      10 0.8047626 0.4653741
## 4      0.05                 5             10      10 0.8047626 0.4653741
## 5      0.05                10             10      10 0.8047626 0.4653741
## 6      0.05                 1              3      20 0.8047626 0.4653741
##   AccuracySD    KappaSD
## 1  0.0228608 0.07111885
## 2  0.0228608 0.07111885
## 3  0.0228608 0.07111885
## 4  0.0228608 0.07111885
## 5  0.0228608 0.07111885
## 6  0.0228608 0.07111885

plot(model_gbm2)

model_gbm2$bestTune

##    n.trees interaction.depth shrinkage n.minobsinnode
## 19      10                 1      0.05              3

predictions <- predict.train(object=model_gbm2, test[,predictors], type="raw")
confusionMatrix(predictions, test[,dependentVar])$table

##           Reference
## Prediction   0   1
##          0  22   1
##          1  26 104

roc(as.integer(test[,dependentVar]), as.integer(predictions))$auc

## Area under the curve: 0.7244

To compare different models:

resamps <- resamples(list(GBM = model_gbm,
                          RF  = model_rf,
                          NN  = model_nnet,
                          SVM = model_svm))
resamps

## 
## Call:
## resamples.default(x = list(GBM = model_gbm, RF = model_rf, NN
##  = model_nnet, SVM = model_svm))
## 
## Models: GBM, RF, NN, SVM 
## Number of resamples: 25 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit

summary(resamps)

## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: GBM, RF, NN, SVM 
## Number of resamples: 25 
## 
## Accuracy 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## GBM 0.7588  0.7840 0.7955 0.7969  0.8098 0.8383    0
## RF  0.7396  0.7818 0.7919 0.7935  0.8110 0.8409    0
## NN  0.7471  0.7844 0.8023 0.8001  0.8171 0.8494    0
## SVM 0.7011  0.7378 0.7722 0.7637  0.7892 0.8160    0
## 
## Kappa 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## GBM 0.3509  0.4216 0.4476 0.4508  0.4757 0.5873    0
## RF  0.3124  0.4404 0.4880 0.4721  0.5148 0.5739    0
## NN  0.2706  0.4531 0.4769 0.4634  0.4962 0.5461    0
## SVM 0.2437  0.2970 0.3764 0.3536  0.3995 0.4550    0

bwplot(resamps, layout = c(2, 1))

dotplot(resamps, metric = "Kappa")

xyplot(resamps, what = "BlandAltman")

splom(resamps)

Ensemble Learning

A company package caretEmsemble deals with ensembles of models, and a vignette can be read here. Check also How to Build an Ensemble Of Machine Learning Algorithms in R.

library(caretEnsemble)

control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
models.names <- c('gbm', 'rf', 'nnet', 'svmRadial')

# necessary to prevent invalid R variables names in caretList
for (f in names(my.data)) 
  if (class(my.data[[f]])=="factor") {
    levels <- unique(c(my.data[[f]]))
    my.data[[f]] <- factor(my.data[[f]], labels=make.names(levels))
  }

models <- caretList(Loan_Status ~ ., 
                    data=my.data[,c(predictors[-2],dependentVar)],
                    trControl=control, 
                    methodList=models.names,
                    verbose=FALSE)

Let’s see some comparisons:

results <- resamples(models)
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: gbm, rf, nnet, svmRadial 
## Number of resamples: 30 
## 
## Accuracy 
##             Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## gbm       0.7377  0.7774 0.8033 0.8067  0.8387 0.8689    0
## rf        0.7541  0.7714 0.7903 0.8002  0.8226 0.8548    0
## nnet      0.7541  0.7774 0.8033 0.8051  0.8361 0.8689    0
## svmRadial 0.7258  0.7774 0.8033 0.8013  0.8197 0.8548    0
## 
## Kappa 
##             Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## gbm       0.2945  0.4009 0.4641 0.4773  0.5663 0.6544    0
## rf        0.2945  0.3959 0.4532 0.4651  0.5342 0.6174    0
## nnet      0.2945  0.4115 0.4641 0.4723  0.5534 0.6544    0
## svmRadial 0.2670  0.3801 0.4670 0.4563  0.5169 0.6235    0

dotplot(results)

The models should have low correlation so that the ensemble works best:

modelCor(results)

##                 gbm        rf      nnet svmRadial
## gbm       1.0000000 0.8705049 0.9310803 0.7941055
## rf        0.8705049 1.0000000 0.8533515 0.7818630
## nnet      0.9310803 0.8533515 1.0000000 0.7887679
## svmRadial 0.7941055 0.7818630 0.7887679 1.0000000

We see that there is high correlation, so the ensemble results will have little benefits.

Function caretStack find a linear combination of several models:

stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=control)
print(stack.glm)

## A glm ensemble of 2 base models: gbm, rf, nnet, svmRadial
## 
## Ensemble results:
## Generalized Linear Model 
## 
## 1842 samples
##    4 predictor
##    2 classes: 'X2', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1657, 1658, 1659, 1658, 1659, 1658, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8036486  0.4705906
## 
##

We see that the results were, unsurprisingly, not that better…

Let’s try a random forest to ensemble the models:

stack.rf <- caretStack(models, method="rf", metric="Accuracy", trControl=control)
print(stack.rf)

## A rf ensemble of 2 base models: gbm, rf, nnet, svmRadial
## 
## Ensemble results:
## Random Forest 
## 
## 1842 samples
##    4 predictor
##    2 classes: 'X2', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1658, 1657, 1659, 1658, 1657, 1658, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.8002355  0.4873520
##   3     0.7991484  0.4855304
##   4     0.7962508  0.4799460
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

To predict we need also to rename the dependent variable:

predictions <- ifelse(predict(object=stack.glm, test[,predictors]) ==
                          levels(my.data$Loan_Status)[[1]], "0" , "1")
confusionMatrix(predictions, test[,dependentVar])$table

##           Reference
## Prediction   0   1
##          0  26 104
##          1  22   1

roc(as.integer(test[,dependentVar]), as.integer(predictions))$auc

## Area under the curve: 0.2756

Caret

João Neto

December 2016

Ensemble Learning