## Repeated 5-fold cross validation with feature selection inside for-loop
set.seed(51773)
n <- nrow(heart) # The number of observations
nFolds <- 5
nRep <- 100
# Fold labels 1..nFolds, recycled to cover all n observations.
# length.out replaces the original hard-coded rep(1:nFolds, 100), which
# silently produced too few labels whenever n > nFolds * 100.
fold <- rep(seq_len(nFolds), length.out = n)
# Rows = repetitions, columns = observations; filled with 0/1 test errors
errorRepCV <- matrix(NA, nRep, n)
for (j in seq_len(nRep)) {
  # Reorder fold labels to avoid systematic ordering bias
  foldSample <- sample(fold, n, replace = FALSE)
  # Out-of-fold class predictions for this repetition
  predCVLoop <- factor(rep(NA, n), levels = c("no", "yes"))
  for (i in seq_len(nFolds)) {
    # Split into training and test set
    dataTest <- heart[foldSample == i, ]
    dataTrain <- heart[foldSample != i, ]
    # Forward stepwise selection (AIC) from the null model up to the full
    # model, fitted on the training folds only
    fit0 <- glm(outcome ~ 1, dataTrain, family = binomial)
    fitFull <- glm(outcome ~ surgery + age + wait.time + mismatch + hla.a2 + mscore,
                   dataTrain, family = binomial)
    # step() operates on the fitted model directly; the original wrapped
    # fit0 in another glm() call, refitting it redundantly
    fit <- step(fit0,
                scope = list(lower = fit0, upper = fitFull),
                direction = "forward", trace = 0)
    # Predict on dataTest. type = "response" yields probabilities; the
    # original omitted it, so predict() returned the linear predictor
    # (log-odds) and the 0.5 cutoff actually thresholded at a probability
    # of ~0.62 rather than 0.5.
    pred <- predict(fit, dataTest, type = "response")
    pred <- factor(pred > 0.5,
                   levels = c(FALSE, TRUE),
                   labels = c("no", "yes"))
    # Save test predictions in the positions of this fold
    predCVLoop[foldSample == i] <- pred
  }
  # Misclassification indicator per observation for repetition j
  errorRepCV[j, ] <- predCVLoop != heart$outcome
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Per-repetition CV error rates (one value per repetition)
fullCV <- rowMeans(errorRepCV)
# Overall error rate across all repetitions and observations
mean(errorRepCV)
## [1] 0.39
# Distribution of the per-repetition error rates, with a 0.5 reference line
boxplot(fullCV)
abline(h = 0.5, lty = "dotted")