Classifiers: svm(), randomForest(), knn(), glm()
Predict the class of new data using a classifier: predict()
Data wrangling: rep(), sample()
A set of melanoma patients had their cancer samples assayed by mRNA microarrays, which measure the abundance of thousands of different RNA molecules at the same time. Each patient has clinical data available specifying the time since the sample was taken and whether they are still alive.
Two classes are formed:
• The patients who died from melanoma within one year are considered to have Poor outcome.
• The patients who lived for more than four years without relapse had Good outcome.
As in last week's lab, the data is hosted on a popular omics repository called the Gene Expression Omnibus (GEO). To import the data for the Australian melanoma patients, GEOquery's getGEO function will be used to obtain the data set GSE54467.
library(limma)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.1.3
library(tidyverse)
## Warning: package 'readr' was built under R version 4.1.3
library(GEOquery)
Sys.setenv(VROOM_CONNECTION_SIZE = 131072 * 2)
gset <- getGEO("GSE54467", GSEMatrix = TRUE, getGPL = FALSE, destdir = ".")[[1]]
gset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 26085 features, 79 samples
##   element names: exprs
## protocolData: none
## phenoData
##   sampleNames: GSM1315904 GSM1315905 ... GSM1315982 (79 total)
##   varLabels: title geo_accession ... survival from stage iii tumour
##     banked (months):ch1 (47 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 24975271
## Annotation: GPL6884
## Get phenotype data
pheno <- pData(gset)
colnames(pheno) <- gsub(" ", "_", colnames(pheno))
colnames(pheno) <- sub(":ch1", "", colnames(pheno))

## Get gene expression data
expVal <- exprs(gset)
boxplot(expVal[, 1:5])
## Get gene annotations
geneAnno <- fData(gset)
# This is empty: getGPL = TRUE caused a download error, so getGPL = FALSE
# was used instead.
# An alternative way to get some annotation:
# BiocManager::install('illuminaHumanv3.db')
library(illuminaHumanv3.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## Loading required package: org.Hs.eg.db
## (Further startup messages trimmed: S4Vectors, IRanges and AnnotationDbi
## mask several tidyverse functions, notably dplyr::select, dplyr::rename,
## dplyr::slice, tidyr::expand and purrr::reduce.)
geneAnno <- data.frame(rownames(expVal))
colnames(geneAnno) <- gsub("rownames.expVal.", "PROBEID", colnames(geneAnno))
ENTREZID <- data.frame(mapIds(illuminaHumanv3.db, keys = as.character(geneAnno$PROBEID),
    column = c("ENTREZID"), keytype = "PROBEID", multiVals = "first"))
## 'select()' returned 1:many mapping between keys and columns
colnames(ENTREZID) <- c("ENTREZID")
geneSymb <- data.frame(mapIds(illuminaHumanv3.db, keys = as.character(geneAnno$PROBEID),
    column = c("SYMBOL"), keytype = "PROBEID", multiVals = "first"))
## 'select()' returned 1:many mapping between keys and columns
colnames(geneSymb) <- c("SYMBOL")
geneAnno2 <- cbind(geneAnno, ENTREZID, geneSymb)
fData(gset) <- geneAnno2
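A quick sanity check (an illustrative aside, not part of the original code) is to look at the first few probes alongside their mapped identifiers:

# Inspect the first few probe-to-gene mappings now attached to gset.
head(fData(gset))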
Points of reflection:
The months since obtaining the sample look like numeric data but are stored as character strings. Conversion to a numeric vector is essential for correct analysis.
pheno$status <- pheno$patient_last_status
pheno$survival <- as.numeric(pheno$`survival_from_stage_iii_tumour_banked_(months)`)/12
Note: R will (without warning) evaluate inequalities between a character string and a number by coercing the number to a character string and using lexicographic ordering.
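For example (a small illustration of this pitfall, not from the lab code):

# Both numbers are coerced to character strings before comparing.
"10" < 9   # TRUE, because "10" sorts before "9" lexicographically
"100" < 2  # TRUE, for the same reason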
pheno$survClass <- NA
pheno$survClass[pheno$survival >= 4 & pheno$status == "Alive NSR"] <- "Good"
pheno$survClass[pheno$survival <= 1 & pheno$status == "Dead Melanoma"] <- "Poor"
# Patients meeting neither criterion remain NA.
table(pheno$survClass)
##
## Good Poor
##   25   22
Points of reflection:
Including features with little discriminative power degrades the predictive performance of many classifiers. We will perform a differential expression analysis to choose the 10 features with the largest evidence of a difference in means.
## Make a design matrix
design = model.matrix(~survClass, pheno)
dim(design)
## [1] 47 2
dim(pheno)
## [1] 79 49
## Fit linear models
fit = lmFit(gset[, rownames(design)], design, method = "robust")
# Why did I use a robust fit?
efit = eBayes(fit, trend = TRUE)

## Top genes
dt_fdr <- decideTests(efit)
summary(dt_fdr)
##        (Intercept) survClassPoor
## Down             0           111
## NotSig           0         25950
## Up           26085            24
top10 <- topTable(efit, n = 10)
Determine the row names of the 10 largest (by absolute value) t-statistics and subset the table of gene expression measurements by them.
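One explicit way to see this selection (a sketch; efit$t holds the moderated t-statistics with one column per coefficient, although topTable's default ordering may differ slightly):

# Row names of the 10 probes with the largest absolute moderated t-statistic.
topProbes <- rownames(efit)[order(abs(efit$t[, "survClassPoor"]), decreasing = TRUE)[1:10]]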
# Choose top ten genes and transpose.
X <- t(expVal[rownames(top10), rownames(design)])
# Select outcome variable.
Y <- as.factor(pheno[rownames(design), "survClass"])
# Create new dataset.
data <- data.frame(Y, X)
Important: In general, feature selection should be done within cross-validation for correctness; it is part of "the classification rule".
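A minimal sketch of what "within cross-validation" means, assuming the expVal matrix above and fold labels like those defined later in this lab; selectTop10() is a hypothetical helper, and a plain t-test stands in for the limma analysis:

# Hypothetical helper: pick the 10 probes with the largest absolute
# t-statistic, computed on the training samples only.
selectTop10 <- function(exprTrain, yTrain) {
    tstats <- apply(exprTrain, 1, function(g) t.test(g ~ yTrain)$statistic)
    names(sort(abs(tstats), decreasing = TRUE))[1:10]
}
# Inside each fold: select genes using only the training fold, then subset
# both the training and test expression matrices to those genes before
# fitting the classifier.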
Important: When implementing these classification approaches yourself, do not blindly copy and paste; think about the variable names and whether they are appropriate for your dataset.
A random forest is a set of decision trees, each of which guesses the class of each sample. The class receiving the most guesses (votes) for a particular sample is predicted as that sample's class. The decision boundary is often non-linear.
Random forests in R are provided by the package randomForest. Random forests have a large number of options; here, the default values are used. It requires the genes to be the columns of the matrix and the samples to be the rows. randomForest can take the training data and test data as inputs to one function, and a formula can be used to define the relationship between the class and the variables used to model it. Each programmer has their own conventions, and classification code written for one classifier won't work for another.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
fit <- randomForest(Y ~ ., data)
predRF <- predict(fit, data)  # Here we have predicted the classes of our original data.
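If you want to move beyond the defaults, two commonly tuned arguments are ntree and mtry (real randomForest arguments; the values here are arbitrary examples):

# Example: grow more trees, and try 3 variables at each split.
fitTuned <- randomForest(Y ~ ., data, ntree = 1000, mtry = 3)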
An SVM classifier may use either a linear or a non-linear boundary, depending on the user's choice of kernel. Understanding how it works requires an understanding of convex optimisation, so that is not covered in this course.
The svm function in R is provided by the package e1071. The default kernel is the radial basis function kernel, which calculates a non-linear boundary. Like many classical statistical classifiers, it requires the genes to be the columns of the matrix and the samples to be the rows. The prediction function returns a factor of the predicted classes.
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
fit <- svm(Y ~ ., data)
predSVM <- predict(fit, data)  # Here we have predicted the classes of our original data.
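A linear boundary could instead be requested through the kernel argument (kernel = "linear" is a standard e1071 option; everything else mirrors the code above):

# Same model, but with a linear kernel instead of the radial basis function.
fitLinear <- svm(Y ~ ., data, kernel = "linear")
predSVMlinear <- predict(fitLinear, data)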
Logistic regression is a classical statistical technique for modelling a binary response. We used logistic regression in week 6 to model transplant rejection in heart transplant patients.
The glm function can be used with family = binomial to fit a logistic regression model. Unlike svm and randomForest, predict doesn't output a factor, but instead outputs a predicted probability for each observation. Here the probabilities have been thresholded at 0.5 and turned back into a factor.
fit <- glm(Y ~ ., data, family = binomial)
predLR <- predict(fit, data, type = "response")  # Here we have predicted the classes of our original data.
predLR <- factor(predLR > 0.5, levels = c(FALSE, TRUE), labels = c("Good", "Poor"))
K-nearest neighbour classification is a very intuitive classification approach. A new data point can be classified by simply assessing which observations it is closest to in a training set, using a defined distance metric.
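To make the idea concrete, here is a minimal 1-nearest-neighbour sketch in base R (illustrative only; nn1 is a hypothetical helper, not a function from the class package):

# Classify one new point: find the closest training row by Euclidean
# distance and copy its label.
nn1 <- function(train, cl, newpoint) {
    d <- sqrt(rowSums(sweep(as.matrix(train), 2, as.numeric(newpoint))^2))
    cl[which.min(d)]
}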
The class package has a function knn which implements k-nearest neighbours with a Euclidean distance metric. The syntax of this function is quite different from the others: it requires a matrix of training data, the corresponding class labels for that data, and then a test dataset. Note that it fits the model and predicts in a single line of code.
library(class)
## Warning: package 'class' was built under R version 4.1.3
train <- dplyr::select(data, -Y)
test <- dplyr::select(data, -Y)
cl <- Y
# Here we have predicted the classes of our original data.
predKNN1 <- knn(train, test, cl, k = 1)
predKNN10 <- knn(train, test, cl, k = 10)
In the following we will calculate error rates for:
• Resubstitution error
• Test-set error
• 2-fold cross-validation error
Note that very flexible classifiers (such as a random forest or 1-nearest neighbour) can fit the training data perfectly, so the resubstitution error can be zero and is an overly optimistic estimate of performance.
fit <- svm(Y ~ ., data)
predSVM <- predict(fit, data)  # Here we have predicted the classes of our original data.
mean(predSVM != data$Y)
## [1] 0.106383
mean(predRF != data$Y)
## [1] 0
mean(predLR != data$Y)
## [1] 0.212766
mean(predKNN1 != data$Y)
## [1] 0
mean(predKNN10 != data$Y)
## [1] 0.1489362
set.seed(51773)
n <- nrow(data)  # The number of observations

# Create a vector with the same amount of train and test labels.
# There are better ways to do this. This is my way.
trainTest <- rep(c("train", "test"), 100)
trainTest <- trainTest[1:n]

# Reorder these to avoid systematic ordering bias.
trainTestSample <- sample(trainTest, n, replace = FALSE)

# Split data into a training set and a test set.
dataTrain <- data[trainTestSample == "train", ]
dataTest <- data[trainTestSample == "test", ]
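One of those "better ways" might look like this (an aside, not the lab's method): sample row indices directly instead of shuffling labels.

# Sample half of the row indices for training; the rest form the test set.
trainIdx <- sample(n, size = floor(n/2))
dataTrain <- data[trainIdx, ]
dataTest <- data[-trainIdx, ]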
fit <- svm(Y ~ ., dataTrain)
predSVM <- predict(fit, dataTest)
mean(predSVM != dataTest$Y)
## [1] 0.173913
fit <- randomForest(Y ~ ., dataTrain)
predRF <- predict(fit, dataTest)
mean(predRF != dataTest$Y)
## [1] 0.2608696
fit <- glm(Y ~ ., dataTrain, family = binomial)
predLR <- predict(fit, dataTest, type = "response")
predLR <- factor(predLR > 0.5, levels = c(FALSE, TRUE), labels = c("Good", "Poor"))
mean(predLR != dataTest$Y)
## [1] 0.3913043
train <- dplyr::select(dataTrain, -Y)
test <- dplyr::select(dataTest, -Y)
cl <- dataTrain$Y
predKNN1 <- knn(train, test, cl, k = 1)
mean(predKNN1 != dataTest$Y)
## [1] 0.3043478
predKNN10 <- knn(train, test, cl, k = 10)
mean(predKNN10 != dataTest$Y)
## [1] 0.3913043
set.seed(51773)
n <- nrow(data)  # The number of observations

# Create a vector with the same amount of fold labels.
fold <- rep(c("1", "2"), 100)
fold <- fold[1:n]

# Reorder these to avoid systematic ordering bias.
foldSample <- sample(fold, n, replace = FALSE)
# If your response isn't a factor you might just want rep(NA, n).
predSVM <- factor(rep(NA, n), levels = c("Good", "Poor"))
dataTest <- data[foldSample == 1, ]
dataTrain <- data[foldSample != 1, ]
fit <- svm(Y ~ ., dataTrain)
predSVM[foldSample == 1] <- predict(fit, dataTest)
dataTest <- data[foldSample == 2, ]
dataTrain <- data[foldSample != 2, ]
fit <- svm(Y ~ ., dataTrain)
predSVM[foldSample == 2] <- predict(fit, dataTest)
mean(predSVM != data$Y)
## [1] 0.1914894
set.seed(51773)
n <- nrow(data)  # The number of observations

# Create a vector with the same amount of fold labels.
fold <- rep(c("1", "2"), 100)
fold <- fold[1:n]

# Reorder these to avoid systematic ordering bias.
foldSample <- sample(fold, n, replace = FALSE)
# If your response isn't a factor you might just want rep(NA, n).
predRF <- factor(rep(NA, n), levels = c("Good", "Poor"))
dataTest <- data[foldSample == 1, ]
dataTrain <- data[foldSample != 1, ]
fit <- randomForest(Y ~ ., dataTrain)
predRF[foldSample == 1] <- predict(fit, dataTest)
dataTest <- data[foldSample == 2, ]
dataTrain <- data[foldSample != 2, ]
fit <- randomForest(Y ~ ., dataTrain)
predRF[foldSample == 2] <- predict(fit, dataTest)
mean(predRF != data$Y)
## [1] 0.2340426
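The two near-identical fold blocks above generalise naturally to a loop (a sketch using the same objects; adding more folds only requires changing the labels in fold):

# Loop over the folds instead of copying the block once per fold.
predRF <- factor(rep(NA, n), levels = c("Good", "Poor"))
for (f in unique(foldSample)) {
    dataTest <- data[foldSample == f, ]
    dataTrain <- data[foldSample != f, ]
    fit <- randomForest(Y ~ ., dataTrain)
    predRF[foldSample == f] <- predict(fit, dataTest)
}
mean(predRF != data$Y)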