University of Sydney

Recap

What if we have two variables?

Qualitative vs Qualitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & Y \\ 2 & A & Y \\ 3 & A & X \\ 4 & B & Y \\ 5 & A & Z \\ 6 & B & X \\ 7 & B & Z \\ 8 & B & Z \\ 9 & A & X \\ 10 & A & X \\ \hline \end{array} \]

What if we have two variables?

Qualitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & 11.7 \\ 2 & A & 10.5 \\ 3 & A & 8.7 \\ 4 & B & 9.3 \\ 5 & A & 9.6 \\ 6 & B & 11.2 \\ 7 & B & 10.4 \\ 8 & B & 10.4 \\ 9 & A & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]

What if we have two variables?

Qualitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & 11.7 \\ 2 & A & 10.5 \\ 3 & A & 8.7 \\ 4 & A & 9.3 \\ 5 & A & 9.6 \\ 6 & A & 11.2 \\ 7 & A & 10.4 \\ 8 & A & 10.4 \\ 9 & A & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]

What if we have two variables?

Qualitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & C & 11.7 \\ 2 & C & 10.5 \\ 3 & C & 8.7 \\ 4 & B & 9.3 \\ 5 & C & 9.6 \\ 6 & B & 11.2 \\ 7 & B & 10.4 \\ 8 & B & 10.4 \\ 9 & C & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]

What if we have two variables?

Quantitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & 9.4 & 11.2 \\ 2 & 9.8 & 10.4 \\ 3 & 11.6 & 10.4 \\ 4 & 10.1 & 10.1 \\ 5 & 10.1 & 9.4 \\ 6 & 11.7 & 11.8 \\ 7 & 10.5 & 10.5 \\ 8 & 8.7 & 8.0 \\ 9 & 9.3 & 10.7 \\ 10 & 9.6 & 9.5 \\ \hline \end{array} \]

Multiple linear regression

Grades


  • We recently conducted a class survey.

  • Is there a relationship between the amount of time spent studying and grades?

Data wrangling

suppressPackageStartupMessages(library(tidyverse))
# Read the class survey responses directly from the course web server.
survey = read.csv("https://wimr-genomics.vip.sydney.edu.au/AMED3002/data/ClassSurveyv5.csv")


## Take the questions that overlap between Cohorts.
# Later columns were only asked of some cohorts, so keep the first 18 only.
survey = survey[, 1:18]


## rename column names
# Inspect the raw (question-text) column names before renaming them.
colnames(survey)
##  [1] "ï..Timestamp"                                                       
##  [2] "Survey.ID..make.up.a.name."                                         
##  [3] "Gender"                                                             
##  [4] "Postcode.of.where.you.live"                                         
##  [5] "What.introductory.statistic.course.did.you.take."                   
##  [6] "How.many.university.clubs.are.you.a.member.of."                     
##  [7] "How.many.hours.per.week.did.you.spend.on.studying.last.year."       
##  [8] "What.was.your.study.load.last.year."                                
##  [9] "What.is.your.favourite.social.media.platform.."                     
## [10] "How.many.siblings.do.you.have.."                                    
## [11] "How.many.friends.do.you.have.on.Facebook.."                         
## [12] "What.is.your.average.grade.from.last.year..please.enter.a.number..."
## [13] "Did.you.have.a.pet.growing.up."                                     
## [14] "Do.you.still.live.at.home."                                         
## [15] "How.many.hours.a.week.do.you.spend.exercising."                     
## [16] "What.is.your.eye.colour."                                           
## [17] "How.many.hours.do.you.work.per.week."                               
## [18] "What.is.your.favourite.season."
# Replace the long question-text names with short, code-friendly names
# (newname[i] replaces oldname[i], so order must match the columns above).
oldname = colnames(survey)
newname = c("Time", "ID", "Gender", "Postcode", "StatsCourse", "Clubs", "StudyTime", 
    "StudyLoad", "SocialMedia", "Siblings", "FBFriends", "Grade", "Pet", "Home", 
    "ExerciseTime", "Eyecolor", "Working", "Season")
colnames(survey) = newname


# Clean up the results a little bit.
# Grades were entered as free text: recode ranges to their midpoint and
# letter grades to a representative mark (pass = 50, Credit = 65,
# Distinction = 75, High Distinction = 85) before coercing to numeric.
survey$Grade = as.character(survey$Grade)
survey$Grade[survey$Grade == "n/a"] = NA
survey$Grade[survey$Grade == "50-60"] = 55
survey$Grade[survey$Grade == "pass"] = 50
survey$Grade[survey$Grade == "D"] = 75
survey$Grade[survey$Grade == "c"] = 65
survey$Grade[survey$Grade == "C "] = 65
survey$Grade[survey$Grade == "HD"] = 85
survey$Grade[survey$Grade == "WAM of seventy-two"] = 72
# Any remaining non-numeric strings become NA here (hence the warning).
survey$Grade = as.numeric(survey$Grade)
## Warning: NAs introduced by coercion
# StudyTime is free text: recode ranges to midpoints and worded answers to
# numbers, mark unusable answers as missing, strip decoration, then coerce
# to numeric.  (A duplicated "8hr" recode line was removed.)
survey$StudyTime = as.character(survey$StudyTime)
survey$StudyTime[survey$StudyTime == "5-6"] = 5.5
survey$StudyTime[survey$StudyTime == "10-20"] = 15
survey$StudyTime[survey$StudyTime == "5-10"] = 7.5
survey$StudyTime[survey$StudyTime == "8hr"] = 8
survey$StudyTime[survey$StudyTime == "28 hours"] = 28
survey$StudyTime[survey$StudyTime == "didn't start uni maybe 6h"] = 6
survey$StudyTime[survey$StudyTime == "20-"] = 22
survey$StudyTime[survey$StudyTime == "20-25?"] = 22.5
survey$StudyTime[survey$StudyTime == "15-25"] = 20
survey$StudyTime[survey$StudyTime == "10 (excluding attending university) "] = 10
survey$StudyTime[survey$StudyTime == "10?"] = 10
survey$StudyTime[survey$StudyTime == "80?"] = 80
survey$StudyTime[survey$StudyTime == "10-12 extracrricular"] = 11
survey$StudyTime[survey$StudyTime == "25 hours a week"] = 25
survey$StudyTime[survey$StudyTime == "20-24"] = 22
survey$StudyTime[survey$StudyTime == "10 hours for each unit"] = 40
survey$StudyTime[survey$StudyTime == "30-40"] = 35

# Answers that carry no usable number are treated as missing.
survey$StudyTime[survey$StudyTime == "not sure"] = NA
survey$StudyTime[survey$StudyTime == "not sure "] = NA
survey$StudyTime[survey$StudyTime == "unsure"] = NA
survey$StudyTime[survey$StudyTime == "Unsure"] = NA
survey$StudyTime[survey$StudyTime == "Too many"] = NA
survey$StudyTime[survey$StudyTime == "too many"] = NA
survey$StudyTime[survey$StudyTime == "I have no idea "] = NA
survey$StudyTime[survey$StudyTime == "n/a"] = NA
survey$StudyTime[survey$StudyTime == "lost"] = NA
survey$StudyTime[survey$StudyTime == "lots"] = NA

# Strip decoration ("+", spaces, "~", "?") left on otherwise numeric answers.
survey$StudyTime = gsub("\\+", "", survey$StudyTime)
survey$StudyTime = gsub(" ", "", survey$StudyTime)
survey$StudyTime = gsub("~", "", survey$StudyTime)
survey$StudyTime = gsub("\\?", "", survey$StudyTime)

# Anything still non-numeric becomes NA here (hence the warning).
survey$StudyTime = as.numeric(survey$StudyTime)
## Warning: NAs introduced by coercion
# FBFriends: strip decoration, recode worded answers, then mean-impute the
# remaining missing values before writing the column back.
FB = survey$FBFriends
FB = gsub("\\+", "", FB)
FB = gsub("~", "", FB)
FB = gsub(">", "", FB)
FB = gsub(" ish", "", FB)
FB[FB == "over 1.2k"] = 1201
FB[FB == "Don't have facebook"] = 0
FB[FB == "Don't have FB"] = 0
FB[FB == "none (not in facebook)"] = 0
FB[FB == "Cant remember"] = NA
FB[FB == "not sure"] = NA
FB[FB == "I don't know"] = NA
FB[FB == "do not know"] = NA
FB[FB == "Too many"] = NA
FB = as.numeric(FB)
## Warning: NAs introduced by coercion
# Replace missing counts with the mean of the observed counts.
FB[is.na(FB)] = mean(FB, na.rm = TRUE)
survey[, "FBFriends"] = FB


# ExerciseTime: recode ranges to midpoints and worded answers to numbers;
# an empty response is taken to mean no exercise (0 hours).
# (A duplicated "2 hours" recode line was removed.)
ET = as.character(survey$ExerciseTime)
ET[ET == ""] = 0
ET[ET == "5-6"] = 5.5
ET[ET == "2-3"] = 2.5
ET[ET == "4-5"] = 4.5
ET[ET == "6 hr"] = 6
ET[ET == "none"] = 0
ET[ET == "2 hours"] = 2
# Anything still non-numeric becomes NA here (hence the warning).
survey[, "ExerciseTime"] = as.numeric(ET)
## Warning: NAs introduced by coercion
# Working hours per week: recode text and ranges; empty/"none"/"NULL" mean
# no paid work (0).  168 hours (every hour of the week) is implausible and
# is treated as missing.
WK = as.character(survey[, "Working"])
WK[WK == ""] = 0
WK[WK == "18 hours"] = 18
WK[WK == "8-12"] = 10
WK[WK == "168"] = NA
WK[WK == "none"] = 0
WK[WK == "Approx 25"] = 25
WK[WK == "NULL"] = 0
WK[WK == "10-15"] = 12.5
survey[, "Working"] = as.numeric(WK)
## Warning: NAs introduced by coercion
# Pet and Home are Yes/No questions: blank responses become NA, then each
# column is converted back to a factor.
PT = as.character(survey$Pet)
PT[PT == ""] = NA
survey$Pet = as.factor(PT)


HM = as.character(survey$Home)
HM[HM == ""] = NA
survey$Home = as.factor(HM)


# Siblings: convert words to numbers; 165 siblings is implausible -> NA.
SB = as.character(survey$Siblings)
SB[SB == "none"] = 0
SB[SB == "NA"] = 0
SB[SB == "165"] = NA
SB[SB == "one"] = 1
SB[SB == "One"] = 1
survey$Siblings = as.numeric(SB)


# Clubs: map descriptive answers to counts before coercing to numeric.
CB = as.character(survey$Clubs)
# Any answer containing "none" (case-insensitive) counts as zero clubs.
CB[grep("none", CB, ignore.case = TRUE)] = 0
CB[CB == "10+"] = 10
CB[CB == "18 or so? executive of 9"] = 18
CB[CB == "Dog Society, Sydney Uni Car Club"] = 2
CB[CB == "3-4"] = 3.5
CB[CB == "N/A"] = 0
survey$Clubs = as.numeric(CB)
## Warning: NAs introduced by coercion
# Season: blank responses become NA, then convert to a factor.
SN = as.character(survey$Season)
SN[SN == ""] = NA
survey$Season = as.factor(SN)

# Derive the cohort from the survey timestamp.  Each logical comparison
# contributes 1 when the response date falls after the cut-off, so the sum
# (0-4) indexes the cohort.  The bare strings ("2018/01/01", ...) are
# coerced by the Date comparison using as.Date()'s default formats.
survey$Cohort = (as.Date(survey$Time, "%m/%d/%Y") > "2018/01/01") + (as.Date(survey$Time, 
    "%m/%d/%Y") > "2019/01/01") + (as.Date(survey$Time, "%m/%d/%Y") > "2020/01/01") + 
    (as.Date(survey$Time, "%m/%d/%Y") > "2021/01/01")
# Label the 0-4 counts with the corresponding course offering.
survey$Cohort = factor(survey$Cohort, levels = c(0, 1, 2, 3, 4), labels = c("STAT2012", 
    "AMED3002.2018", "AMED3002.2019", "AMED3002.2020", "AMED3002.2021"))

Data exploration

# Confirm every column now has the intended type (num vs Factor).
str(survey)
## 'data.frame':    204 obs. of  19 variables:
##  $ Time        : Factor w/ 203 levels "3/11/2019 10:00",..: 124 125 126 127 128 129 129 130 131 132 ...
##  $ ID          : Factor w/ 199 levels "123454321","1907",..: 159 186 123 26 24 131 130 190 102 88 ...
##  $ Gender      : Factor w/ 4 levels "crocodilian",..: 3 2 2 3 3 2 3 3 2 3 ...
##  $ Postcode    : Factor w/ 112 levels "14052","2000",..: 21 68 50 85 66 78 25 54 21 2 ...
##  $ StatsCourse : Factor w/ 18 levels "","BUSS1020",..: 8 8 8 8 8 8 1 8 8 8 ...
##  $ Clubs       : num  0 0 2 3 2 1 2 2 1 2 ...
##  $ StudyTime   : num  50 10 7 70 30 NA 20 25 5 15 ...
##  $ StudyLoad   : Factor w/ 3 levels "","full-time",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SocialMedia : Factor w/ 22 levels "canvas","Discord",..: 3 14 3 3 3 17 3 3 17 17 ...
##  $ Siblings    : num  1 2 2 0 2 0 1 2 3 1 ...
##  $ FBFriends   : num  90 600 115 100 263 10 120 150 228 200 ...
##  $ Grade       : num  72 73 64 90 90.5 66 74 72 60 62 ...
##  $ Pet         : Factor w/ 2 levels "No","Yes": 2 1 2 NA 2 1 1 1 1 2 ...
##  $ Home        : Factor w/ 2 levels "No","Yes": 1 2 2 NA 2 2 1 2 1 2 ...
##  $ ExerciseTime: num  3 4 2 0 1 2 2 10 2 4 ...
##  $ Eyecolor    : Factor w/ 31 levels "","black","Black",..: 13 10 6 1 11 2 11 3 3 11 ...
##  $ Working     : num  0 15 0 0 10 0 0 15 0 5 ...
##  $ Season      : Factor w/ 4 levels "Autumn","Spring",..: 1 2 1 NA 4 1 2 3 3 2 ...
##  $ Cohort      : Factor w/ 5 levels "STAT2012","AMED3002.2018",..: 1 1 1 1 1 1 1 1 1 1 ...

Missing data

library(naniar)
# Visualise the pattern of missingness across all variables.
vis_miss(survey)

# Count missing values per column, from least to most.
sort(colSums(is.na(survey)))
##         Time       Gender     Postcode  StatsCourse    StudyLoad  SocialMedia 
##            0            0            0            0            0            0 
##    FBFriends       Cohort           ID        Clubs     Siblings     Eyecolor 
##            0            0            1            1            1            1 
##      Working        Grade          Pet       Season         Home ExerciseTime 
##            3            4            4            4            5            5 
##    StudyTime 
##           12
# Keep complete cases, then only the variables used in the models below.
# NOTE(review): na.omit() runs on all 19 columns BEFORE the selection, so a
# row missing only an unused column (e.g. ID) is also dropped -- confirm
# this is intended.
survey = na.omit(survey)[, c("Grade", "StudyTime", "Cohort", "FBFriends", "ExerciseTime", 
    "Working", "Pet", "Home", "Clubs", "Season")]

Clustering

# One-hot encode the factors so every variable is numeric (the "-1" drops
# the intercept column from the design matrix).
design <- as.data.frame(model.matrix(~. - 1, survey))
# Standardise each column, then hierarchically cluster the *variables*
# (hence the transpose) by Euclidean distance and plot the dendrogram.
hc <- hclust(dist(t(scale(design))))
plot(hc)

Take a look

# Scatterplot of grade against weekly study time.
ggplot(survey, aes(x = StudyTime, y = Grade)) + geom_point() + theme_classic()

Remove “outliers”

# Remove outlier
# Drop implausible responses (100+ study hours, grades of 100 or more).
survey2 = filter(survey, StudyTime < 100, Grade < 100)
ggplot(survey2, aes(x = StudyTime, y = Grade)) + geom_point(size = 3) + theme_classic()

Remove “outliers”

# Same scatterplot, coloured by cohort (theme_classic(20) = larger text).
ggplot(survey2, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) + 
    theme_classic(20)

Simple linear regression

# Simple linear regression of grade on weekly study time.
fit = lm(Grade ~ StudyTime, survey2)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime, data = survey2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.180  -4.156   0.806   7.820  23.384 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 64.23063    2.02474  31.723  < 2e-16 ***
## StudyTime    0.28212    0.08378   3.367 0.000939 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.81 on 170 degrees of freedom
## Multiple R-squared:  0.06253,    Adjusted R-squared:  0.05702 
## F-statistic: 11.34 on 1 and 170 DF,  p-value: 0.0009386
# assumptions: Linearity, Equality of variance, Independence of residuals,
# Normality of residuals.
# Standard lm diagnostic plots for checking the assumptions above.
plot(fit)

Assess robustness

# Remove outlier
# Check the fit's sensitivity to the very low grades (keep Grade > 40).
survey3 = filter(survey2, Grade > 40)

ggplot(survey3, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) + 
    theme_classic(20)

Assess robustness

# Refit after removing the low grades; the StudyTime slope roughly halves
# (0.28 -> 0.15), showing the original fit was driven partly by outliers.
fit = lm(Grade ~ StudyTime, survey3)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime, data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.8819  -5.1722  -0.3952   5.1721  20.3074 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  68.9086     1.2637  54.528  < 2e-16 ***
## StudyTime     0.1487     0.0517   2.875  0.00457 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.379 on 165 degrees of freedom
## Multiple R-squared:  0.04772,    Adjusted R-squared:  0.04195 
## F-statistic: 8.268 on 1 and 165 DF,  p-value: 0.004568
# Diagnostic plots for the refitted model.
plot(fit)

What trend?

# Overall least-squares trend with its confidence band.
ggplot(survey3, aes(x = StudyTime, y = Grade)) + geom_point(size = 3) + theme_classic(20) + 
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

What trend?

# A separate least-squares trend for each cohort.
ggplot(survey3, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) + 
    theme_classic(20) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Multiple linear regression


How well can we predict someone's grades given that we have multiple variables?

Simple linear regression: \(y = \alpha + \beta x\)


Multiple regression: \(y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 + \beta_5 x_5 + \beta_6 x_6 + \beta_7 x_7\)

Multiple linear regression

# Multiple regression: add exercise, work hours and pet ownership to
# study time.  Only StudyTime remains significant.
fit = lm(Grade ~ StudyTime + ExerciseTime + Working + Pet, survey3)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime + ExerciseTime + Working + Pet, 
##     data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.0771  -5.3196   0.0588   5.0408  19.9963 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  68.461937   1.708350  40.075  < 2e-16 ***
## StudyTime     0.149307   0.052729   2.832  0.00522 ** 
## ExerciseTime -0.037418   0.177410  -0.211  0.83322    
## Working      -0.006863   0.068455  -0.100  0.92027    
## PetYes        1.253974   1.343409   0.933  0.35199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.433 on 162 degrees of freedom
## Multiple R-squared:  0.05297,    Adjusted R-squared:  0.02959 
## F-statistic: 2.265 on 4 and 162 DF,  p-value: 0.06441
# Diagnostic plots for the multiple regression.
plot(fit)

Multiple linear regression

# Regress grade on every other variable in survey3 ("." = all remaining
# columns); factors are expanded into dummy variables automatically.
fit = lm(Grade ~ ., survey3)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ ., data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.7162  -5.2790  -0.2158   5.0328  18.8700 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         65.735996   2.387897  27.529  < 2e-16 ***
## StudyTime            0.143045   0.053875   2.655  0.00877 ** 
## CohortAMED3002.2018 -0.700346   2.702710  -0.259  0.79589    
## CohortAMED3002.2019 -0.619427   2.189879  -0.283  0.77767    
## CohortAMED3002.2020  0.741144   1.802908   0.411  0.68159    
## CohortAMED3002.2021  3.159793   2.173712   1.454  0.14811    
## FBFriends            0.003967   0.001854   2.140  0.03399 *  
## ExerciseTime        -0.111892   0.185651  -0.603  0.54761    
## Working             -0.029265   0.073109  -0.400  0.68950    
## PetYes               0.932031   1.398933   0.666  0.50626    
## HomeYes              0.657698   1.430407   0.460  0.64632    
## Clubs                0.060157   0.388634   0.155  0.87719    
## SeasonSpring         0.017546   1.836321   0.010  0.99239    
## SeasonSummer         0.831259   1.965120   0.423  0.67289    
## SeasonWinter         2.210601   1.952846   1.132  0.25942    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.414 on 152 degrees of freedom
## Multiple R-squared:  0.1155, Adjusted R-squared:  0.03403 
## F-statistic: 1.418 on 14 and 152 DF,  p-value: 0.151
# Diagnostic plots for the full model.
plot(fit)

Variable selection


  • Sparsity

  • Parsimony


\(\mbox{Model 1:} \ \ y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 + \beta_5 x_5 + \beta_6 x_6 + \beta_7 x_7\)


\(\mbox{Model 2:} \ \ y = \alpha + \beta_2 x_2 + \beta_7 x_7\)

Variable selection


  • LASSO


  • Step-wise procedures
    • strategy to add/remove
    • measure to assess fit

Step-wise model selection

  • Define the range of models to test over.
# Full model
# Upper bound of the stepwise search: all candidate predictors.
fit1 <- lm(Grade ~ ., survey3)
# Null model
# Lower bound of the stepwise search: intercept only.
fit2 <- lm(Grade ~ 1, survey3)

Forwards step-wise regression

## Forward stepwise model selection using AIC criterion
# Start from the null model and add one term at a time while AIC improves,
# never going beyond the full model.  (The original refitted
# lm(Grade ~ 1, survey3) inline; fit2 is that exact model, so use it.)
fit = step(fit2, direction = "forward", scope = list(lower = fit2, 
    upper = fit1))
## Start:  AIC=718.15
## Grade ~ 1
## 
##                Df Sum of Sq   RSS    AIC
## + StudyTime     1    580.48 11584 711.98
## + FBFriends     1    358.57 11806 715.15
## <none>                      12165 718.15
## + Home          1     61.94 12103 719.30
## + Pet           1     50.14 12115 719.46
## + Clubs         1     23.10 12142 719.83
## + Working       1     10.12 12155 720.01
## + ExerciseTime  1      0.55 12164 720.14
## + Cohort        4    429.38 11736 720.15
## + Season        3    105.31 12060 722.70
## 
## Step:  AIC=711.98
## Grade ~ StudyTime
## 
##                Df Sum of Sq   RSS    AIC
## + FBFriends     1    388.87 11196 708.28
## <none>                      11584 711.98
## + Pet           1     59.43 11525 713.13
## + Home          1     49.92 11535 713.26
## + Clubs         1      8.24 11576 713.87
## + ExerciseTime  1      1.34 11583 713.97
## + Working       1      0.32 11584 713.98
## + Cohort        4    306.09 11278 715.51
## + Season        3    121.33 11463 716.23
## 
## Step:  AIC=708.28
## Grade ~ StudyTime + FBFriends
## 
##                Df Sum of Sq   RSS    AIC
## <none>                      11196 708.28
## + Home          1    36.645 11159 709.73
## + ExerciseTime  1    22.264 11173 709.95
## + Clubs         1    14.685 11181 710.06
## + Pet           1    13.035 11183 710.09
## + Working       1     9.518 11186 710.14
## + Season        3   138.816 11057 712.20
## + Cohort        4   232.666 10963 712.78
# The selected model keeps StudyTime and FBFriends only.
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime + FBFriends, data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5986  -4.7076  -0.4714   5.4056  18.5365 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.951146   1.491774  44.880  < 2e-16 ***
## StudyTime    0.152579   0.051010   2.991  0.00321 ** 
## FBFriends    0.003933   0.001648   2.387  0.01814 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.262 on 164 degrees of freedom
## Multiple R-squared:  0.07968,    Adjusted R-squared:  0.06846 
## F-statistic:   7.1 on 2 and 164 DF,  p-value: 0.001104

Backwards step-wise regression

## Backwards stepwise model selection using AIC criterion
# Start from the full model and drop one term at a time while AIC improves.
# (The original refitted lm(Grade ~ ., survey3) inline; fit1 is that exact
# model, so use it.)
fit = step(fit1, direction = "backward", scope = list(lower = fit2, 
    upper = fit1))
## Start:  AIC=725.65
## Grade ~ StudyTime + Cohort + FBFriends + ExerciseTime + Working + 
##     Pet + Home + Clubs + Season
## 
##                Df Sum of Sq   RSS    AIC
## - Cohort        4    205.20 10965 720.81
## - Season        3    117.86 10878 721.47
## - Clubs         1      1.70 10762 723.68
## - Working       1     11.34 10771 723.83
## - Home          1     14.97 10775 723.89
## - ExerciseTime  1     25.71 10786 724.05
## - Pet           1     31.42 10791 724.14
## <none>                      10760 725.65
## - FBFriends     1    324.05 11084 728.61
## - StudyTime     1    499.05 11259 731.23
## 
## Step:  AIC=720.81
## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + 
##     Home + Clubs + Season
## 
##                Df Sum of Sq   RSS    AIC
## - Season        3    136.33 11102 716.87
## - Working       1      1.57 10967 718.83
## - Clubs         1      4.77 10970 718.88
## - Pet           1     12.33 10978 719.00
## - Home          1     34.63 11000 719.34
## - ExerciseTime  1     34.67 11000 719.34
## <none>                      10965 720.81
## - FBFriends     1    383.67 11349 724.55
## - StudyTime     1    597.01 11562 727.66
## 
## Step:  AIC=716.87
## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + 
##     Home + Clubs
## 
##                Df Sum of Sq   RSS    AIC
## - Clubs         1      7.37 11109 714.98
## - Working       1      9.26 11111 715.01
## - Pet           1     18.17 11120 715.15
## - ExerciseTime  1     20.70 11122 715.18
## - Home          1     35.21 11137 715.40
## <none>                      11102 716.87
## - FBFriends     1    367.14 11469 720.31
## - StudyTime     1    558.32 11660 723.07
## 
## Step:  AIC=714.98
## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + 
##     Home
## 
##                Df Sum of Sq   RSS    AIC
## - Working       1     12.77 11122 713.18
## - Pet           1     18.18 11127 713.26
## - ExerciseTime  1     20.64 11130 713.29
## - Home          1     39.11 11148 713.57
## <none>                      11109 714.98
## - FBFriends     1    365.15 11474 718.38
## - StudyTime     1    565.89 11675 721.28
## 
## Step:  AIC=713.18
## Grade ~ StudyTime + FBFriends + ExerciseTime + Pet + Home
## 
##                Df Sum of Sq   RSS    AIC
## - Pet           1     13.61 11135 711.38
## - ExerciseTime  1     25.97 11148 711.56
## - Home          1     36.29 11158 711.72
## <none>                      11122 713.18
## - FBFriends     1    354.16 11476 716.41
## - StudyTime     1    608.37 11730 720.07
## 
## Step:  AIC=711.38
## Grade ~ StudyTime + FBFriends + ExerciseTime + Home
## 
##                Df Sum of Sq   RSS    AIC
## - ExerciseTime  1     23.71 11159 709.73
## - Home          1     38.09 11173 709.95
## <none>                      11135 711.38
## - FBFriends     1    397.36 11533 715.24
## - StudyTime     1    604.59 11740 718.21
## 
## Step:  AIC=709.73
## Grade ~ StudyTime + FBFriends + Home
## 
##             Df Sum of Sq   RSS    AIC
## - Home       1     36.65 11196 708.28
## <none>                   11159 709.73
## - FBFriends  1    375.60 11535 713.26
## - StudyTime  1    599.52 11758 716.47
## 
## Step:  AIC=708.28
## Grade ~ StudyTime + FBFriends
## 
##             Df Sum of Sq   RSS    AIC
## <none>                   11196 708.28
## - FBFriends  1    388.87 11584 711.98
## - StudyTime  1    610.79 11806 715.15
# Backwards selection lands on the same model as forwards selection.
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime + FBFriends, data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5986  -4.7076  -0.4714   5.4056  18.5365 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.951146   1.491774  44.880  < 2e-16 ***
## StudyTime    0.152579   0.051010   2.991  0.00321 ** 
## FBFriends    0.003933   0.001648   2.387  0.01814 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.262 on 164 degrees of freedom
## Multiple R-squared:  0.07968,    Adjusted R-squared:  0.06846 
## F-statistic:   7.1 on 2 and 164 DF,  p-value: 0.001104

Important… Colinearity

# Simulate collinear measurements: head circumference and foot length are
# both noisy readouts of the same latent variable (height), which in turn
# tracks weight.  The seed fixes the draws so the slides are reproducible.
set.seed(51773)
weight <- rnorm(100)
height <- rnorm(100, weight)
headCirc <- rnorm(100, height, 0.3)
footLength <- rnorm(100, height, 0.3)

# In the notation of the slides: Y = weight, w = height,
# x = head circumference (headCirc), z = foot length (footLength).

pairs(data.frame(weight, headCirc, footLength))

Colinearity

# Each proxy on its own is a highly significant predictor of weight...
summary(lm(weight ~ headCirc))
## 
## Call:
## lm(formula = weight ~ headCirc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81770 -0.50122  0.09853  0.46984  1.76106 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.07317    0.07679  -0.953    0.343    
## headCirc     0.48045    0.05297   9.071 1.26e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7676 on 98 degrees of freedom
## Multiple R-squared:  0.4564, Adjusted R-squared:  0.4508 
## F-statistic: 82.27 on 1 and 98 DF,  p-value: 1.259e-14
summary(lm(weight ~ footLength))
## 
## Call:
## lm(formula = weight ~ footLength)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7894 -0.4505  0.0778  0.4384  2.0889 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.11457    0.07614  -1.505    0.136    
## footLength   0.49009    0.05302   9.244  5.3e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.761 on 98 degrees of freedom
## Multiple R-squared:  0.4658, Adjusted R-squared:  0.4603 
## F-statistic: 85.44 on 1 and 98 DF,  p-value: 5.304e-15
# ...but together neither is significant: the two variables carry nearly
# the same information, so collinearity inflates their standard errors.
summary(lm(weight ~ headCirc + footLength))
## 
## Call:
## lm(formula = weight ~ headCirc + footLength)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76444 -0.44618  0.04217  0.48183  1.88197 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.09859    0.07753  -1.272   0.2065  
## headCirc     0.19130    0.17838   1.072   0.2862  
## footLength   0.30547    0.18011   1.696   0.0931 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7604 on 97 degrees of freedom
## Multiple R-squared:  0.472,  Adjusted R-squared:  0.4612 
## F-statistic: 43.36 on 2 and 97 DF,  p-value: 3.517e-14

GLM

GLM


  • Generalized linear models are a generalized form of regression.

  • A GLM does not assume a linear relationship between the dependent and independent variables. Instead, it assumes a linear relationship between a link function of the mean response and the independent variables (e.g. the logit link in logistic regression).

  • Examples of GLM include logistic regression and Poisson regression.

  • Link for logistic regression is \[log(\frac{p}{1-p}) = \alpha + \beta x\] where \(p\) is the probability of “success”.

Logistic regression

  • Logistic regression can be used to fit a regression with a binary response variable. E.g. Kidney failure Yes/No

  • Logistic regression models the probability of our response variable, e.g. “Kidney failure”, belonging to a particular category.

  • Gender: male or female

Heart transplant


Transplant rejection of patients that were on the waiting list for the Stanford heart transplant program.

  • Score developed to assign hearts for transplant.

  • Is the score effective?

We have patients that reject the heart within one year and patients that survive without rejection after one year.



J Crowley and M Hu (1977), Covariance analysis of heart transplant survival data. Journal of the American Statistical Association, 72, 27-36.

Explore

# Compare match scores between rejection outcomes.  Pass the data frame
# explicitly so the formula variables resolve from heart1 (consistent with
# every other heart1 call below) rather than relying on attached globals.
boxplot(score ~ outcome, data = heart1, ylab = "Match score", xlab = "Survives")

Testing with logistic regression

# Logistic regression of the rejection outcome on the mismatch score.
fit = glm(outcome ~ score, heart1, family = binomial)
summary(fit)
## 
## Call:
## glm(formula = outcome ~ score, family = binomial, data = heart1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6415  -1.1254   0.6736   1.0449   1.4326  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.3115     0.8397  -1.562   0.1183  
## score         1.2153     0.6371   1.907   0.0565 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.843  on 41  degrees of freedom
## Residual deviance: 53.640  on 40  degrees of freedom
##   (23 observations deleted due to missingness)
## AIC: 57.64
## 
## Number of Fisher Scoring iterations: 4
# Visual diagnostics for logistic regression aren't appropriate
plot(fit)

Multiple logistic regression

We have more information than just a mismatch score.

# List all available variables in the heart transplant data.
colnames(heart1)
##  [1] "birth.dt"   "accept.dt"  "tx.date"    "fu.date"    "fustat"    
##  [6] "surgery"    "age"        "futime"     "wait.time"  "transplant"
## [11] "mismatch"   "hla.a2"     "mscore"     "reject"

Can we do better at predicting rejection?

Multiple logistic regression

# Multiple logistic regression: add the clinical covariates to the
# mismatch score.  Only age reaches significance at the 5% level.
fit = glm(outcome ~ surgery + age + wait.time + mismatch + hla.a2 + mscore, heart1, 
    family = binomial)
summary(fit)
## 
## Call:
## glm(formula = outcome ~ surgery + age + wait.time + mismatch + 
##     hla.a2 + mscore, family = binomial, data = heart1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0951  -0.6967   0.3447   0.7897   1.7360  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.028984   3.568025  -1.970   0.0488 *
## surgery     -1.633404   0.968140  -1.687   0.0916 .
## age          0.146055   0.063019   2.318   0.0205 *
## wait.time   -0.008959   0.009938  -0.902   0.3673  
## mismatch    -0.179936   0.577919  -0.311   0.7555  
## hla.a2      -0.392561   1.037202  -0.378   0.7051  
## mscore       1.307939   0.978552   1.337   0.1814  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.843  on 41  degrees of freedom
## Residual deviance: 41.316  on 35  degrees of freedom
##   (23 observations deleted due to missingness)
## AIC: 55.316
## 
## Number of Fisher Scoring iterations: 4

Model selection

# Forward stepwise selection by AIC, from the intercept-only model up to
# all six clinical predictors.
fit = step(glm(outcome ~ 1, heart1, family = binomial), scope = list(upper = ~surgery + 
    age + wait.time + mismatch + hla.a2 + mscore), direction = "forward")
## Start:  AIC=59.84
## outcome ~ 1
## 
##             Df Deviance    AIC
## + age        1   48.939 52.939
## + surgery    1   52.797 56.797
## + mscore     1   53.640 57.640
## <none>           57.843 59.843
## + wait.time  1   56.678 60.678
## + hla.a2     1   57.065 61.065
## + mismatch   1   57.838 61.838
## 
## Step:  AIC=52.94
## outcome ~ age
## 
##             Df Deviance    AIC
## + surgery    1   45.324 51.324
## + mscore     1   45.750 51.750
## <none>           48.939 52.939
## + wait.time  1   47.487 53.487
## + hla.a2     1   48.771 54.771
## + mismatch   1   48.869 54.869
## 
## Step:  AIC=51.32
## outcome ~ age + surgery
## 
##             Df Deviance    AIC
## + mscore     1   42.788 50.788
## <none>           45.324 51.324
## + wait.time  1   43.790 51.790
## + mismatch   1   45.009 53.009
## + hla.a2     1   45.191 53.191
## 
## Step:  AIC=50.79
## outcome ~ age + surgery + mscore
## 
##             Df Deviance    AIC
## <none>           42.788 50.788
## + wait.time  1   41.674 51.674
## + mismatch   1   42.359 52.359
## + hla.a2     1   42.475 52.475
# The selected model keeps age, surgery and the mismatch score.
summary(fit)
## 
## Call:
## glm(formula = outcome ~ age + surgery + mscore, family = binomial, 
##     data = heart1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1036  -0.7670   0.3920   0.8025   1.8747  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.37758    3.04074  -2.426   0.0153 *
## age          0.13604    0.06027   2.257   0.0240 *
## surgery     -1.56404    0.96117  -1.627   0.1037  
## mscore       1.17452    0.78748   1.491   0.1358  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.843  on 41  degrees of freedom
## Residual deviance: 42.788  on 38  degrees of freedom
##   (23 observations deleted due to missingness)
## AIC: 50.788
## 
## Number of Fisher Scoring iterations: 4

Odds ratios

In a logistic regression the regression coefficients can be interpreted as odds ratios. Specifically, \[OR = exp(\beta_i) = e^{\beta_i}\] For every unit change in \(x_i\) the odds of disease are multiplied by \(e^{\beta_i}\).

Odds ratios

# Exponentiate the fitted coefficients to express them as odds ratios.
exp(coef(fit))
##  (Intercept)          age      surgery       mscore 
## 0.0006251121 1.1457303643 0.2092890864 3.2365754902

Getting side-tracked….

Can we predict who still lives at home?

# Forward stepwise selection for predicting living at home:
# start from the intercept-only model (fit2) and consider adding any
# predictor available in the full model (fit1), keeping each addition
# only while it lowers the AIC.
fit1 <- glm(Home ~ ., survey2, family = binomial)
fit2 <- glm(Home ~ 1, survey2, family = binomial)
fit = step(glm(Home ~ 1, survey2, family = binomial), direction = "forward", scope = list(lower = fit2, 
    upper = fit1))
## Start:  AIC=227.99
## Home ~ 1
## 
##                Df Deviance    AIC
## + Cohort        4   211.86 221.86
## + Grade         1   223.97 227.97
## <none>              225.99 227.99
## + Working       1   224.31 228.31
## + Clubs         1   225.02 229.02
## + FBFriends     1   225.47 229.47
## + StudyTime     1   225.57 229.57
## + ExerciseTime  1   225.60 229.60
## + Pet           1   225.70 229.70
## + Season        3   223.70 231.70
## 
## Step:  AIC=221.86
## Home ~ Cohort
## 
##                Df Deviance    AIC
## <none>              211.86 221.86
## + Working       1   210.57 222.57
## + Grade         1   210.91 222.91
## + Clubs         1   210.95 222.95
## + Pet           1   211.15 223.15
## + ExerciseTime  1   211.32 223.32
## + FBFriends     1   211.48 223.48
## + StudyTime     1   211.79 223.79
## + Season        3   210.88 226.88
summary(fit)
## 
## Call:
## glm(formula = Home ~ Cohort, family = binomial, data = survey2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8930  -1.2601   0.7002   1.0969   1.2814  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           0.1924     0.2352   0.818   0.4133  
## CohortAMED3002.2018   1.4171     0.8095   1.751   0.0800 .
## CohortAMED3002.2019  -0.4335     0.4665  -0.929   0.3527  
## CohortAMED3002.2020   1.0116     0.4469   2.263   0.0236 *
## CohortAMED3002.2021   1.0886     0.5575   1.952   0.0509 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.99  on 171  degrees of freedom
## Residual deviance: 211.86  on 167  degrees of freedom
## AIC: 221.86
## 
## Number of Fisher Scoring iterations: 4
exp(coef(summary(fit))[, 1])
##         (Intercept) CohortAMED3002.2018 CohortAMED3002.2019 CohortAMED3002.2020 
##           1.2121212           4.1250000           0.6482143           2.7500000 
## CohortAMED3002.2021 
##           2.9700000
table(survey2$Home, survey2$Cohort)
##      
##       STAT2012 AMED3002.2018 AMED3002.2019 AMED3002.2020 AMED3002.2021
##   No        33             2            14             9             5
##   Yes       40            10            11            30            18
# Refit the selected model directly and summarise it.
fit <- glm(Home ~ Cohort, survey2, family = binomial)
summary(fit)
## 
## Call:
## glm(formula = Home ~ Cohort, family = binomial, data = survey2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8930  -1.2601   0.7002   1.0969   1.2814  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           0.1924     0.2352   0.818   0.4133  
## CohortAMED3002.2018   1.4171     0.8095   1.751   0.0800 .
## CohortAMED3002.2019  -0.4335     0.4665  -0.929   0.3527  
## CohortAMED3002.2020   1.0116     0.4469   2.263   0.0236 *
## CohortAMED3002.2021   1.0886     0.5575   1.952   0.0509 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.99  on 171  degrees of freedom
## Residual deviance: 211.86  on 167  degrees of freedom
## AIC: 221.86
## 
## Number of Fisher Scoring iterations: 4
chisq.test(table(survey2$Home, survey2$Cohort))
## Warning in chisq.test(table(survey2$Home, survey2$Cohort)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(survey2$Home, survey2$Cohort)
## X-squared = 13.697, df = 4, p-value = 0.008326

Survival analysis

Heart transplant


Transplant rejection of patients that were on the waiting list for the Stanford heart transplant program.

  • Score developed to assign hearts for transplant.

  • Is the score effective?

We have patients that reject the heart within one year and patients that survive without a rejection for more than one year.

We have information on how long each patient has been followed up and whether they had a rejection.


J Crowley and M Hu (1977), Covariance analysis of heart transplant survival data. Journal of the American Statistical Association, 72, 27-36.

Explore

Survival analysis

We will fit a Cox proportional-hazards model

# Cox proportional-hazards model for time to transplant rejection.
# futime is the follow-up time and reject the event indicator
# (censored observations are those without a rejection).
fit = coxph(Surv(futime, reject) ~ surgery + age + wait.time + mismatch + hla.a2 + 
    mscore, heart1)
# Print the fitted model: hazard ratios (exp(coef)) and Wald tests.
fit
## Call:
## coxph(formula = Surv(futime, reject) ~ surgery + age + wait.time + 
##     mismatch + hla.a2 + mscore, data = heart1)
## 
##                coef exp(coef)  se(coef)      z       p
## surgery   -0.982678  0.374307  0.625297 -1.572 0.11606
## age        0.095668  1.100393  0.030633  3.123 0.00179
## wait.time -0.009470  0.990575  0.007487 -1.265 0.20590
## mismatch  -0.110350  0.895521  0.229665 -0.480 0.63088
## hla.a2     0.275834  1.317629  0.469523  0.587 0.55688
## mscore     0.780048  2.181577  0.379562  2.055 0.03987
## 
## Likelihood ratio test=27.3  on 6 df, p=0.000127
## n= 65, number of events= 29

Survival analysis


We use the “Surv” object to represent survival data.

with(heart1, Surv(futime, reject))
##  [1]   15+   38+  674    57   152+   80  1386   307    42+   27  1031    50 
## [13]  732   218  1799+   71+  851    76  1586+ 1571+   99    65     4+ 1407+
## [25] 1321+   44+  995    71  1141+  284    60   941+  342   915+   67   841+
## [37]  583+   77   284+   67   669+   29   619+  595+   89    16+  544+  514+
## [49]   95   481+  444+   79+  333   396+  109   369+  206   185+  339+  264+
## [61]  164   179+  130+  108+   38+

Survival analysis

We will fit a Cox proportional-hazards model with step-wise model selection.

# Stepwise model selection (step's default, both directions) on the
# full Cox model, removing terms while the AIC improves.
fit = step(coxph(Surv(futime, reject) ~ surgery + age + wait.time + mismatch + hla.a2 + 
    mscore, heart1))
## Start:  AIC=185.23
## Surv(futime, reject) ~ surgery + age + wait.time + mismatch + 
##     hla.a2 + mscore
## 
##             Df    AIC
## - mismatch   1 183.47
## - hla.a2     1 183.58
## <none>         185.23
## - wait.time  1 185.41
## - surgery    1 186.31
## - mscore     1 187.06
## - age        1 196.14
## 
## Step:  AIC=183.47
## Surv(futime, reject) ~ surgery + age + wait.time + hla.a2 + mscore
## 
##             Df    AIC
## - hla.a2     1 181.73
## <none>         183.47
## - wait.time  1 183.64
## - surgery    1 184.49
## - mscore     1 185.07
## - age        1 194.24
## 
## Step:  AIC=181.73
## Surv(futime, reject) ~ surgery + age + wait.time + mscore
## 
##             Df    AIC
## <none>         181.73
## - wait.time  1 181.87
## - surgery    1 182.59
## - mscore     1 185.15
## - age        1 192.25
fit
## Call:
## coxph(formula = Surv(futime, reject) ~ surgery + age + wait.time + 
##     mscore, data = heart1)
## 
##                coef exp(coef)  se(coef)      z       p
## surgery   -0.946733  0.388007  0.623963 -1.517 0.12919
## age        0.095073  1.099739  0.030895  3.077 0.00209
## wait.time -0.009528  0.990517  0.007573 -1.258 0.20832
## mscore     0.795538  2.215632  0.328914  2.419 0.01558
## 
## Likelihood ratio test=26.81  on 4 df, p=2.172e-05
## n= 65, number of events= 29

Visualising

If we binarize our score we can use a Kaplan-Meier curve.

# Dichotomise the mismatch score at its median: FALSE = small, TRUE = large.
mscore2 = as.factor(heart1$mscore > median(heart1$mscore))
# Kaplan-Meier curves by score group. survfit orders the strata
# FALSE ("small") then TRUE ("large"), so col = c(1, 2) draws the
# small-mscore curve in black (1) and the large-mscore curve in red (2).
plot(survfit(Surv(futime/365, reject) ~ mscore2, heart1), col = c(1, 2), xlab = "time", 
    ylab = "Probability of success", mark.time = TRUE)
# Legend colours must match the plotted strata order:
# small mscore = black (1), large mscore = red (2).
# (The original col = 2:1 swapped the colours relative to the curves.)
legend("topright", c("small mscore", "large mscore"), col = 1:2, lty = 1)

Summary