University of Sydney

Recap

What if we have two variables?

Qualitative vs Qualitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & Y \\ 2 & A & Y \\ 3 & A & X \\ 4 & B & Y \\ 5 & A & Z \\ 6 & B & X \\ 7 & B & Z \\ 8 & B & Z \\ 9 & A & X \\ 10 & A & X \\ \hline \end{array} \]

What if we have two variables?

Qualitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & 11.7 \\ 2 & A & 10.5 \\ 3 & A & 8.7 \\ 4 & B & 9.3 \\ 5 & A & 9.6 \\ 6 & B & 11.2 \\ 7 & B & 10.4 \\ 8 & B & 10.4 \\ 9 & A & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]

What if we have two variables?

Qualitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & 11.7 \\ 2 & A & 10.5 \\ 3 & A & 8.7 \\ 4 & A & 9.3 \\ 5 & A & 9.6 \\ 6 & A & 11.2 \\ 7 & A & 10.4 \\ 8 & A & 10.4 \\ 9 & A & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]

What if we have two variables?

Qualitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & C & 11.7 \\ 2 & C & 10.5 \\ 3 & C & 8.7 \\ 4 & B & 9.3 \\ 5 & C & 9.6 \\ 6 & B & 11.2 \\ 7 & B & 10.4 \\ 8 & B & 10.4 \\ 9 & C & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]

What if we have two variables?

Quantitative vs Quantitative

\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & 9.4 & 11.2 \\ 2 & 9.8 & 10.4 \\ 3 & 11.6 & 10.4 \\ 4 & 10.1 & 10.1 \\ 5 & 10.1 & 9.4 \\ 6 & 11.7 & 11.8 \\ 7 & 10.5 & 10.5 \\ 8 & 8.7 & 8.0 \\ 9 & 9.3 & 10.7 \\ 10 & 9.6 & 9.5 \\ \hline \end{array} \]

Multiple linear regression

Grades


  • We recently conducted a class survey.

  • Is there a relationship between the amount of time spent studying and grades?

Data wrangling

suppressPackageStartupMessages(library(tidyverse))
# Read the class survey responses directly from the course web server.
survey = read.csv("https://wimr-genomics.vip.sydney.edu.au/AMED3002/data/ClassSurveyv5.csv")


## Take the questions that overlap between Cohorts.
# Later columns were only asked of some cohorts, so keep the first 18 only.
survey = survey[, 1:18]


## rename column names
# Inspect the raw (question-text) column names before renaming them.
colnames(survey)
##  [1] "ï..Timestamp"                                                       
##  [2] "Survey.ID..make.up.a.name."                                         
##  [3] "Gender"                                                             
##  [4] "Postcode.of.where.you.live"                                         
##  [5] "What.introductory.statistic.course.did.you.take."                   
##  [6] "How.many.university.clubs.are.you.a.member.of."                     
##  [7] "How.many.hours.per.week.did.you.spend.on.studying.last.year."       
##  [8] "What.was.your.study.load.last.year."                                
##  [9] "What.is.your.favourite.social.media.platform.."                     
## [10] "How.many.siblings.do.you.have.."                                    
## [11] "How.many.friends.do.you.have.on.Facebook.."                         
## [12] "What.is.your.average.grade.from.last.year..please.enter.a.number..."
## [13] "Did.you.have.a.pet.growing.up."                                     
## [14] "Do.you.still.live.at.home."                                         
## [15] "How.many.hours.a.week.do.you.spend.exercising."                     
## [16] "What.is.your.eye.colour."                                           
## [17] "How.many.hours.do.you.work.per.week."                               
## [18] "What.is.your.favourite.season."
# Replace the long question-text names with short, code-friendly names
# (newname[i] replaces oldname[i], so order must match the columns above).
oldname = colnames(survey)
newname = c("Time", "ID", "Gender", "Postcode", "StatsCourse", "Clubs", "StudyTime", 
    "StudyLoad", "SocialMedia", "Siblings", "FBFriends", "Grade", "Pet", "Home", 
    "ExerciseTime", "Eyecolor", "Working", "Season")
colnames(survey) = newname


# Clean up the results a little bit.
# Grades were entered as free text: recode ranges to their midpoint and
# letter grades to a representative mark (pass = 50, Credit = 65,
# Distinction = 75, High Distinction = 85) before coercing to numeric.
survey$Grade = as.character(survey$Grade)
survey$Grade[survey$Grade == "n/a"] = NA
survey$Grade[survey$Grade == "50-60"] = 55
survey$Grade[survey$Grade == "pass"] = 50
survey$Grade[survey$Grade == "D"] = 75
survey$Grade[survey$Grade == "c"] = 65
survey$Grade[survey$Grade == "C "] = 65
survey$Grade[survey$Grade == "HD"] = 85
survey$Grade[survey$Grade == "WAM of seventy-two"] = 72
# Any remaining non-numeric strings become NA here (hence the warning).
survey$Grade = as.numeric(survey$Grade)
## Warning: NAs introduced by coercion
# StudyTime is free text: recode ranges to midpoints and worded answers to
# numbers, mark unusable answers as missing, strip decoration, then coerce
# to numeric.  (A duplicated "8hr" recode line was removed.)
survey$StudyTime = as.character(survey$StudyTime)
survey$StudyTime[survey$StudyTime == "5-6"] = 5.5
survey$StudyTime[survey$StudyTime == "10-20"] = 15
survey$StudyTime[survey$StudyTime == "5-10"] = 7.5
survey$StudyTime[survey$StudyTime == "8hr"] = 8
survey$StudyTime[survey$StudyTime == "28 hours"] = 28
survey$StudyTime[survey$StudyTime == "didn't start uni maybe 6h"] = 6
survey$StudyTime[survey$StudyTime == "20-"] = 22
survey$StudyTime[survey$StudyTime == "20-25?"] = 22.5
survey$StudyTime[survey$StudyTime == "15-25"] = 20
survey$StudyTime[survey$StudyTime == "10 (excluding attending university) "] = 10
survey$StudyTime[survey$StudyTime == "10?"] = 10
survey$StudyTime[survey$StudyTime == "80?"] = 80
survey$StudyTime[survey$StudyTime == "10-12 extracrricular"] = 11
survey$StudyTime[survey$StudyTime == "25 hours a week"] = 25
survey$StudyTime[survey$StudyTime == "20-24"] = 22
survey$StudyTime[survey$StudyTime == "10 hours for each unit"] = 40
survey$StudyTime[survey$StudyTime == "30-40"] = 35

# Answers that carry no usable number are treated as missing.
survey$StudyTime[survey$StudyTime == "not sure"] = NA
survey$StudyTime[survey$StudyTime == "not sure "] = NA
survey$StudyTime[survey$StudyTime == "unsure"] = NA
survey$StudyTime[survey$StudyTime == "Unsure"] = NA
survey$StudyTime[survey$StudyTime == "Too many"] = NA
survey$StudyTime[survey$StudyTime == "too many"] = NA
survey$StudyTime[survey$StudyTime == "I have no idea "] = NA
survey$StudyTime[survey$StudyTime == "n/a"] = NA
survey$StudyTime[survey$StudyTime == "lost"] = NA
survey$StudyTime[survey$StudyTime == "lots"] = NA

# Strip decoration ("+", spaces, "~", "?") left on otherwise numeric answers.
survey$StudyTime = gsub("\\+", "", survey$StudyTime)
survey$StudyTime = gsub(" ", "", survey$StudyTime)
survey$StudyTime = gsub("~", "", survey$StudyTime)
survey$StudyTime = gsub("\\?", "", survey$StudyTime)

# Anything still non-numeric becomes NA here (hence the warning).
survey$StudyTime = as.numeric(survey$StudyTime)
## Warning: NAs introduced by coercion
# FBFriends: strip decoration, recode worded answers, then mean-impute the
# remaining missing values before writing the column back.
FB = survey$FBFriends
FB = gsub("\\+", "", FB)
FB = gsub("~", "", FB)
FB = gsub(">", "", FB)
FB = gsub(" ish", "", FB)
FB[FB == "over 1.2k"] = 1201
FB[FB == "Don't have facebook"] = 0
FB[FB == "Don't have FB"] = 0
FB[FB == "none (not in facebook)"] = 0
FB[FB == "Cant remember"] = NA
FB[FB == "not sure"] = NA
FB[FB == "I don't know"] = NA
FB[FB == "do not know"] = NA
FB[FB == "Too many"] = NA
FB = as.numeric(FB)
## Warning: NAs introduced by coercion
# Replace missing counts with the mean of the observed counts.
FB[is.na(FB)] = mean(FB, na.rm = TRUE)
survey[, "FBFriends"] = FB


# ExerciseTime: recode ranges to midpoints and worded answers to numbers;
# an empty response is taken to mean no exercise (0 hours).
# (A duplicated "2 hours" recode line was removed.)
ET = as.character(survey$ExerciseTime)
ET[ET == ""] = 0
ET[ET == "5-6"] = 5.5
ET[ET == "2-3"] = 2.5
ET[ET == "4-5"] = 4.5
ET[ET == "6 hr"] = 6
ET[ET == "none"] = 0
ET[ET == "2 hours"] = 2
# Anything still non-numeric becomes NA here (hence the warning).
survey[, "ExerciseTime"] = as.numeric(ET)
## Warning: NAs introduced by coercion
# Working hours per week: recode text and ranges; empty/"none"/"NULL" mean
# no paid work (0).  168 hours (every hour of the week) is implausible and
# is treated as missing.
WK = as.character(survey[, "Working"])
WK[WK == ""] = 0
WK[WK == "18 hours"] = 18
WK[WK == "8-12"] = 10
WK[WK == "168"] = NA
WK[WK == "none"] = 0
WK[WK == "Approx 25"] = 25
WK[WK == "NULL"] = 0
WK[WK == "10-15"] = 12.5
survey[, "Working"] = as.numeric(WK)
## Warning: NAs introduced by coercion
# Pet and Home are Yes/No questions: blank responses become NA, then each
# column is converted back to a factor.
PT = as.character(survey$Pet)
PT[PT == ""] = NA
survey$Pet = as.factor(PT)


HM = as.character(survey$Home)
HM[HM == ""] = NA
survey$Home = as.factor(HM)


# Siblings: convert words to numbers; 165 siblings is implausible -> NA.
SB = as.character(survey$Siblings)
SB[SB == "none"] = 0
SB[SB == "NA"] = 0
SB[SB == "165"] = NA
SB[SB == "one"] = 1
SB[SB == "One"] = 1
survey$Siblings = as.numeric(SB)


# Clubs: map descriptive answers to counts before coercing to numeric.
CB = as.character(survey$Clubs)
# Any answer containing "none" (case-insensitive) counts as zero clubs.
CB[grep("none", CB, ignore.case = TRUE)] = 0
CB[CB == "10+"] = 10
CB[CB == "18 or so? executive of 9"] = 18
CB[CB == "Dog Society, Sydney Uni Car Club"] = 2
CB[CB == "3-4"] = 3.5
CB[CB == "N/A"] = 0
survey$Clubs = as.numeric(CB)
## Warning: NAs introduced by coercion
# Season: blank responses become NA, then convert to a factor.
SN = as.character(survey$Season)
SN[SN == ""] = NA
survey$Season = as.factor(SN)

# Derive the cohort from the survey timestamp.  Each logical comparison
# contributes 1 when the response date falls after the cut-off, so the sum
# (0-4) indexes the cohort.  The bare strings ("2018/01/01", ...) are
# coerced by the Date comparison using as.Date()'s default formats.
survey$Cohort = (as.Date(survey$Time, "%m/%d/%Y") > "2018/01/01") + (as.Date(survey$Time, 
    "%m/%d/%Y") > "2019/01/01") + (as.Date(survey$Time, "%m/%d/%Y") > "2020/01/01") + 
    (as.Date(survey$Time, "%m/%d/%Y") > "2021/01/01")
# Label the 0-4 counts with the corresponding course offering.
survey$Cohort = factor(survey$Cohort, levels = c(0, 1, 2, 3, 4), labels = c("STAT2012", 
    "AMED3002.2018", "AMED3002.2019", "AMED3002.2020", "AMED3002.2021"))

Data exploration

# Confirm every column now has the intended type (num vs Factor).
str(survey)
## 'data.frame':    204 obs. of  19 variables:
##  $ Time        : Factor w/ 203 levels "3/11/2019 10:00",..: 124 125 126 127 128 129 129 130 131 132 ...
##  $ ID          : Factor w/ 199 levels "123454321","1907",..: 159 186 123 26 24 131 130 190 102 88 ...
##  $ Gender      : Factor w/ 4 levels "crocodilian",..: 3 2 2 3 3 2 3 3 2 3 ...
##  $ Postcode    : Factor w/ 112 levels "14052","2000",..: 21 68 50 85 66 78 25 54 21 2 ...
##  $ StatsCourse : Factor w/ 18 levels "","BUSS1020",..: 8 8 8 8 8 8 1 8 8 8 ...
##  $ Clubs       : num  0 0 2 3 2 1 2 2 1 2 ...
##  $ StudyTime   : num  50 10 7 70 30 NA 20 25 5 15 ...
##  $ StudyLoad   : Factor w/ 3 levels "","full-time",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SocialMedia : Factor w/ 22 levels "canvas","Discord",..: 3 14 3 3 3 17 3 3 17 17 ...
##  $ Siblings    : num  1 2 2 0 2 0 1 2 3 1 ...
##  $ FBFriends   : num  90 600 115 100 263 10 120 150 228 200 ...
##  $ Grade       : num  72 73 64 90 90.5 66 74 72 60 62 ...
##  $ Pet         : Factor w/ 2 levels "No","Yes": 2 1 2 NA 2 1 1 1 1 2 ...
##  $ Home        : Factor w/ 2 levels "No","Yes": 1 2 2 NA 2 2 1 2 1 2 ...
##  $ ExerciseTime: num  3 4 2 0 1 2 2 10 2 4 ...
##  $ Eyecolor    : Factor w/ 31 levels "","black","Black",..: 13 10 6 1 11 2 11 3 3 11 ...
##  $ Working     : num  0 15 0 0 10 0 0 15 0 5 ...
##  $ Season      : Factor w/ 4 levels "Autumn","Spring",..: 1 2 1 NA 4 1 2 3 3 2 ...
##  $ Cohort      : Factor w/ 5 levels "STAT2012","AMED3002.2018",..: 1 1 1 1 1 1 1 1 1 1 ...

Missing data

library(naniar)
# Visualise the pattern of missingness across all variables.
vis_miss(survey)

# Count missing values per column, from least to most.
sort(colSums(is.na(survey)))
##         Time       Gender     Postcode  StatsCourse    StudyLoad  SocialMedia 
##            0            0            0            0            0            0 
##    FBFriends       Cohort           ID        Clubs     Siblings     Eyecolor 
##            0            0            1            1            1            1 
##      Working        Grade          Pet       Season         Home ExerciseTime 
##            3            4            4            4            5            5 
##    StudyTime 
##           12
# Keep complete cases, then only the variables used in the models below.
# NOTE(review): na.omit() runs on all 19 columns BEFORE the selection, so a
# row missing only an unused column (e.g. ID) is also dropped -- confirm
# this is intended.
survey = na.omit(survey)[, c("Grade", "StudyTime", "Cohort", "FBFriends", "ExerciseTime", 
    "Working", "Pet", "Home", "Clubs", "Season")]

Clustering

# One-hot encode the factors so every variable is numeric (the "-1" drops
# the intercept column from the design matrix).
design <- as.data.frame(model.matrix(~. - 1, survey))
# Standardise each column, then hierarchically cluster the *variables*
# (hence the transpose) by Euclidean distance and plot the dendrogram.
hc <- hclust(dist(t(scale(design))))
plot(hc)

Take a look

# Scatterplot of grade against weekly study time.
ggplot(survey, aes(x = StudyTime, y = Grade)) + geom_point() + theme_classic()

Remove “outliers”

# Remove outlier
# Drop implausible responses (100+ study hours, grades of 100 or more).
survey2 = filter(survey, StudyTime < 100, Grade < 100)
ggplot(survey2, aes(x = StudyTime, y = Grade)) + geom_point(size = 3) + theme_classic()

Remove “outliers”

# Same scatterplot, coloured by cohort (theme_classic(20) = larger text).
ggplot(survey2, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) + 
    theme_classic(20)

Simple linear regression

# Simple linear regression of grade on weekly study time.
fit = lm(Grade ~ StudyTime, survey2)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime, data = survey2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.180  -4.156   0.806   7.820  23.384 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 64.23063    2.02474  31.723  < 2e-16 ***
## StudyTime    0.28212    0.08378   3.367 0.000939 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.81 on 170 degrees of freedom
## Multiple R-squared:  0.06253,    Adjusted R-squared:  0.05702 
## F-statistic: 11.34 on 1 and 170 DF,  p-value: 0.0009386
# assumptions: Linearity, Equality of variance, Independence of residuals,
# Normality of residuals.
# Standard lm diagnostic plots for checking the assumptions above.
plot(fit)

Assess robustness

# Remove outlier
# Check the fit's sensitivity to the very low grades (keep Grade > 40).
survey3 = filter(survey2, Grade > 40)

ggplot(survey3, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) + 
    theme_classic(20)

Assess robustness

# Refit after removing the low grades; the StudyTime slope roughly halves
# (0.28 -> 0.15), showing the original fit was driven partly by outliers.
fit = lm(Grade ~ StudyTime, survey3)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime, data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.8819  -5.1722  -0.3952   5.1721  20.3074 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  68.9086     1.2637  54.528  < 2e-16 ***
## StudyTime     0.1487     0.0517   2.875  0.00457 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.379 on 165 degrees of freedom
## Multiple R-squared:  0.04772,    Adjusted R-squared:  0.04195 
## F-statistic: 8.268 on 1 and 165 DF,  p-value: 0.004568
# Diagnostic plots for the refitted model.
plot(fit)

What trend?

# Overall least-squares trend with its confidence band.
ggplot(survey3, aes(x = StudyTime, y = Grade)) + geom_point(size = 3) + theme_classic(20) + 
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

What trend?

# A separate least-squares trend for each cohort.
ggplot(survey3, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) + 
    theme_classic(20) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Multiple linear regression


How well can we predict someone's grades given that we have multiple variables?

Simple linear regression: \(y = \alpha + \beta x\)


Multiple regression: \(y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 + \beta_5 x_5 + \beta_6 x_6 + \beta_7 x_7\)

Multiple linear regression

# Multiple regression: add exercise, work hours and pet ownership to
# study time.  Only StudyTime remains significant.
fit = lm(Grade ~ StudyTime + ExerciseTime + Working + Pet, survey3)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime + ExerciseTime + Working + Pet, 
##     data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.0771  -5.3196   0.0588   5.0408  19.9963 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  68.461937   1.708350  40.075  < 2e-16 ***
## StudyTime     0.149307   0.052729   2.832  0.00522 ** 
## ExerciseTime -0.037418   0.177410  -0.211  0.83322    
## Working      -0.006863   0.068455  -0.100  0.92027    
## PetYes        1.253974   1.343409   0.933  0.35199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.433 on 162 degrees of freedom
## Multiple R-squared:  0.05297,    Adjusted R-squared:  0.02959 
## F-statistic: 2.265 on 4 and 162 DF,  p-value: 0.06441
# Diagnostic plots for the multiple regression.
plot(fit)

Multiple linear regression

# Regress grade on every other variable in survey3 ("." = all remaining
# columns); factors are expanded into dummy variables automatically.
fit = lm(Grade ~ ., survey3)
summary(fit)
## 
## Call:
## lm(formula = Grade ~ ., data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.7162  -5.2790  -0.2158   5.0328  18.8700 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         65.735996   2.387897  27.529  < 2e-16 ***
## StudyTime            0.143045   0.053875   2.655  0.00877 ** 
## CohortAMED3002.2018 -0.700346   2.702710  -0.259  0.79589    
## CohortAMED3002.2019 -0.619427   2.189879  -0.283  0.77767    
## CohortAMED3002.2020  0.741144   1.802908   0.411  0.68159    
## CohortAMED3002.2021  3.159793   2.173712   1.454  0.14811    
## FBFriends            0.003967   0.001854   2.140  0.03399 *  
## ExerciseTime        -0.111892   0.185651  -0.603  0.54761    
## Working             -0.029265   0.073109  -0.400  0.68950    
## PetYes               0.932031   1.398933   0.666  0.50626    
## HomeYes              0.657698   1.430407   0.460  0.64632    
## Clubs                0.060157   0.388634   0.155  0.87719    
## SeasonSpring         0.017546   1.836321   0.010  0.99239    
## SeasonSummer         0.831259   1.965120   0.423  0.67289    
## SeasonWinter         2.210601   1.952846   1.132  0.25942    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.414 on 152 degrees of freedom
## Multiple R-squared:  0.1155, Adjusted R-squared:  0.03403 
## F-statistic: 1.418 on 14 and 152 DF,  p-value: 0.151
# Diagnostic plots for the full model.
plot(fit)

Variable selection


  • Sparsity

  • Parsimony


\(\mbox{Model 1:} \ \ y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 + \beta_5 x_5 + \beta_6 x_6 + \beta_7 x_7\)


\(\mbox{Model 2:} \ \ y = \alpha + \beta_2 x_2 + \beta_7 x_7\)

Variable selection


  • LASSO


  • Step-wise procedures
    • strategy to add/remove
    • measure to assess fit

Step-wise model selection

  • Define the range of models to test over.
# Full model
# Upper bound of the stepwise search: all candidate predictors.
fit1 <- lm(Grade ~ ., survey3)
# Null model
# Lower bound of the stepwise search: intercept only.
fit2 <- lm(Grade ~ 1, survey3)

Forwards step-wise regression

## Forward stepwise model selection using AIC criterion
# Start from the null model and add one term at a time while AIC improves,
# never going beyond the full model.  (The original refitted
# lm(Grade ~ 1, survey3) inline; fit2 is that exact model, so use it.)
fit = step(fit2, direction = "forward", scope = list(lower = fit2, 
    upper = fit1))
## Start:  AIC=718.15
## Grade ~ 1
## 
##                Df Sum of Sq   RSS    AIC
## + StudyTime     1    580.48 11584 711.98
## + FBFriends     1    358.57 11806 715.15
## <none>                      12165 718.15
## + Home          1     61.94 12103 719.30
## + Pet           1     50.14 12115 719.46
## + Clubs         1     23.10 12142 719.83
## + Working       1     10.12 12155 720.01
## + ExerciseTime  1      0.55 12164 720.14
## + Cohort        4    429.38 11736 720.15
## + Season        3    105.31 12060 722.70
## 
## Step:  AIC=711.98
## Grade ~ StudyTime
## 
##                Df Sum of Sq   RSS    AIC
## + FBFriends     1    388.87 11196 708.28
## <none>                      11584 711.98
## + Pet           1     59.43 11525 713.13
## + Home          1     49.92 11535 713.26
## + Clubs         1      8.24 11576 713.87
## + ExerciseTime  1      1.34 11583 713.97
## + Working       1      0.32 11584 713.98
## + Cohort        4    306.09 11278 715.51
## + Season        3    121.33 11463 716.23
## 
## Step:  AIC=708.28
## Grade ~ StudyTime + FBFriends
## 
##                Df Sum of Sq   RSS    AIC
## <none>                      11196 708.28
## + Home          1    36.645 11159 709.73
## + ExerciseTime  1    22.264 11173 709.95
## + Clubs         1    14.685 11181 710.06
## + Pet           1    13.035 11183 710.09
## + Working       1     9.518 11186 710.14
## + Season        3   138.816 11057 712.20
## + Cohort        4   232.666 10963 712.78
# The selected model keeps StudyTime and FBFriends only.
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime + FBFriends, data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5986  -4.7076  -0.4714   5.4056  18.5365 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.951146   1.491774  44.880  < 2e-16 ***
## StudyTime    0.152579   0.051010   2.991  0.00321 ** 
## FBFriends    0.003933   0.001648   2.387  0.01814 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.262 on 164 degrees of freedom
## Multiple R-squared:  0.07968,    Adjusted R-squared:  0.06846 
## F-statistic:   7.1 on 2 and 164 DF,  p-value: 0.001104

Backwards step-wise regression

## Backwards stepwise model selection using AIC criterion
# Start from the full model and drop one term at a time while AIC improves.
# (The original refitted lm(Grade ~ ., survey3) inline; fit1 is that exact
# model, so use it.)
fit = step(fit1, direction = "backward", scope = list(lower = fit2, 
    upper = fit1))
## Start:  AIC=725.65
## Grade ~ StudyTime + Cohort + FBFriends + ExerciseTime + Working + 
##     Pet + Home + Clubs + Season
## 
##                Df Sum of Sq   RSS    AIC
## - Cohort        4    205.20 10965 720.81
## - Season        3    117.86 10878 721.47
## - Clubs         1      1.70 10762 723.68
## - Working       1     11.34 10771 723.83
## - Home          1     14.97 10775 723.89
## - ExerciseTime  1     25.71 10786 724.05
## - Pet           1     31.42 10791 724.14
## <none>                      10760 725.65
## - FBFriends     1    324.05 11084 728.61
## - StudyTime     1    499.05 11259 731.23
## 
## Step:  AIC=720.81
## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + 
##     Home + Clubs + Season
## 
##                Df Sum of Sq   RSS    AIC
## - Season        3    136.33 11102 716.87
## - Working       1      1.57 10967 718.83
## - Clubs         1      4.77 10970 718.88
## - Pet           1     12.33 10978 719.00
## - Home          1     34.63 11000 719.34
## - ExerciseTime  1     34.67 11000 719.34
## <none>                      10965 720.81
## - FBFriends     1    383.67 11349 724.55
## - StudyTime     1    597.01 11562 727.66
## 
## Step:  AIC=716.87
## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + 
##     Home + Clubs
## 
##                Df Sum of Sq   RSS    AIC
## - Clubs         1      7.37 11109 714.98
## - Working       1      9.26 11111 715.01
## - Pet           1     18.17 11120 715.15
## - ExerciseTime  1     20.70 11122 715.18
## - Home          1     35.21 11137 715.40
## <none>                      11102 716.87
## - FBFriends     1    367.14 11469 720.31
## - StudyTime     1    558.32 11660 723.07
## 
## Step:  AIC=714.98
## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + 
##     Home
## 
##                Df Sum of Sq   RSS    AIC
## - Working       1     12.77 11122 713.18
## - Pet           1     18.18 11127 713.26
## - ExerciseTime  1     20.64 11130 713.29
## - Home          1     39.11 11148 713.57
## <none>                      11109 714.98
## - FBFriends     1    365.15 11474 718.38
## - StudyTime     1    565.89 11675 721.28
## 
## Step:  AIC=713.18
## Grade ~ StudyTime + FBFriends + ExerciseTime + Pet + Home
## 
##                Df Sum of Sq   RSS    AIC
## - Pet           1     13.61 11135 711.38
## - ExerciseTime  1     25.97 11148 711.56
## - Home          1     36.29 11158 711.72
## <none>                      11122 713.18
## - FBFriends     1    354.16 11476 716.41
## - StudyTime     1    608.37 11730 720.07
## 
## Step:  AIC=711.38
## Grade ~ StudyTime + FBFriends + ExerciseTime + Home
## 
##                Df Sum of Sq   RSS    AIC
## - ExerciseTime  1     23.71 11159 709.73
## - Home          1     38.09 11173 709.95
## <none>                      11135 711.38
## - FBFriends     1    397.36 11533 715.24
## - StudyTime     1    604.59 11740 718.21
## 
## Step:  AIC=709.73
## Grade ~ StudyTime + FBFriends + Home
## 
##             Df Sum of Sq   RSS    AIC
## - Home       1     36.65 11196 708.28
## <none>                   11159 709.73
## - FBFriends  1    375.60 11535 713.26
## - StudyTime  1    599.52 11758 716.47
## 
## Step:  AIC=708.28
## Grade ~ StudyTime + FBFriends
## 
##             Df Sum of Sq   RSS    AIC
## <none>                   11196 708.28
## - FBFriends  1    388.87 11584 711.98
## - StudyTime  1    610.79 11806 715.15
# Backwards selection lands on the same model as forwards selection.
summary(fit)
## 
## Call:
## lm(formula = Grade ~ StudyTime + FBFriends, data = survey3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5986  -4.7076  -0.4714   5.4056  18.5365 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.951146   1.491774  44.880  < 2e-16 ***
## StudyTime    0.152579   0.051010   2.991  0.00321 ** 
## FBFriends    0.003933   0.001648   2.387  0.01814 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.262 on 164 degrees of freedom
## Multiple R-squared:  0.07968,    Adjusted R-squared:  0.06846 
## F-statistic:   7.1 on 2 and 164 DF,  p-value: 0.001104

Important… Colinearity

# Simulate collinear measurements: head circumference and foot length are
# both noisy readouts of the same latent variable (height), which in turn
# tracks weight.  The seed fixes the draws so the slides are reproducible.
set.seed(51773)
weight <- rnorm(100)
height <- rnorm(100, weight)
headCirc <- rnorm(100, height, 0.3)
footLength <- rnorm(100, height, 0.3)

# In the notation of the slides: Y = weight, w = height,
# x = head circumference (headCirc), z = foot length (footLength).

pairs(data.frame(weight, headCirc, footLength))

Colinearity

# Each proxy on its own is a highly significant predictor of weight...
summary(lm(weight ~ headCirc))
## 
## Call:
## lm(formula = weight ~ headCirc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81770 -0.50122  0.09853  0.46984  1.76106 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.07317    0.07679  -0.953    0.343    
## headCirc     0.48045    0.05297   9.071 1.26e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7676 on 98 degrees of freedom
## Multiple R-squared:  0.4564, Adjusted R-squared:  0.4508 
## F-statistic: 82.27 on 1 and 98 DF,  p-value: 1.259e-14
summary(lm(weight ~ footLength))
## 
## Call:
## lm(formula = weight ~ footLength)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7894 -0.4505  0.0778  0.4384  2.0889 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.11457    0.07614  -1.505    0.136    
## footLength   0.49009    0.05302   9.244  5.3e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.761 on 98 degrees of freedom
## Multiple R-squared:  0.4658, Adjusted R-squared:  0.4603 
## F-statistic: 85.44 on 1 and 98 DF,  p-value: 5.304e-15
# ...but together neither is significant: the two variables carry nearly
# the same information, so collinearity inflates their standard errors.
summary(lm(weight ~ headCirc + footLength))
## 
## Call:
## lm(formula = weight ~ headCirc + footLength)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76444 -0.44618  0.04217  0.48183  1.88197 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.09859    0.07753  -1.272   0.2065  
## headCirc     0.19130    0.17838   1.072   0.2862  
## footLength   0.30547    0.18011   1.696   0.0931 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7604 on 97 degrees of freedom
## Multiple R-squared:  0.472,  Adjusted R-squared:  0.4612 
## F-statistic: 43.36 on 2 and 97 DF,  p-value: 3.517e-14

GLM

GLM


  • Generalized linear models are a generalized form of regression.

  • A GLM does not assume a linear relationship between the dependent and independent variables. Instead, it assumes a linear relationship between a link function of the mean response and the independent variables (e.g. the logit link in logistic regression).

  • Examples of GLM include logistic regression and Poisson regression.

  • Link for logistic regression is \[log(\frac{p}{1-p}) = \alpha + \beta x\] where \(p\) is the probability of “success”.

Logistic regression

  • Logistic regression can be used to fit a regression with a binary response variable. E.g. Kidney failure Yes/No

  • Logistic regression models the probability of our response variable, e.g. “Kidney failure”, belonging to a particular category.

  • Gender: male or female

Heart transplant


Transplant rejection of patients that were on the waiting list for the Stanford heart transplant program.

  • Score developed to assign hearts for transplant.

  • Is the score effective?

We have patients that reject the heart within one year and patients that survive without rejection after one year.



J Crowley and M Hu (1977), Covariance analysis of heart transplant survival data. Journal of the American Statistical Association, 72, 27-36.

Explore

# Compare match scores between rejection outcomes.  Pass the data frame
# explicitly so the formula variables resolve from heart1 (consistent with
# every other heart1 call below) rather than relying on attached globals.
boxplot(score ~ outcome, data = heart1, ylab = "Match score", xlab = "Survives")

Testing with logistic regression

# Logistic regression of the rejection outcome on the mismatch score.
fit = glm(outcome ~ score, heart1, family = binomial)
summary(fit)
## 
## Call:
## glm(formula = outcome ~ score, family = binomial, data = heart1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6415  -1.1254   0.6736   1.0449   1.4326  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.3115     0.8397  -1.562   0.1183  
## score         1.2153     0.6371   1.907   0.0565 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.843  on 41  degrees of freedom
## Residual deviance: 53.640  on 40  degrees of freedom
##   (23 observations deleted due to missingness)
## AIC: 57.64
## 
## Number of Fisher Scoring iterations: 4
# Visual diagnostics for logistic regression aren't appropriate
plot(fit)

Multiple logistic regression

We have more information than just a mismatch score.

# List all available variables in the heart transplant data.
colnames(heart1)
##  [1] "birth.dt"   "accept.dt"  "tx.date"    "fu.date"    "fustat"    
##  [6] "surgery"    "age"        "futime"     "wait.time"  "transplant"
## [11] "mismatch"   "hla.a2"     "mscore"     "reject"

Can we do better at predicting rejection?

Multiple logistic regression

# Multiple logistic regression: add the clinical covariates to the
# mismatch score.  Only age reaches significance at the 5% level.
fit = glm(outcome ~ surgery + age + wait.time + mismatch + hla.a2 + mscore, heart1, 
    family = binomial)
summary(fit)
## 
## Call:
## glm(formula = outcome ~ surgery + age + wait.time + mismatch + 
##     hla.a2 + mscore, family = binomial, data = heart1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0951  -0.6967   0.3447   0.7897   1.7360  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.028984   3.568025  -1.970   0.0488 *
## surgery     -1.633404   0.968140  -1.687   0.0916 .
## age          0.146055   0.063019   2.318   0.0205 *
## wait.time   -0.008959   0.009938  -0.902   0.3673  
## mismatch    -0.179936   0.577919  -0.311   0.7555  
## hla.a2      -0.392561   1.037202  -0.378   0.7051  
## mscore       1.307939   0.978552   1.337   0.1814  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.843  on 41  degrees of freedom
## Residual deviance: 41.316  on 35  degrees of freedom
##   (23 observations deleted due to missingness)
## AIC: 55.316
## 
## Number of Fisher Scoring iterations: 4

Model selection

# Forward stepwise selection by AIC, from the intercept-only model up to
# all six clinical predictors.
fit = step(glm(outcome ~ 1, heart1, family = binomial), scope = list(upper = ~surgery + 
    age + wait.time + mismatch + hla.a2 + mscore), direction = "forward")
## Start:  AIC=59.84
## outcome ~ 1
## 
##             Df Deviance    AIC
## + age        1   48.939 52.939
## + surgery    1   52.797 56.797
## + mscore     1   53.640 57.640
## <none>           57.843 59.843
## + wait.time  1   56.678 60.678
## + hla.a2     1   57.065 61.065
## + mismatch   1   57.838 61.838
## 
## Step:  AIC=52.94
## outcome ~ age
## 
##             Df Deviance    AIC
## + surgery    1   45.324 51.324
## + mscore     1   45.750 51.750
## <none>           48.939 52.939
## + wait.time  1   47.487 53.487
## + hla.a2     1   48.771 54.771
## + mismatch   1   48.869 54.869
## 
## Step:  AIC=51.32
## outcome ~ age + surgery
## 
##             Df Deviance    AIC
## + mscore     1   42.788 50.788
## <none>           45.324 51.324
## + wait.time  1   43.790 51.790
## + mismatch   1   45.009 53.009
## + hla.a2     1   45.191 53.191
## 
## Step:  AIC=50.79
## outcome ~ age + surgery + mscore
## 
##             Df Deviance    AIC
## <none>           42.788 50.788
## + wait.time  1   41.674 51.674
## + mismatch   1   42.359 52.359
## + hla.a2     1   42.475 52.475
# The selected model keeps age, surgery and the mismatch score.
summary(fit)
## 
## Call:
## glm(formula = outcome ~ age + surgery + mscore, family = binomial, 
##     data = heart1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1036  -0.7670   0.3920   0.8025   1.8747  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.37758    3.04074  -2.426   0.0153 *
## age          0.13604    0.06027   2.257   0.0240 *
## surgery     -1.56404    0.96117  -1.627   0.1037  
## mscore       1.17452    0.78748   1.491   0.1358  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.843  on 41  degrees of freedom
## Residual deviance: 42.788  on 38  degrees of freedom
##   (23 observations deleted due to missingness)
## AIC: 50.788
## 
## Number of Fisher Scoring iterations: 4

Odds ratios

In a logistic regression the regression coefficients can be interpreted as odds ratios. Specifically, \[OR = exp(\beta_i) = e^{\beta_i}\] For every unit change in \(x_i\) the odds of disease are multiplied by \(e^{\beta_i}\).

Odds ratios

# Exponentiate the fitted coefficients to express them as odds ratios.
exp(coef(fit))
##  (Intercept)          age      surgery       mscore 
## 0.0006251121 1.1457303643 0.2092890864 3.2365754902

Getting side-tracked….

Can we predict who still lives at home?

# Forward stepwise selection for predicting living at home:
# start from the intercept-only model (fit2) and consider adding any
# predictor available in the full model (fit1), keeping each addition
# only while it lowers the AIC.
fit1 <- glm(Home ~ ., survey2, family = binomial)
fit2 <- glm(Home ~ 1, survey2, family = binomial)
fit = step(glm(Home ~ 1, survey2, family = binomial), direction = "forward", scope = list(lower = fit2, 
    upper = fit1))
## Start:  AIC=227.99
## Home ~ 1
## 
##                Df Deviance    AIC
## + Cohort        4   211.86 221.86
## + Grade         1   223.97 227.97
## <none>              225.99 227.99
## + Working       1   224.31 228.31
## + Clubs         1   225.02 229.02
## + FBFriends     1   225.47 229.47
## + StudyTime     1   225.57 229.57
## + ExerciseTime  1   225.60 229.60
## + Pet           1   225.70 229.70
## + Season        3   223.70 231.70
## 
## Step:  AIC=221.86
## Home ~ Cohort
## 
##                Df Deviance    AIC
## <none>              211.86 221.86
## + Working       1   210.57 222.57
## + Grade         1   210.91 222.91
## + Clubs         1   210.95 222.95
## + Pet           1   211.15 223.15
## + ExerciseTime  1   211.32 223.32
## + FBFriends     1   211.48 223.48
## + StudyTime     1   211.79 223.79
## + Season        3   210.88 226.88
summary(fit)
## 
## Call:
## glm(formula = Home ~ Cohort, family = binomial, data = survey2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8930  -1.2601   0.7002   1.0969   1.2814  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           0.1924     0.2352   0.818   0.4133  
## CohortAMED3002.2018   1.4171     0.8095   1.751   0.0800 .
## CohortAMED3002.2019  -0.4335     0.4665  -0.929   0.3527  
## CohortAMED3002.2020   1.0116     0.4469   2.263   0.0236 *
## CohortAMED3002.2021   1.0886     0.5575   1.952   0.0509 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.99  on 171  degrees of freedom
## Residual deviance: 211.86  on 167  degrees of freedom
## AIC: 221.86
## 
## Number of Fisher Scoring iterations: 4
exp(coef(summary(fit))[, 1])
##         (Intercept) CohortAMED3002.2018 CohortAMED3002.2019 CohortAMED3002.2020 
##           1.2121212           4.1250000           0.6482143           2.7500000 
## CohortAMED3002.2021 
##           2.9700000
table(survey2$Home, survey2$Cohort)
##      
##       STAT2012 AMED3002.2018 AMED3002.2019 AMED3002.2020 AMED3002.2021
##   No        33             2            14             9             5
##   Yes       40            10            11            30            18
# Refit the selected model directly and summarise it.
fit <- glm(Home ~ Cohort, survey2, family = binomial)
summary(fit)
## 
## Call:
## glm(formula = Home ~ Cohort, family = binomial, data = survey2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8930  -1.2601   0.7002   1.0969   1.2814  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           0.1924     0.2352   0.818   0.4133  
## CohortAMED3002.2018   1.4171     0.8095   1.751   0.0800 .
## CohortAMED3002.2019  -0.4335     0.4665  -0.929   0.3527  
## CohortAMED3002.2020   1.0116     0.4469   2.263   0.0236 *
## CohortAMED3002.2021   1.0886     0.5575   1.952   0.0509 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.99  on 171  degrees of freedom
## Residual deviance: 211.86  on 167  degrees of freedom
## AIC: 221.86
## 
## Number of Fisher Scoring iterations: 4
chisq.test(table(survey2$Home, survey2$Cohort))
## Warning in chisq.test(table(survey2$Home, survey2$Cohort)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(survey2$Home, survey2$Cohort)
## X-squared = 13.697, df = 4, p-value = 0.008326

Survival analysis

Heart transplant


Transplant rejection of patients that were on the waiting list for the Stanford heart transplant program.

  • Score developed to assign hearts for transplant.

  • Is the score effective?

We have patients that reject the heart within one year and patients that survive without a rejection for more than one year.

We have information on how long each patient has been followed up and whether they had a rejection.


J Crowley and M Hu (1977), Covariance analysis of heart transplant survival data. Journal of the American Statistical Association, 72, 27-36.

Explore

Survival analysis

We will fit a Cox proportional-hazards model

# Cox proportional-hazards model for time to transplant rejection.
# futime is the follow-up time and reject the event indicator
# (censored observations are those without a rejection).
fit = coxph(Surv(futime, reject) ~ surgery + age + wait.time + mismatch + hla.a2 + 
    mscore, heart1)
# Print the fitted model: hazard ratios (exp(coef)) and Wald tests.
fit
## Call:
## coxph(formula = Surv(futime, reject) ~ surgery + age + wait.time + 
##     mismatch + hla.a2 + mscore, data = heart1)
## 
##                coef exp(coef)  se(coef)      z       p
## surgery   -0.982678  0.374307  0.625297 -1.572 0.11606
## age        0.095668  1.100393  0.030633  3.123 0.00179
## wait.time -0.009470  0.990575  0.007487 -1.265 0.20590
## mismatch  -0.110350  0.895521  0.229665 -0.480 0.63088
## hla.a2     0.275834  1.317629  0.469523  0.587 0.55688
## mscore     0.780048  2.181577  0.379562  2.055 0.03987
## 
## Likelihood ratio test=27.3  on 6 df, p=0.000127
## n= 65, number of events= 29

Survival analysis


We use the “Surv” object to represent survival data.

with(heart1, Surv(futime, reject))
##  [1]   15+   38+  674    57   152+   80  1386   307    42+   27  1031    50 
## [13]  732   218  1799+   71+  851    76  1586+ 1571+   99    65     4+ 1407+
## [25] 1321+   44+  995    71  1141+  284    60   941+  342   915+   67   841+
## [37]  583+   77   284+   67   669+   29   619+  595+   89    16+  544+  514+
## [49]   95   481+  444+   79+  333   396+  109   369+  206   185+  339+  264+
## [61]  164   179+  130+  108+   38+

Survival analysis

We will fit a Cox proportional-hazards model with step-wise model selection.

# Stepwise model selection (step's default, both directions) on the
# full Cox model, removing terms while the AIC improves.
fit = step(coxph(Surv(futime, reject) ~ surgery + age + wait.time + mismatch + hla.a2 + 
    mscore, heart1))
## Start:  AIC=185.23
## Surv(futime, reject) ~ surgery + age + wait.time + mismatch + 
##     hla.a2 + mscore
## 
##             Df    AIC
## - mismatch   1 183.47
## - hla.a2     1 183.58
## <none>         185.23
## - wait.time  1 185.41
## - surgery    1 186.31
## - mscore     1 187.06
## - age        1 196.14
## 
## Step:  AIC=183.47
## Surv(futime, reject) ~ surgery + age + wait.time + hla.a2 + mscore
## 
##             Df    AIC
## - hla.a2     1 181.73
## <none>         183.47
## - wait.time  1 183.64
## - surgery    1 184.49
## - mscore     1 185.07
## - age        1 194.24
## 
## Step:  AIC=181.73
## Surv(futime, reject) ~ surgery + age + wait.time + mscore
## 
##             Df    AIC
## <none>         181.73
## - wait.time  1 181.87
## - surgery    1 182.59
## - mscore     1 185.15
## - age        1 192.25
fit
## Call:
## coxph(formula = Surv(futime, reject) ~ surgery + age + wait.time + 
##     mscore, data = heart1)
## 
##                coef exp(coef)  se(coef)      z       p
## surgery   -0.946733  0.388007  0.623963 -1.517 0.12919
## age        0.095073  1.099739  0.030895  3.077 0.00209
## wait.time -0.009528  0.990517  0.007573 -1.258 0.20832
## mscore     0.795538  2.215632  0.328914  2.419 0.01558
## 
## Likelihood ratio test=26.81  on 4 df, p=2.172e-05
## n= 65, number of events= 29

Visualising

If we binarize our score we can use a Kaplan-Meier curve.

# Dichotomise the mismatch score at its median: FALSE = small, TRUE = large.
mscore2 = as.factor(heart1$mscore > median(heart1$mscore))
# Kaplan-Meier curves by score group. survfit orders the strata
# FALSE ("small") then TRUE ("large"), so col = c(1, 2) draws the
# small-mscore curve in black (1) and the large-mscore curve in red (2).
plot(survfit(Surv(futime/365, reject) ~ mscore2, heart1), col = c(1, 2), xlab = "time", 
    ylab = "Probability of success", mark.time = TRUE)
# Legend colours must match the plotted strata order:
# small mscore = black (1), large mscore = red (2).
# (The original col = 2:1 swapped the colours relative to the curves.)
legend("topright", c("small mscore", "large mscore"), col = 1:2, lty = 1)

Summary