University of Sydney
Qualitative vs Qualitative
\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & Y \\ 2 & A & Y \\ 3 & A & X \\ 4 & B & Y \\ 5 & A & Z \\ 6 & B & X \\ 7 & B & Z \\ 8 & B & Z \\ 9 & A & X \\ 10 & A & X \\ \hline \end{array} \]
Qualitative vs Quantitative
\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & 11.7 \\ 2 & A & 10.5 \\ 3 & A & 8.7 \\ 4 & B & 9.3 \\ 5 & A & 9.6 \\ 6 & B & 11.2 \\ 7 & B & 10.4 \\ 8 & B & 10.4 \\ 9 & A & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]
Qualitative vs Quantitative
\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & A & 11.7 \\ 2 & A & 10.5 \\ 3 & A & 8.7 \\ 4 & A & 9.3 \\ 5 & A & 9.6 \\ 6 & A & 11.2 \\ 7 & A & 10.4 \\ 8 & A & 10.4 \\ 9 & A & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]
Qualitative vs Quantitative
\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & C & 11.7 \\ 2 & C & 10.5 \\ 3 & C & 8.7 \\ 4 & B & 9.3 \\ 5 & C & 9.6 \\ 6 & B & 11.2 \\ 7 & B & 10.4 \\ 8 & B & 10.4 \\ 9 & C & 10.1 \\ 10 & A & 9.4 \\ \hline \end{array} \]
Quantitative vs Quantitative
\[\begin{array}{c|cc} \hline i & X_i & Y_i \\ \hline 1 & 9.4 & 11.2 \\ 2 & 9.8 & 10.4 \\ 3 & 11.6 & 10.4 \\ 4 & 10.1 & 10.1 \\ 5 & 10.1 & 9.4 \\ 6 & 11.7 & 11.8 \\ 7 & 10.5 & 10.5 \\ 8 & 8.7 & 8.0 \\ 9 & 9.3 & 10.7 \\ 10 & 9.6 & 9.5 \\ \hline \end{array} \]
We recently conducted a class survey.
Is there a relationship between the amount of time spent studying and grades?
# Load the tidyverse quietly and read in the class survey data.
suppressPackageStartupMessages(library(tidyverse))
survey = read.csv("https://wimr-genomics.vip.sydney.edu.au/AMED3002/data/ClassSurveyv5.csv")
## Take the questions that overlap between Cohorts.
survey = survey[, 1:18]
## rename column names
colnames(survey)
## [1] "ĂŻ..Timestamp" ## [2] "Survey.ID..make.up.a.name." ## [3] "Gender" ## [4] "Postcode.of.where.you.live" ## [5] "What.introductory.statistic.course.did.you.take." ## [6] "How.many.university.clubs.are.you.a.member.of." ## [7] "How.many.hours.per.week.did.you.spend.on.studying.last.year." ## [8] "What.was.your.study.load.last.year." ## [9] "What.is.your.favourite.social.media.platform.." ## [10] "How.many.siblings.do.you.have.." ## [11] "How.many.friends.do.you.have.on.Facebook.." ## [12] "What.is.your.average.grade.from.last.year..please.enter.a.number..." ## [13] "Did.you.have.a.pet.growing.up." ## [14] "Do.you.still.live.at.home." ## [15] "How.many.hours.a.week.do.you.spend.exercising." ## [16] "What.is.your.eye.colour." ## [17] "How.many.hours.do.you.work.per.week." ## [18] "What.is.your.favourite.season."
# Replace the verbose survey question names with short variable names.
oldname = colnames(survey)
newname = c("Time", "ID", "Gender", "Postcode", "StatsCourse", "Clubs", "StudyTime",
    "StudyLoad", "SocialMedia", "Siblings", "FBFriends", "Grade", "Pet", "Home",
    "ExerciseTime", "Eyecolor", "Working", "Season")
colnames(survey) = newname
# Clean up the results a little bit.
# Recode free-text grade entries to numbers before coercing to numeric.
survey$Grade = as.character(survey$Grade)
survey$Grade[survey$Grade == "n/a"] = NA
survey$Grade[survey$Grade == "50-60"] = 55
survey$Grade[survey$Grade == "pass"] = 50
survey$Grade[survey$Grade == "D"] = 75   # Distinction
survey$Grade[survey$Grade == "c"] = 65   # Credit
survey$Grade[survey$Grade == "C "] = 65
survey$Grade[survey$Grade == "HD"] = 85  # High Distinction
survey$Grade[survey$Grade == "WAM of seventy-two"] = 72
# Any remaining non-numeric entries become NA (hence the coercion warning).
survey$Grade = as.numeric(survey$Grade)
## Warning: NAs introduced by coercion
# Recode free-text study-time answers to numeric hours per week.
# Ranges are replaced by their midpoint; unusable answers become NA.
# (The duplicated "8hr" recode in the original has been removed.)
survey$StudyTime = as.character(survey$StudyTime)
survey$StudyTime[survey$StudyTime == "5-6"] = 5.5
survey$StudyTime[survey$StudyTime == "10-20"] = 15
survey$StudyTime[survey$StudyTime == "5-10"] = 7.5
survey$StudyTime[survey$StudyTime == "8hr"] = 8
survey$StudyTime[survey$StudyTime == "28 hours"] = 28
survey$StudyTime[survey$StudyTime == "didn't start uni maybe 6h"] = 6
survey$StudyTime[survey$StudyTime == "20-"] = 22
survey$StudyTime[survey$StudyTime == "20-25?"] = 22.5
survey$StudyTime[survey$StudyTime == "15-25"] = 20
survey$StudyTime[survey$StudyTime == "10 (excluding attending university) "] = 10
survey$StudyTime[survey$StudyTime == "10?"] = 10
survey$StudyTime[survey$StudyTime == "80?"] = 80
survey$StudyTime[survey$StudyTime == "10-12 extracrricular"] = 11
survey$StudyTime[survey$StudyTime == "25 hours a week"] = 25
survey$StudyTime[survey$StudyTime == "20-24"] = 22
survey$StudyTime[survey$StudyTime == "10 hours for each unit"] = 40
survey$StudyTime[survey$StudyTime == "30-40"] = 35
survey$StudyTime[survey$StudyTime == "not sure"] = NA
survey$StudyTime[survey$StudyTime == "not sure "] = NA
survey$StudyTime[survey$StudyTime == "unsure"] = NA
survey$StudyTime[survey$StudyTime == "Unsure"] = NA
survey$StudyTime[survey$StudyTime == "Too many"] = NA
survey$StudyTime[survey$StudyTime == "too many"] = NA
survey$StudyTime[survey$StudyTime == "I have no idea "] = NA
survey$StudyTime[survey$StudyTime == "n/a"] = NA
survey$StudyTime[survey$StudyTime == "lost"] = NA
survey$StudyTime[survey$StudyTime == "lots"] = NA
# Strip decoration characters, then coerce; leftovers become NA.
survey$StudyTime = gsub("\\+", "", survey$StudyTime)
survey$StudyTime = gsub(" ", "", survey$StudyTime)
survey$StudyTime = gsub("~", "", survey$StudyTime)
survey$StudyTime = gsub("\\?", "", survey$StudyTime)
survey$StudyTime = as.numeric(survey$StudyTime)
## Warning: NAs introduced by coercion
# Clean the Facebook-friends answers into a numeric count.
FB = survey$FBFriends
# Strip decoration characters first.
FB = gsub("\\+", "", FB)
FB = gsub("~", "", FB)
FB = gsub(">", "", FB)
FB = gsub(" ish", "", FB)
# "No account" answers are coded 0; "don't know" answers become NA.
FB[FB == "over 1.2k"] = 1201
FB[FB == "Don't have facebook"] = 0
FB[FB == "Don't have FB"] = 0
FB[FB == "none (not in facebook)"] = 0
FB[FB == "Cant remember"] = NA
FB[FB == "not sure"] = NA
FB[FB == "I don't know"] = NA
FB[FB == "do not know"] = NA
FB[FB == "Too many"] = NA
FB = as.numeric(FB)
## Warning: NAs introduced by coercion
# Mean-impute the missing Facebook counts and write them back.
FB[is.na(FB)] = mean(FB, na.rm = TRUE)
survey[, "FBFriends"] = FB
# Clean exercise time; ranges become midpoints, blanks/"none" become 0.
# (The duplicated "2 hours" recode in the original has been removed.)
ET = as.character(survey$ExerciseTime)
ET[ET == ""] = 0
ET[ET == "5-6"] = 5.5
ET[ET == "2-3"] = 2.5
ET[ET == "4-5"] = 4.5
ET[ET == "6 hr"] = 6
ET[ET == "none"] = 0
ET[ET == "2 hours"] = 2
survey[, "ExerciseTime"] = as.numeric(ET)
## Warning: NAs introduced by coercion
# Clean weekly working hours.
# 168 hours (every hour of the week) is implausible and becomes NA.
WK = as.character(survey[, "Working"])
WK[WK == ""] = 0
WK[WK == "18 hours"] = 18
WK[WK == "8-12"] = 10
WK[WK == "168"] = NA
WK[WK == "none"] = 0
WK[WK == "Approx 25"] = 25
WK[WK == "NULL"] = 0
WK[WK == "10-15"] = 12.5
survey[, "Working"] = as.numeric(WK)
## Warning: NAs introduced by coercion
# Empty strings become NA before converting Pet and Home to factors.
PT = as.character(survey$Pet)
PT[PT == ""] = NA
survey$Pet = as.factor(PT)
HM = as.character(survey$Home)
HM[HM == ""] = NA
survey$Home = as.factor(HM)
# Recode sibling counts; 165 siblings is implausible so it becomes NA.
SB = as.character(survey$Siblings)
SB[SB == "none"] = 0
SB[SB == "NA"] = 0
SB[SB == "165"] = NA
SB[SB == "one"] = 1
SB[SB == "One"] = 1
survey$Siblings = as.numeric(SB)
# Recode club-membership answers to counts.
CB = as.character(survey$Clubs)
CB[grep("none", CB, ignore.case = TRUE)] = 0
CB[CB == "10+"] = 10
CB[CB == "18 or so? executive of 9"] = 18
CB[CB == "Dog Society, Sydney Uni Car Club"] = 2
CB[CB == "3-4"] = 3.5
CB[CB == "N/A"] = 0
survey$Clubs = as.numeric(CB)
## Warning: NAs introduced by coercion
# Empty season answers become NA.
SN = as.character(survey$Season)
SN[SN == ""] = NA
survey$Season = as.factor(SN)
# Cohort = how many 1-January cut-offs the survey timestamp falls after
# (0 = STAT2012, 1-4 = AMED3002 2018-2021).  The character dates on the
# right-hand side are coerced to Date for the comparison.
survey$Cohort = (as.Date(survey$Time, "%m/%d/%Y") > "2018/01/01") + (as.Date(survey$Time,
    "%m/%d/%Y") > "2019/01/01") + (as.Date(survey$Time, "%m/%d/%Y") > "2020/01/01") +
    (as.Date(survey$Time, "%m/%d/%Y") > "2021/01/01")
survey$Cohort = factor(survey$Cohort, levels = c(0, 1, 2, 3, 4), labels = c("STAT2012",
    "AMED3002.2018", "AMED3002.2019", "AMED3002.2020", "AMED3002.2021"))
# Inspect the cleaned data frame's structure.
str(survey)
## 'data.frame': 204 obs. of 19 variables: ## $ Time : Factor w/ 203 levels "3/11/2019 10:00",..: 124 125 126 127 128 129 129 130 131 132 ... ## $ ID : Factor w/ 199 levels "123454321","1907",..: 159 186 123 26 24 131 130 190 102 88 ... ## $ Gender : Factor w/ 4 levels "crocodilian",..: 3 2 2 3 3 2 3 3 2 3 ... ## $ Postcode : Factor w/ 112 levels "14052","2000",..: 21 68 50 85 66 78 25 54 21 2 ... ## $ StatsCourse : Factor w/ 18 levels "","BUSS1020",..: 8 8 8 8 8 8 1 8 8 8 ... ## $ Clubs : num 0 0 2 3 2 1 2 2 1 2 ... ## $ StudyTime : num 50 10 7 70 30 NA 20 25 5 15 ... ## $ StudyLoad : Factor w/ 3 levels "","full-time",..: 2 2 2 2 2 2 2 2 2 2 ... ## $ SocialMedia : Factor w/ 22 levels "canvas","Discord",..: 3 14 3 3 3 17 3 3 17 17 ... ## $ Siblings : num 1 2 2 0 2 0 1 2 3 1 ... ## $ FBFriends : num 90 600 115 100 263 10 120 150 228 200 ... ## $ Grade : num 72 73 64 90 90.5 66 74 72 60 62 ... ## $ Pet : Factor w/ 2 levels "No","Yes": 2 1 2 NA 2 1 1 1 1 2 ... ## $ Home : Factor w/ 2 levels "No","Yes": 1 2 2 NA 2 2 1 2 1 2 ... ## $ ExerciseTime: num 3 4 2 0 1 2 2 10 2 4 ... ## $ Eyecolor : Factor w/ 31 levels "","black","Black",..: 13 10 6 1 11 2 11 3 3 11 ... ## $ Working : num 0 15 0 0 10 0 0 15 0 5 ... ## $ Season : Factor w/ 4 levels "Autumn","Spring",..: 1 2 1 NA 4 1 2 3 3 2 ... ## $ Cohort : Factor w/ 5 levels "STAT2012","AMED3002.2018",..: 1 1 1 1 1 1 1 1 1 1 ...
# Visualise the missing-data pattern across the survey.
library(naniar)
vis_miss(survey)
# Number of missing values per column, in ascending order.
sort(colSums(is.na(survey)))
## Time Gender Postcode StatsCourse StudyLoad SocialMedia ## 0 0 0 0 0 0 ## FBFriends Cohort ID Clubs Siblings Eyecolor ## 0 0 1 1 1 1 ## Working Grade Pet Season Home ExerciseTime ## 3 4 4 4 5 5 ## StudyTime ## 12
# Drop rows with any missing value, then keep only the variables of interest.
# Note: na.omit() runs before the column subset, so an NA in any of the 19
# original columns removes the whole row.
survey = na.omit(survey)[, c("Grade", "StudyTime", "Cohort", "FBFriends", "ExerciseTime",
    "Working", "Pet", "Home", "Clubs", "Season")]
# Expand factors to dummy variables, scale each column, then hierarchically
# cluster the variables (note the t()) rather than the observations.
dataContinuous = as.data.frame(model.matrix(~. - 1, survey))
dataScaled <- scale(dataContinuous)
Hcluster = hclust(dist(t(dataScaled)))
plot(Hcluster)
# Scatter plot of grade against weekly study time.
ggplot(survey, aes(x = StudyTime, y = Grade)) + geom_point() + theme_classic()
# Remove outlier
# Keep only plausible study times and grades, then replot.
survey2 = filter(survey, StudyTime < 100, Grade < 100)
ggplot(survey2, aes(x = StudyTime, y = Grade)) + geom_point(size = 3) + theme_classic()
# Colour points by cohort to look for cohort effects.
ggplot(survey2, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) +
    theme_classic(20)
# Simple linear regression of grade on study time.
fit = lm(Grade ~ StudyTime, survey2)
summary(fit)
## ## Call: ## lm(formula = Grade ~ StudyTime, data = survey2) ## ## Residuals: ## Min 1Q Median 3Q Max ## -68.180 -4.156 0.806 7.820 23.384 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 64.23063 2.02474 31.723 < 2e-16 *** ## StudyTime 0.28212 0.08378 3.367 0.000939 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 13.81 on 170 degrees of freedom ## Multiple R-squared: 0.06253, Adjusted R-squared: 0.05702 ## F-statistic: 11.34 on 1 and 170 DF, p-value: 0.0009386
# assumptions: Linearity, Equality of variance, Independence of residuals,
# Normality of residuals.
plot(fit)
# Remove outlier
# Drop the remaining very low grade before refitting.
survey3 = filter(survey2, Grade > 40)
ggplot(survey3, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) +
    theme_classic(20)
# Refit the simple linear regression without the low-grade outlier.
fit = lm(Grade ~ StudyTime, survey3)
summary(fit)
## ## Call: ## lm(formula = Grade ~ StudyTime, data = survey3) ## ## Residuals: ## Min 1Q Median 3Q Max ## -21.8819 -5.1722 -0.3952 5.1721 20.3074 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 68.9086 1.2637 54.528 < 2e-16 *** ## StudyTime 0.1487 0.0517 2.875 0.00457 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 8.379 on 165 degrees of freedom ## Multiple R-squared: 0.04772, Adjusted R-squared: 0.04195 ## F-statistic: 8.268 on 1 and 165 DF, p-value: 0.004568
# Diagnostic plots for the refitted model.
plot(fit)
# Fitted regression line with its confidence band.
ggplot(survey3, aes(x = StudyTime, y = Grade)) + geom_point(size = 3) + theme_classic(20) +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Separate regression lines per cohort.
ggplot(survey3, aes(x = StudyTime, y = Grade, colour = Cohort)) + geom_point(size = 3) +
    theme_classic(20) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
How well can we predict someone's grades given that we have multiple variables?
Simple linear regression: \(y = \alpha + \beta x\)
Multiple regression: \(y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 + \beta_5 x_5 + \beta_6 x_6 + \beta_7 x_7\)
# Multiple regression with a handful of candidate predictors.
fit = lm(Grade ~ StudyTime + ExerciseTime + Working + Pet, survey3)
summary(fit)
## ## Call: ## lm(formula = Grade ~ StudyTime + ExerciseTime + Working + Pet, ## data = survey3) ## ## Residuals: ## Min 1Q Median 3Q Max ## -22.0771 -5.3196 0.0588 5.0408 19.9963 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 68.461937 1.708350 40.075 < 2e-16 *** ## StudyTime 0.149307 0.052729 2.832 0.00522 ** ## ExerciseTime -0.037418 0.177410 -0.211 0.83322 ## Working -0.006863 0.068455 -0.100 0.92027 ## PetYes 1.253974 1.343409 0.933 0.35199 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 8.433 on 162 degrees of freedom ## Multiple R-squared: 0.05297, Adjusted R-squared: 0.02959 ## F-statistic: 2.265 on 4 and 162 DF, p-value: 0.06441
# Diagnostics for the multiple regression model.
plot(fit)
# Multiple regression on every remaining variable in survey3.
fit = lm(Grade ~ ., survey3)
summary(fit)
## ## Call: ## lm(formula = Grade ~ ., data = survey3) ## ## Residuals: ## Min 1Q Median 3Q Max ## -19.7162 -5.2790 -0.2158 5.0328 18.8700 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 65.735996 2.387897 27.529 < 2e-16 *** ## StudyTime 0.143045 0.053875 2.655 0.00877 ** ## CohortAMED3002.2018 -0.700346 2.702710 -0.259 0.79589 ## CohortAMED3002.2019 -0.619427 2.189879 -0.283 0.77767 ## CohortAMED3002.2020 0.741144 1.802908 0.411 0.68159 ## CohortAMED3002.2021 3.159793 2.173712 1.454 0.14811 ## FBFriends 0.003967 0.001854 2.140 0.03399 * ## ExerciseTime -0.111892 0.185651 -0.603 0.54761 ## Working -0.029265 0.073109 -0.400 0.68950 ## PetYes 0.932031 1.398933 0.666 0.50626 ## HomeYes 0.657698 1.430407 0.460 0.64632 ## Clubs 0.060157 0.388634 0.155 0.87719 ## SeasonSpring 0.017546 1.836321 0.010 0.99239 ## SeasonSummer 0.831259 1.965120 0.423 0.67289 ## SeasonWinter 2.210601 1.952846 1.132 0.25942 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 8.414 on 152 degrees of freedom ## Multiple R-squared: 0.1155, Adjusted R-squared: 0.03403 ## F-statistic: 1.418 on 14 and 152 DF, p-value: 0.151
# Diagnostics for the full model.
plot(fit)
Sparsity
Parsimony
\(\mbox{Model 1:} \ \ y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 + \beta_5 x_5 + \beta_6 x_6 + \beta_7 x_7\)
\(\mbox{Model 2:} \ \ y = \alpha + \beta_2 x_2 + \beta_7 x_7\)
# Full model
fit1 <- lm(Grade ~ ., survey3)
# Null model
fit2 <- lm(Grade ~ 1, survey3)
## Forward stepwise model selection using AIC criterion
fit = step(lm(Grade ~ 1, survey3), direction = "forward", scope = list(lower = fit2,
    upper = fit1))
## Start: AIC=718.15 ## Grade ~ 1 ## ## Df Sum of Sq RSS AIC ## + StudyTime 1 580.48 11584 711.98 ## + FBFriends 1 358.57 11806 715.15 ## <none> 12165 718.15 ## + Home 1 61.94 12103 719.30 ## + Pet 1 50.14 12115 719.46 ## + Clubs 1 23.10 12142 719.83 ## + Working 1 10.12 12155 720.01 ## + ExerciseTime 1 0.55 12164 720.14 ## + Cohort 4 429.38 11736 720.15 ## + Season 3 105.31 12060 722.70 ## ## Step: AIC=711.98 ## Grade ~ StudyTime ## ## Df Sum of Sq RSS AIC ## + FBFriends 1 388.87 11196 708.28 ## <none> 11584 711.98 ## + Pet 1 59.43 11525 713.13 ## + Home 1 49.92 11535 713.26 ## + Clubs 1 8.24 11576 713.87 ## + ExerciseTime 1 1.34 11583 713.97 ## + Working 1 0.32 11584 713.98 ## + Cohort 4 306.09 11278 715.51 ## + Season 3 121.33 11463 716.23 ## ## Step: AIC=708.28 ## Grade ~ StudyTime + FBFriends ## ## Df Sum of Sq RSS AIC ## <none> 11196 708.28 ## + Home 1 36.645 11159 709.73 ## + ExerciseTime 1 22.264 11173 709.95 ## + Clubs 1 14.685 11181 710.06 ## + Pet 1 13.035 11183 710.09 ## + Working 1 9.518 11186 710.14 ## + Season 3 138.816 11057 712.20 ## + Cohort 4 232.666 10963 712.78
# Model selected by the forward stepwise search.
summary(fit)
## ## Call: ## lm(formula = Grade ~ StudyTime + FBFriends, data = survey3) ## ## Residuals: ## Min 1Q Median 3Q Max ## -22.5986 -4.7076 -0.4714 5.4056 18.5365 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 66.951146 1.491774 44.880 < 2e-16 *** ## StudyTime 0.152579 0.051010 2.991 0.00321 ** ## FBFriends 0.003933 0.001648 2.387 0.01814 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 8.262 on 164 degrees of freedom ## Multiple R-squared: 0.07968, Adjusted R-squared: 0.06846 ## F-statistic: 7.1 on 2 and 164 DF, p-value: 0.001104
## Backwards stepwise model selection using AIC criterion
fit = step(lm(Grade ~ ., survey3), direction = "backward", scope = list(lower = fit2,
    upper = fit1))
## Start: AIC=725.65 ## Grade ~ StudyTime + Cohort + FBFriends + ExerciseTime + Working + ## Pet + Home + Clubs + Season ## ## Df Sum of Sq RSS AIC ## - Cohort 4 205.20 10965 720.81 ## - Season 3 117.86 10878 721.47 ## - Clubs 1 1.70 10762 723.68 ## - Working 1 11.34 10771 723.83 ## - Home 1 14.97 10775 723.89 ## - ExerciseTime 1 25.71 10786 724.05 ## - Pet 1 31.42 10791 724.14 ## <none> 10760 725.65 ## - FBFriends 1 324.05 11084 728.61 ## - StudyTime 1 499.05 11259 731.23 ## ## Step: AIC=720.81 ## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + ## Home + Clubs + Season ## ## Df Sum of Sq RSS AIC ## - Season 3 136.33 11102 716.87 ## - Working 1 1.57 10967 718.83 ## - Clubs 1 4.77 10970 718.88 ## - Pet 1 12.33 10978 719.00 ## - Home 1 34.63 11000 719.34 ## - ExerciseTime 1 34.67 11000 719.34 ## <none> 10965 720.81 ## - FBFriends 1 383.67 11349 724.55 ## - StudyTime 1 597.01 11562 727.66 ## ## Step: AIC=716.87 ## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + ## Home + Clubs ## ## Df Sum of Sq RSS AIC ## - Clubs 1 7.37 11109 714.98 ## - Working 1 9.26 11111 715.01 ## - Pet 1 18.17 11120 715.15 ## - ExerciseTime 1 20.70 11122 715.18 ## - Home 1 35.21 11137 715.40 ## <none> 11102 716.87 ## - FBFriends 1 367.14 11469 720.31 ## - StudyTime 1 558.32 11660 723.07 ## ## Step: AIC=714.98 ## Grade ~ StudyTime + FBFriends + ExerciseTime + Working + Pet + ## Home ## ## Df Sum of Sq RSS AIC ## - Working 1 12.77 11122 713.18 ## - Pet 1 18.18 11127 713.26 ## - ExerciseTime 1 20.64 11130 713.29 ## - Home 1 39.11 11148 713.57 ## <none> 11109 714.98 ## - FBFriends 1 365.15 11474 718.38 ## - StudyTime 1 565.89 11675 721.28 ## ## Step: AIC=713.18 ## Grade ~ StudyTime + FBFriends + ExerciseTime + Pet + Home ## ## Df Sum of Sq RSS AIC ## - Pet 1 13.61 11135 711.38 ## - ExerciseTime 1 25.97 11148 711.56 ## - Home 1 36.29 11158 711.72 ## <none> 11122 713.18 ## - FBFriends 1 354.16 11476 716.41 ## - StudyTime 1 608.37 11730 720.07 ## ## Step: AIC=711.38 ## 
Grade ~ StudyTime + FBFriends + ExerciseTime + Home ## ## Df Sum of Sq RSS AIC ## - ExerciseTime 1 23.71 11159 709.73 ## - Home 1 38.09 11173 709.95 ## <none> 11135 711.38 ## - FBFriends 1 397.36 11533 715.24 ## - StudyTime 1 604.59 11740 718.21 ## ## Step: AIC=709.73 ## Grade ~ StudyTime + FBFriends + Home ## ## Df Sum of Sq RSS AIC ## - Home 1 36.65 11196 708.28 ## <none> 11159 709.73 ## - FBFriends 1 375.60 11535 713.26 ## - StudyTime 1 599.52 11758 716.47 ## ## Step: AIC=708.28 ## Grade ~ StudyTime + FBFriends ## ## Df Sum of Sq RSS AIC ## <none> 11196 708.28 ## - FBFriends 1 388.87 11584 711.98 ## - StudyTime 1 610.79 11806 715.15
# Backward selection arrives at the same model as forward selection.
summary(fit)
## ## Call: ## lm(formula = Grade ~ StudyTime + FBFriends, data = survey3) ## ## Residuals: ## Min 1Q Median 3Q Max ## -22.5986 -4.7076 -0.4714 5.4056 18.5365 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 66.951146 1.491774 44.880 < 2e-16 *** ## StudyTime 0.152579 0.051010 2.991 0.00321 ** ## FBFriends 0.003933 0.001648 2.387 0.01814 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 8.262 on 164 degrees of freedom ## Multiple R-squared: 0.07968, Adjusted R-squared: 0.06846 ## F-statistic: 7.1 on 2 and 164 DF, p-value: 0.001104
# Simulate collinear predictors: height drives both head circumference and
# foot length, so those two predictors are strongly correlated.
set.seed(51773)
weight = rnorm(100)
height = rnorm(100, weight)
headCirc = rnorm(100, height, 0.3)
footLength = rnorm(100, height, 0.3)
# E.g. Y = weight, w = height, x = head circumference (headCirc), z = foot length
# (footLength)
pairs(data.frame(weight, headCirc, footLength))
# Head circumference alone is a strong predictor of weight.
summary(lm(weight ~ headCirc))
## ## Call: ## lm(formula = weight ~ headCirc) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.81770 -0.50122 0.09853 0.46984 1.76106 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -0.07317 0.07679 -0.953 0.343 ## headCirc 0.48045 0.05297 9.071 1.26e-14 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.7676 on 98 degrees of freedom ## Multiple R-squared: 0.4564, Adjusted R-squared: 0.4508 ## F-statistic: 82.27 on 1 and 98 DF, p-value: 1.259e-14
# Foot length alone is similarly strong.
summary(lm(weight ~ footLength))
## ## Call: ## lm(formula = weight ~ footLength) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.7894 -0.4505 0.0778 0.4384 2.0889 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -0.11457 0.07614 -1.505 0.136 ## footLength 0.49009 0.05302 9.244 5.3e-15 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.761 on 98 degrees of freedom ## Multiple R-squared: 0.4658, Adjusted R-squared: 0.4603 ## F-statistic: 85.44 on 1 and 98 DF, p-value: 5.304e-15
# With both predictors, collinearity inflates the standard errors and
# neither predictor is individually significant.
summary(lm(weight ~ headCirc + footLength))
## ## Call: ## lm(formula = weight ~ headCirc + footLength) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.76444 -0.44618 0.04217 0.48183 1.88197 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -0.09859 0.07753 -1.272 0.2065 ## headCirc 0.19130 0.17838 1.072 0.2862 ## footLength 0.30547 0.18011 1.696 0.0931 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.7604 on 97 degrees of freedom ## Multiple R-squared: 0.472, Adjusted R-squared: 0.4612 ## F-statistic: 43.36 on 2 and 97 DF, p-value: 3.517e-14
Generalized linear models are a generalized form of regression.
A GLM does not assume a linear relationship between the dependent and independent variables. Instead, it assumes a linear relationship between the link function of the mean response and the independent variables (e.g. the logit in logistic regression).
Examples of GLMs include logistic regression and Poisson regression.
The link for logistic regression is \[\log\left(\frac{p}{1-p}\right) = \alpha + \beta x\] where \(p\) is the probability of “success”.
Logistic regression can be used to fit a regression with a binary response variable. E.g. Kidney failure Yes/No
Logistic regression models the probability of our response variable, e.g. “Kidney failure”, belonging to a particular category.
Gender: male or female
Transplant rejection of patients that were on the waiting list for the Stanford heart transplant program.
Score developed to assign hearts for transplant.
Is the score effective?
We have patients that reject heart within one year and survive without a rejection after one year.
J Crowley and M Hu (1977), Covariance analysis of heart transplant survival data. Journal of the American Statistical Association, 72, 27-36.
# Compare mismatch scores between outcome groups.
# The formula variables live in heart1, so pass it as the data argument
# (the original relied on score/outcome being in the search path).
boxplot(score ~ outcome, heart1, ylab = "Match score", xlab = "Survives")
# Logistic regression of rejection outcome on the mismatch score.
fit = glm(outcome ~ score, heart1, family = binomial)
summary(fit)
## ## Call: ## glm(formula = outcome ~ score, family = binomial, data = heart1) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.6415 -1.1254 0.6736 1.0449 1.4326 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -1.3115 0.8397 -1.562 0.1183 ## score 1.2153 0.6371 1.907 0.0565 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 57.843 on 41 degrees of freedom ## Residual deviance: 53.640 on 40 degrees of freedom ## (23 observations deleted due to missingness) ## AIC: 57.64 ## ## Number of Fisher Scoring iterations: 4
# Visual diagnostics for logistic regression aren't appropriate
plot(fit)
We have more information than just a mismatch score.
# Variables available in the heart transplant data.
colnames(heart1)
## [1] "birth.dt" "accept.dt" "tx.date" "fu.date" "fustat" ## [6] "surgery" "age" "futime" "wait.time" "transplant" ## [11] "mismatch" "hla.a2" "mscore" "reject"
Can we do better at predicting rejection?
# Logistic regression with all candidate predictors.
fit = glm(outcome ~ surgery + age + wait.time + mismatch + hla.a2 + mscore, heart1,
    family = binomial)
summary(fit)
## ## Call: ## glm(formula = outcome ~ surgery + age + wait.time + mismatch + ## hla.a2 + mscore, family = binomial, data = heart1) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.0951 -0.6967 0.3447 0.7897 1.7360 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -7.028984 3.568025 -1.970 0.0488 * ## surgery -1.633404 0.968140 -1.687 0.0916 . ## age 0.146055 0.063019 2.318 0.0205 * ## wait.time -0.008959 0.009938 -0.902 0.3673 ## mismatch -0.179936 0.577919 -0.311 0.7555 ## hla.a2 -0.392561 1.037202 -0.378 0.7051 ## mscore 1.307939 0.978552 1.337 0.1814 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 57.843 on 41 degrees of freedom ## Residual deviance: 41.316 on 35 degrees of freedom ## (23 observations deleted due to missingness) ## AIC: 55.316 ## ## Number of Fisher Scoring iterations: 4
# Forward stepwise selection (AIC) starting from the null logistic model.
fit = step(glm(outcome ~ 1, heart1, family = binomial), scope = list(upper = ~surgery +
    age + wait.time + mismatch + hla.a2 + mscore), direction = "forward")
## Start: AIC=59.84 ## outcome ~ 1 ## ## Df Deviance AIC ## + age 1 48.939 52.939 ## + surgery 1 52.797 56.797 ## + mscore 1 53.640 57.640 ## <none> 57.843 59.843 ## + wait.time 1 56.678 60.678 ## + hla.a2 1 57.065 61.065 ## + mismatch 1 57.838 61.838 ## ## Step: AIC=52.94 ## outcome ~ age ## ## Df Deviance AIC ## + surgery 1 45.324 51.324 ## + mscore 1 45.750 51.750 ## <none> 48.939 52.939 ## + wait.time 1 47.487 53.487 ## + hla.a2 1 48.771 54.771 ## + mismatch 1 48.869 54.869 ## ## Step: AIC=51.32 ## outcome ~ age + surgery ## ## Df Deviance AIC ## + mscore 1 42.788 50.788 ## <none> 45.324 51.324 ## + wait.time 1 43.790 51.790 ## + mismatch 1 45.009 53.009 ## + hla.a2 1 45.191 53.191 ## ## Step: AIC=50.79 ## outcome ~ age + surgery + mscore ## ## Df Deviance AIC ## <none> 42.788 50.788 ## + wait.time 1 41.674 51.674 ## + mismatch 1 42.359 52.359 ## + hla.a2 1 42.475 52.475
# Selected model: age + surgery + mscore.
summary(fit)
## ## Call: ## glm(formula = outcome ~ age + surgery + mscore, family = binomial, ## data = heart1) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.1036 -0.7670 0.3920 0.8025 1.8747 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -7.37758 3.04074 -2.426 0.0153 * ## age 0.13604 0.06027 2.257 0.0240 * ## surgery -1.56404 0.96117 -1.627 0.1037 ## mscore 1.17452 0.78748 1.491 0.1358 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 57.843 on 41 degrees of freedom ## Residual deviance: 42.788 on 38 degrees of freedom ## (23 observations deleted due to missingness) ## AIC: 50.788 ## ## Number of Fisher Scoring iterations: 4
In a logistic regression the regression coefficients can be interpreted as odds ratios. Specifically, \[OR = \exp(\beta_i) = e^{\beta_i}\] For every unit change in \(x_i\), the odds of disease are multiplied by \(e^{\beta_i}\).
# Exponentiate the coefficient estimates to obtain odds ratios.
# coef(fit) returns the same named Estimate vector as the first column of
# the summary coefficient matrix.
exp(coef(fit))
## (Intercept) age surgery mscore ## 0.0006251121 1.1457303643 0.2092890864 3.2365754902
Can we predict who still lives at home?
# Forward stepwise selection (AIC) for a logistic model of living at home.
fit1 <- glm(Home ~ ., survey2, family = binomial)
fit2 <- glm(Home ~ 1, survey2, family = binomial)
fit = step(glm(Home ~ 1, survey2, family = binomial), direction = "forward", scope = list(lower = fit2,
    upper = fit1))
## Start: AIC=227.99 ## Home ~ 1 ## ## Df Deviance AIC ## + Cohort 4 211.86 221.86 ## + Grade 1 223.97 227.97 ## <none> 225.99 227.99 ## + Working 1 224.31 228.31 ## + Clubs 1 225.02 229.02 ## + FBFriends 1 225.47 229.47 ## + StudyTime 1 225.57 229.57 ## + ExerciseTime 1 225.60 229.60 ## + Pet 1 225.70 229.70 ## + Season 3 223.70 231.70 ## ## Step: AIC=221.86 ## Home ~ Cohort ## ## Df Deviance AIC ## <none> 211.86 221.86 ## + Working 1 210.57 222.57 ## + Grade 1 210.91 222.91 ## + Clubs 1 210.95 222.95 ## + Pet 1 211.15 223.15 ## + ExerciseTime 1 211.32 223.32 ## + FBFriends 1 211.48 223.48 ## + StudyTime 1 211.79 223.79 ## + Season 3 210.88 226.88
# Only Cohort is selected.
summary(fit)
## ## Call: ## glm(formula = Home ~ Cohort, family = binomial, data = survey2) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.8930 -1.2601 0.7002 1.0969 1.2814 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.1924 0.2352 0.818 0.4133 ## CohortAMED3002.2018 1.4171 0.8095 1.751 0.0800 . ## CohortAMED3002.2019 -0.4335 0.4665 -0.929 0.3527 ## CohortAMED3002.2020 1.0116 0.4469 2.263 0.0236 * ## CohortAMED3002.2021 1.0886 0.5575 1.952 0.0509 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 225.99 on 171 degrees of freedom ## Residual deviance: 211.86 on 167 degrees of freedom ## AIC: 221.86 ## ## Number of Fisher Scoring iterations: 4
# Odds ratios for living at home by cohort (relative to STAT2012).
exp(coef(summary(fit))[, 1])
## (Intercept) CohortAMED3002.2018 CohortAMED3002.2019 CohortAMED3002.2020 ## 1.2121212 4.1250000 0.6482143 2.7500000 ## CohortAMED3002.2021 ## 2.9700000
# Cross-tabulation underlying the cohort odds ratios.
table(survey2$Home, survey2$Cohort)
## ## STAT2012 AMED3002.2018 AMED3002.2019 AMED3002.2020 AMED3002.2021 ## No 33 2 14 9 5 ## Yes 40 10 11 30 18
# Logistic regression with the single categorical predictor Cohort.
fit <- glm(Home ~ Cohort, survey2, family = binomial)
summary(fit)
## ## Call: ## glm(formula = Home ~ Cohort, family = binomial, data = survey2) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.8930 -1.2601 0.7002 1.0969 1.2814 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.1924 0.2352 0.818 0.4133 ## CohortAMED3002.2018 1.4171 0.8095 1.751 0.0800 . ## CohortAMED3002.2019 -0.4335 0.4665 -0.929 0.3527 ## CohortAMED3002.2020 1.0116 0.4469 2.263 0.0236 * ## CohortAMED3002.2021 1.0886 0.5575 1.952 0.0509 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 225.99 on 171 degrees of freedom ## Residual deviance: 211.86 on 167 degrees of freedom ## AIC: 221.86 ## ## Number of Fisher Scoring iterations: 4
# A chi-squared test of the same 2x5 table gives a comparable conclusion.
# Small expected counts trigger the approximation warning; Fisher's exact
# test would be an alternative here.
chisq.test(table(survey2$Home, survey2$Cohort))
## Warning in chisq.test(table(survey2$Home, survey2$Cohort)): Chi-squared ## approximation may be incorrect
## ## Pearson's Chi-squared test ## ## data: table(survey2$Home, survey2$Cohort) ## X-squared = 13.697, df = 4, p-value = 0.008326
Transplant rejection of patients that were on the waiting list for the Stanford heart transplant program.
Score developed to assign hearts for transplant.
Is the score effective?
We have patients that reject heart within one year and survive without a rejection after one year.
We have information on how long someone has been followed up and whether they had a rejection.
J Crowley and M Hu (1977), Covariance analysis of heart transplant survival data. Journal of the American Statistical Association, 72, 27-36.
We will fit a Cox proportional-hazards model.
# Cox proportional-hazards model for time to rejection.
# NOTE(review): Surv() and coxph() come from the survival package — this
# assumes library(survival) was loaded earlier; confirm.
fit = coxph(Surv(futime, reject) ~ surgery + age + wait.time + mismatch + hla.a2 +
    mscore, heart1)
fit
## Call: ## coxph(formula = Surv(futime, reject) ~ surgery + age + wait.time + ## mismatch + hla.a2 + mscore, data = heart1) ## ## coef exp(coef) se(coef) z p ## surgery -0.982678 0.374307 0.625297 -1.572 0.11606 ## age 0.095668 1.100393 0.030633 3.123 0.00179 ## wait.time -0.009470 0.990575 0.007487 -1.265 0.20590 ## mismatch -0.110350 0.895521 0.229665 -0.480 0.63088 ## hla.a2 0.275834 1.317629 0.469523 0.587 0.55688 ## mscore 0.780048 2.181577 0.379562 2.055 0.03987 ## ## Likelihood ratio test=27.3 on 6 df, p=0.000127 ## n= 65, number of events= 29
We use the “Surv” object to represent survival data.
# A Surv object prints censored follow-up times with a trailing "+".
with(heart1, Surv(futime, reject))
## [1] 15+ 38+ 674 57 152+ 80 1386 307 42+ 27 1031 50 ## [13] 732 218 1799+ 71+ 851 76 1586+ 1571+ 99 65 4+ 1407+ ## [25] 1321+ 44+ 995 71 1141+ 284 60 941+ 342 915+ 67 841+ ## [37] 583+ 77 284+ 67 669+ 29 619+ 595+ 89 16+ 544+ 514+ ## [49] 95 481+ 444+ 79+ 333 396+ 109 369+ 206 185+ 339+ 264+ ## [61] 164 179+ 130+ 108+ 38+
We will fit a Cox proportional-hazards model with step-wise model selection.
# Backward stepwise selection (AIC is the default criterion) on the full
# Cox model.
fit = step(coxph(Surv(futime, reject) ~ surgery + age + wait.time + mismatch + hla.a2 +
    mscore, heart1))
## Start: AIC=185.23 ## Surv(futime, reject) ~ surgery + age + wait.time + mismatch + ## hla.a2 + mscore ## ## Df AIC ## - mismatch 1 183.47 ## - hla.a2 1 183.58 ## <none> 185.23 ## - wait.time 1 185.41 ## - surgery 1 186.31 ## - mscore 1 187.06 ## - age 1 196.14 ## ## Step: AIC=183.47 ## Surv(futime, reject) ~ surgery + age + wait.time + hla.a2 + mscore ## ## Df AIC ## - hla.a2 1 181.73 ## <none> 183.47 ## - wait.time 1 183.64 ## - surgery 1 184.49 ## - mscore 1 185.07 ## - age 1 194.24 ## ## Step: AIC=181.73 ## Surv(futime, reject) ~ surgery + age + wait.time + mscore ## ## Df AIC ## <none> 181.73 ## - wait.time 1 181.87 ## - surgery 1 182.59 ## - mscore 1 185.15 ## - age 1 192.25
# Selected Cox model.
fit
## Call: ## coxph(formula = Surv(futime, reject) ~ surgery + age + wait.time + ## mscore, data = heart1) ## ## coef exp(coef) se(coef) z p ## surgery -0.946733 0.388007 0.623963 -1.517 0.12919 ## age 0.095073 1.099739 0.030895 3.077 0.00209 ## wait.time -0.009528 0.990517 0.007573 -1.258 0.20832 ## mscore 0.795538 2.215632 0.328914 2.419 0.01558 ## ## Likelihood ratio test=26.81 on 4 df, p=2.172e-05 ## n= 65, number of events= 29
If we binarize our score we can use a Kaplan-Meier curve.
# Binarise the mismatch score at its median and compare Kaplan-Meier curves.
mscore2 = as.factor(heart1$mscore > median(heart1$mscore))
plot(survfit(Surv(futime/365, reject) ~ mscore2, heart1), col = c(1, 2), xlab = "time",
    ylab = "Probability of success", mark.time = TRUE)
# Strata are ordered FALSE (small mscore) then TRUE (large mscore) and are
# drawn in colours 1 and 2 respectively, so the legend must use the same
# order: col = 1:2 (the original col = 2:1 swapped the legend colours).
legend("topright", c("small mscore", "large mscore"), col = 1:2, lty = 1)