# January 24, 2024

library(MASS)      # where a convenient dataset lives
library(tidyverse)
library(lmSupport)

data("Boston")     # the convenient dataset

# plotting the nonlinear relationship
ggplot(data = Boston, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(col = "red") +
  xlab("proportion of population that is low SES") +
  ylab("median housing price")

# transforming the predictor and outcome linearizes this very well, but X and Y
# are now less transparent
ggplot(data = Boston, aes(x = log(lstat), y = sqrt(medv))) +
  geom_point() +
  stat_smooth(col = "red") +
  xlab("LOG proportion of population that is low SES") +
  ylab("SQRT median housing price")

###

d <- read.csv("https://whlevine.hosted.uark.edu/psyc5143/ch7.csv", header = T)

# mean-centering miles
d <- d %>% mutate(MILES.c = MILES - mean(MILES))

# adding miles-squared to the data
d <- d %>% mutate(M2 = MILES.c^2)

polyModel <- lm(TIME ~ MILES.c + M2, d)
summary(polyModel)

# quantities needed to graph the tangent line at mean miles
slope0 <- coef(polyModel)[2] # the slope at mean miles (MILES.c = 0)
int0   <- coef(polyModel)[1] + coef(polyModel)[2]*(0 - mean(d$MILES)) # the y-intercept of the tangent line at mean miles

# scatterplot with some extra info added
ggplot(data = d, aes(x = MILES, y = TIME)) +
  geom_point() +
  geom_vline(xintercept = mean(d$MILES)) +                    # mean MILES
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = F) + # the model, visualized
  geom_abline(slope = slope0, intercept = int0, col = "red")  # the tangent line at mean MILES

# doing the analysis sequentially

# linear-only model
m1 <- lm(TIME ~ MILES.c, d)
summary(m1)

# adding the quadratic term
m2 <- lm(TIME ~ MILES.c + I(MILES.c^2), d)
summary(m2)

# effect sizes, model comparison
modelEffectSizes(m1)
modelEffectSizes(m2)
modelCompare(m1, m2)
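
# An aside, not part of the original notes: if lmSupport isn't available, base R
# offers a comparable nested-model test of the quadratic increment; the R^2-change
# line below is a sketch of what that increment represents.
anova(m1, m2)                                   # F test for adding the quadratic term
summary(m2)$r.squared - summary(m1)$r.squared   # change in R^2 due to MILES.c^2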
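
# Where slope0 comes from, sketched as a worked step (the helper name tangentSlope
# is made up for illustration): the fitted curve is
#   TIME-hat = b0 + b1*MILES.c + b2*MILES.c^2,
# so its derivative is b1 + 2*b2*MILES.c, which reduces to b1 at the mean (MILES.c = 0).
tangentSlope <- function(model, x.c) {
  b <- coef(model)
  unname(b[2] + 2 * b[3] * x.c)   # derivative of the fitted quadratic at centered value x.c
}
tangentSlope(polyModel, 0)        # matches slope0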
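
# Why center MILES before squaring? In an uncentered quadratic the "linear"
# coefficient is the slope at MILES = 0, which may lie outside the data; centering
# moves that interpretation to the mean. A quick comparison (an aside, not in the
# original notes):
m2.raw <- lm(TIME ~ MILES + I(MILES^2), d)
coef(m2.raw)   # linear term = instantaneous slope at MILES = 0
coef(m2)       # linear term = instantaneous slope at mean MILES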
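
# An alternative parameterization, sketched as an aside: poly() builds orthogonal
# linear and quadratic terms, so the two predictors are uncorrelated by construction.
# The fitted curve is identical to m2; only the coefficients differ.
m2.orth <- lm(TIME ~ poly(MILES, 2), d)
all.equal(fitted(m2), fitted(m2.orth))   # should be TRUE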