# January 24, 2024

library(MASS)      # where a convenient dataset lives
library(tidyverse)
library(lmSupport)

data("Boston")     # the convenient dataset

# plotting the nonlinear relationship
ggplot(data = Boston, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(col = "red") +
  xlab("proportion of population that is low SES") +
  ylab("median housing price")

# transforming the predictor and outcome linearizes this very well, but X and Y
# are now less transparent
ggplot(data = Boston, aes(x = log(lstat), y = sqrt(medv))) +
  geom_point() +
  stat_smooth(col = "red") +
  xlab("LOG proportion of population that is low SES") +
  ylab("SQRT median housing price")

###

d <- read.csv("https://whlevine.hosted.uark.edu/psyc5143/ch7.csv", header = T)

# mean-centering miles
d <- d %>% mutate(MILES.c = MILES - mean(MILES))

# adding miles-squared to the data
d <- d %>% mutate(M2 = MILES.c^2)

polyModel <- lm(TIME ~ MILES.c + M2, d)
summary(polyModel)

# quantities needed to graph the tangent line at mean miles
slope0 <- coef(polyModel)[2] # the slope at mean miles (MILES.c = 0)
int0   <- coef(polyModel)[1] + coef(polyModel)[2]*(0 - mean(d$MILES)) # the y-intercept of the tangent line at mean miles

# scatterplot with some extra info added
ggplot(data = d, aes(x = MILES, y = TIME)) +
  geom_point() +
  geom_vline(xintercept = mean(d$MILES)) +                    # mean MILES
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = F) + # the model, visualized
  geom_abline(slope = slope0, intercept = int0, col = "red")  # the tangent line at mean MILES

# doing the analysis sequentially

# linear-only model
m1 <- lm(TIME ~ MILES.c, d)
summary(m1)

# adding the quadratic term
m2 <- lm(TIME ~ MILES.c + I(MILES.c^2), d)
summary(m2)

# effect sizes, model comparison
modelEffectSizes(m1)
modelEffectSizes(m2)
modelCompare(m1, m2)
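
# An aside, not part of the original notes: if lmSupport isn't available, base R
# offers a comparable nested-model test of the quadratic increment; the R^2-change
# line below is a sketch of what that increment represents.
anova(m1, m2)                                   # F test for adding the quadratic term
summary(m2)$r.squared - summary(m1)$r.squared   # change in R^2 due to MILES.c^2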
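
# Where slope0 comes from, sketched as a worked step (the helper name tangentSlope
# is made up for illustration): the fitted curve is
#   TIME-hat = b0 + b1*MILES.c + b2*MILES.c^2,
# so its derivative is b1 + 2*b2*MILES.c, which reduces to b1 at the mean (MILES.c = 0).
tangentSlope <- function(model, x.c) {
  b <- coef(model)
  unname(b[2] + 2 * b[3] * x.c)   # derivative of the fitted quadratic at centered value x.c
}
tangentSlope(polyModel, 0)        # matches slope0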
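
# Why center MILES before squaring? In an uncentered quadratic the "linear"
# coefficient is the slope at MILES = 0, which may lie outside the data; centering
# moves that interpretation to the mean. A quick comparison (an aside, not in the
# original notes):
m2.raw <- lm(TIME ~ MILES + I(MILES^2), d)
coef(m2.raw)   # linear term = instantaneous slope at MILES = 0
coef(m2)       # linear term = instantaneous slope at mean MILES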
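
# An alternative parameterization, sketched as an aside: poly() builds orthogonal
# linear and quadratic terms, so the two predictors are uncorrelated by construction.
# The fitted curve is identical to m2; only the coefficients differ.
m2.orth <- lm(TIME ~ poly(MILES, 2), d)
all.equal(fitted(m2), fitted(m2.orth))   # should be TRUE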