# Spring 2024, Drill #1: polynomial regression

# Libraries ---------------------------------------------------------------

library(tidyverse)
library(lmSupport)   # for modelEffectSizes(); NOTE(review): archived on CRAN
library(ggeffects)

# A hypothetical data set with the following variables:
# SubID        = a subject identifier number
# Years        = years since PhD
# Productivity = a z-scored measure of professor productivity
d <- read_tsv("https://whlevine.hosted.uark.edu/psyc5143/productivity.dat")

# Briefly explore the data
hist(d$Years)
hist(d$Productivity)
cor(d$Years, d$Productivity)
plot(d$Years, d$Productivity)

# We want to consider whether we can predict productivity using experience (as
# measured by years)
linearModel <- lm(Productivity ~ Years, d)
summary(linearModel)
modelEffectSizes(linearModel)

# There's a strong, positive relationship, with about 17.6% of the variance
# being explained

# How can we tell if there is a nonlinear relationship? A residuals plot will
# help
plot(linearModel, which = 1) # which = 1 specifies only the first of the four
                             # plots produced here

# The LOESS fit is not at all flat

# This can be visualized in the raw data, too, using ggplot
ggplot(d, aes(x = Years, y = Productivity)) +
  geom_point() +
  geom_smooth(se = FALSE) +            # adds a LOESS curve
  geom_smooth(method = "lm", formula = y ~ x,
              se = FALSE, colour = "red") # adds a best-fitting line

# It looks like for the first 20 years or so, productivity increases, then it
# levels off, and then it starts to decline

# How can we capture this changing relationship between years and productivity?
# By including a predictor that is a power of the predictor; in this case,
# squaring it will do the trick because the relationship changes direction
# (positive to negative) once

# Before we square the predictor, we'll mean-center it for the same reason we
# mean-centered predictors when testing interactions: it will make
# interpretation of slopes easier
d <- d %>% mutate(years.c = Years - mean(Years))

polyModel <- lm(Productivity ~ years.c + I(years.c^2), d)

# What is up with that I function?! The website below does a nice job explaining
# it (copy and paste if interested)
# https://stackoverflow.com/questions/24192428/what-does-the-capital-letter-i-in-r-linear-regression-formula-mean

# An example that might help is comparing the two linear models below
# y ~ a + (b + c)  # will give three slopes, one each for a, b, and c
# y ~ a + I(b + c) # will give two slopes, one for a and one for b + c

# Another example that might help
# y ~ a * b    # will give three slopes, one each for a, b, and a:b
# y ~ I(a * b) # will give one slope, one for a*b

summary(polyModel)

# The intercept is the predicted productivity at the mean of Years, illustrated
# here
ggplot(d, aes(x = Years, y = Productivity)) +
  geom_point() +
  geom_smooth(method = "lm",              # what fit is wanted?
              formula = y ~ x + I(x^2),   # what's the model (generically)
              se = FALSE) +
  geom_vline(xintercept = mean(d$Years))  # add a vertical line at the X mean

# The linear effect of years is the years.c slope; it indicates the predicted
# change in productivity for an increase in one year of experience ONLY FOR
# SOMEONE AT THE MEAN NUMBER OF YEARS OF EXPERIENCE (this should sound a lot
# like the constraint on the interpretation of a slope in interaction models)

# The quadratic effect of years is the I(years.c^2) slope; it indicates half the
# rate of change in the linear slope of years for each additional year of
# experience; that is, the linear slope goes down a little bit every year, and
# will eventually be negative

# What if we were to use non-centered years and its square as a predictor?
polyModel2 <- lm(Productivity ~ Years + I(Years^2), d)
summary(polyModel2)

# The intercept is now (very) different, illustrated here
ggplot(d, aes(x = Years, y = Productivity)) +
  geom_point() +
  geom_smooth(method = "lm",              # what fit is wanted?
              formula = y ~ x + I(x^2),   # what's the model (generically)
              se = FALSE) +
  geom_vline(xintercept = 0)              # add a vertical line at 0

# The linear slope is now much steeper; why? Because it's the slope at 0 years
# of experience rather than at the mean years of experience
# But the quadratic slope didn't change

# An alternative method for adding the polynomial term:
d <- d %>% mutate(years.c2 = years.c^2)
# This is the "long" way, where we include the squared term in the dataframe
# instead of creating it in the regression model

# Accompanying model syntax:
polyModel3 <- lm(Productivity ~ years.c + years.c2, d)
summary(polyModel3)

# Compare to the first polynomial model:
summary(polyModel)

# Last, we can find point (linear) slopes for different interesting values of
# years by centering it at different values
d <- d %>% mutate(years20 = Years - 20,
                  years40 = Years - 40)

polyModel20 <- lm(Productivity ~ years20 + I(years20^2), d)
coef(polyModel20) # linear slope at 20 years of experience = 0.05

polyModel40 <- lm(Productivity ~ years40 + I(years40^2), d)
coef(polyModel40) # linear slope at 40 years of experience = -0.03