# Spring 2024, Drill #1: polynomial regression

# Libraries ---------------------------------------------------------------

library(tidyverse)
library(lmSupport)   # for modelEffectSizes(); NOTE(review): archived on CRAN
library(ggeffects)

# A hypothetical data set with the following variables:
# SubID        = a subject identifier number
# Years        = years since PhD
# Productivity = a z-scored measure of professor productivity
d <- read_tsv("https://whlevine.hosted.uark.edu/psyc5143/productivity.dat")

# Briefly explore the data
hist(d$Years)
hist(d$Productivity)
cor(d$Years, d$Productivity)
plot(d$Years, d$Productivity)

# We want to consider whether we can predict productivity using experience (as
# measured by years)
linearModel <- lm(Productivity ~ Years, d)
summary(linearModel)
modelEffectSizes(linearModel)

# There's a strong, positive relationship, with about 17.6% of the variance
# being explained

# How can we tell if there is a nonlinear relationship? A residuals plot will
# help
plot(linearModel, which = 1) # which = 1 specifies only the first of the four
                             # plots produced here

# The LOESS fit is not at all flat

# This can be visualized in the raw data, too, using ggplot
ggplot(d, aes(x = Years, y = Productivity)) +
  geom_point() +
  geom_smooth(se = FALSE) +            # adds a LOESS curve
  geom_smooth(method = "lm", formula = y ~ x,
              se = FALSE, colour = "red") # adds a best-fitting line

# It looks like for the first 20 years or so, productivity increases, then it
# levels off, and then it starts to decline

# How can we capture this changing relationship between years and productivity?
# By including a predictor that is a power of the predictor; in this case,
# squaring it will do the trick because the relationship changes direction
# (positive to negative) once

# Before we square the predictor, we'll mean-center it for the same reason we
# mean-centered predictors when testing interactions: it will make
# interpretation of slopes easier
d <- d %>% mutate(years.c = Years - mean(Years))

polyModel <- lm(Productivity ~ years.c + I(years.c^2), d)

# What is up with that I function?! The website below does a nice job explaining
# it (copy and paste if interested)
# https://stackoverflow.com/questions/24192428/what-does-the-capital-letter-i-in-r-linear-regression-formula-mean

# An example that might help is comparing the two linear models below
# y ~ a + (b + c)  # will give three slopes, one each for a, b, and c
# y ~ a + I(b + c) # will give two slopes, one for a and one for b + c

# Another example that might help
# y ~ a * b    # will give three slopes, one each for a, b, and a:b
# y ~ I(a * b) # will give one slope, one for a*b

summary(polyModel)

# The intercept is the predicted productivity at the mean of Years, illustrated
# here
ggplot(d, aes(x = Years, y = Productivity)) +
  geom_point() +
  geom_smooth(method = "lm",              # what fit is wanted?
              formula = y ~ x + I(x^2),   # what's the model (generically)
              se = FALSE) +
  geom_vline(xintercept = mean(d$Years))  # add a vertical line at the X mean

# The linear effect of years is the years.c slope; it indicates the predicted
# change in productivity for an increase in one year of experience ONLY FOR
# SOMEONE AT THE MEAN NUMBER OF YEARS OF EXPERIENCE (this should sound a lot
# like the constraint on the interpretation of a slope in interaction models)

# The quadratic effect of years is the I(years.c^2) slope; it indicates half the
# rate of change in the linear slope of years for each additional year of
# experience; that is, the linear slope goes down a little bit every year, and
# will eventually be negative

# What if we were to use non-centered years and its square as a predictor?
polyModel2 <- lm(Productivity ~ Years + I(Years^2), d)
summary(polyModel2)

# The intercept is now (very) different, illustrated here
ggplot(d, aes(x = Years, y = Productivity)) +
  geom_point() +
  geom_smooth(method = "lm",              # what fit is wanted?
              formula = y ~ x + I(x^2),   # what's the model (generically)
              se = FALSE) +
  geom_vline(xintercept = 0)              # add a vertical line at 0

# The linear slope is now much steeper; why? Because it's the slope at 0 years
# of experience rather than at the mean years of experience
# But the quadratic slope didn't change

# An alternative method for adding the polynomial term:
d <- d %>% mutate(years.c2 = years.c^2)
# This is the "long" way, where we include the squared term in the dataframe
# instead of creating it in the regression model

# Accompanying model syntax:
polyModel3 <- lm(Productivity ~ years.c + years.c2, d)
summary(polyModel3)

# Compare to the first polynomial model:
summary(polyModel)

# Last, we can find point (linear) slopes for different interesting values of
# years by centering it at different values
d <- d %>% mutate(years20 = Years - 20,
                  years40 = Years - 40)

polyModel20 <- lm(Productivity ~ years20 + I(years20^2), d)
coef(polyModel20) # linear slope at 20 years of experience = 0.05

polyModel40 <- lm(Productivity ~ years40 + I(years40^2), d)
coef(polyModel40) # linear slope at 40 years of experience = -0.03