library(tidyverse)

# Drill 7

# cover story/data dictionary
# a comparison of the relative effectiveness of three software packages designed
# to help students get better at math problem solving

# variables in the data file linked below
# iv  = the three packages (numeric; needs to be factorized or converted to dummy variables)
# cov = a measure of math ability
# dv  = scores on a standardized math problem-solving test

read.csv("http://whlevine.hosted.uark.edu/psyc5143/ancova2.csv") -> d
glimpse(d)

# for the sake of the drill, to experience working with R's defaults, let's make
# a factor version of the iv and use the default dummy coding
d$ivF <- as.factor(d$iv)
contrasts(d$ivF) # checking the dummy coding: group 1 is the reference group

# the model without the IV
lm(dv ~ cov, d) -> step1

# the model with the IV
lm(dv ~ cov + ivF, d) -> step2

# ANCOVA as it's conventionally reported is the F-change from step1 to step2
anova(step1, step2)

# we can also just get the F-ratio for the IV here
anova(step2)

# we can get more info from the summary of step2
summary(step2)

# centering the covariate at its mean will make the intercept more useful
# base R!
d$cov.c <- d$cov - mean(d$cov) # could use mutate, of course

lm(dv ~ cov.c + ivF, d) -> step2.c
summary(step2.c)

# adjusted means a couple of ways (this package might need to be installed)
# install.packages("effects")
library(effects)
effect(term = "ivF", mod = step2)
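# a second way, sketched by hand in base R: ask the model for each group's
# predicted dv at the mean of the covariate; these should match the effect()
# output above (newdat is just an illustrative name)
newdat <- data.frame(cov = mean(d$cov),          # hold the covariate at its mean
                     ivF = factor(levels(d$ivF))) # one row per group
predict(step2, newdata = newdat)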
# the stuff below is a little fancy, but is adapted from
# https://stackoverflow.com/questions/31903606/ggplot2-plotting-regression-lines-with-different-intercepts-but-with-same-slope

d <- cbind(d, pred = fitted(step2)) # store predicted scores in the data frame

# the cbind() command above is a bit old school; we can also do things using mutate
d <- d %>% mutate(pred_new_school = fitted(step2))

head(d) # let's see how our data frame updated

# an example of where our predicted scores come from
70.38954 + (-5.3666667*1.6777)
step2.c$coefficients[1] + ((d[1,5]) * step2.c$coefficients[2]) # d[1,5] is row 1's cov.c

tail(d)

# another example
70.38954 + (-7.5686) + (1.6333333*1.6777)
step2.c$coefficients[1] + step2.c$coefficients[4] + ((d[30,5]) * step2.c$coefficients[2])

# find the mean of the covariate and store it
Mcov <- mean(d$cov)

# create a scatterplot *assuming* equal slopes of the cov-dv relationship
ggplot(d, aes(y = dv, x = cov, col = ivF)) +  # create a scatterplot, using groups to "color" points
  geom_point() +                              # add points to the scatterplot
  geom_line(mapping = aes(y = pred)) +        # puts a line through the predicted scores for each group
  theme_bw() +                                # a graph style that I like
  scale_color_grey(start = 0.7, end = 0.3) +  # to grayscale the groups
  geom_vline(xintercept = Mcov)               # add a vertical line at the mean of the covariate

# create a scatterplot that does not assume equal slopes (to help visualize how
# much the slopes differ across groups)
ggplot(d, aes(y = dv, x = cov, col = ivF)) +  # create a scatterplot, using groups to "color" points
  geom_point() +                              # add points to the scatterplot
  geom_smooth(method = "lm", fill = NA) +     # adds a separate regression line for each group
  theme_bw() +                                # a graph style that I like
  scale_color_grey(start = 0.7, end = 0.3) +  # to grayscale the groups
  geom_vline(xintercept = Mcov)               # add a vertical line at the mean of the covariate

# testing the equal-slopes assumption
lm(dv ~ cov.c + ivF + cov.c:ivF, d) -> equalSlopesOrNo
anova(equalSlopesOrNo)   # a NON-significant interaction is a win here!
summary(equalSlopesOrNo) # the cov.c slope is for the reference group, and the
                         # cov.c:ivF2 and cov.c:ivF3 slopes are how far the other
                         # two group slopes are from it

# testing the independence of groups and the covariate
anova(lm(cov ~ ivF, d))
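# if that F looks worrisome, a quick descriptive follow-up (a sketch, base R):
# compare the covariate means across groups to see where they differ
aggregate(cov ~ ivF, data = d, FUN = mean)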