# March 13, 2024

# install.packages("faux")

library(tidyverse)
library(lmSupport)
library(car)
library(faux) # for creating variables correlated with an existing variable

# fake data
set.seed(2001)
treatment <- round(rnorm(n = 10, mean = 12, sd = 3), 0)
control <- round(rnorm(n = 10, mean = 10, sd = 4), 0)

# group means
mean(treatment); mean(control)

# putting the data into a data frame/tibble
d <- tibble(group = c(rep("tx", 10), rep("ctrl", 10)),
						correct = c(treatment, control))

# adding a contrast code
d <- d %>% 
	mutate(con1 = ifelse(group == "ctrl", -1/2, 1/2))

# creating a fake covariate ("foreign language exposure")
model1 <- lm(correct ~ con1, d)
e <- resid(model1)
FLE <- round(rnorm_pre(x = e, mu = 50, sd = 10, r = 0.2), 0)

# add FLE to the tibble, mean-centered
d <- d %>% 
	mutate(FLE = FLE - mean(FLE))

# the ANCOVA model
model2 <- lm(correct ~ con1 + FLE, d) # via contrast codes
model3 <- lm(correct ~ groupF + FLE, d)

# homogeneity of slopes

# ggplot prefers factors
d <- d %>% 
	mutate(groupF = factor(group))

# this is the ANCOVA model with lines of best fit that have the same slope

# we need predicted scores from the ANCOVA model in the data
d <- d %>% 
	mutate(pred = fitted(model2))

ggplot(d, aes(y = correct, x = FLE, col = group)) +
	geom_point() +
	geom_line(mapping = aes(y = pred), size = 2) +
	theme_bw() +
	geom_vline(xintercept = mean(d$FLE))

coef(model3)
	
# allowing slopes to vary across groups
ggplot(d, aes(y = correct, x = FLE, col = group)) +
	geom_point() +
	geom_smooth(method = "lm", fill = NA) +
	theme_bw() +
	geom_vline(xintercept = mean(d$FLE))

# the model that allows slopes to vary
model4 <- lm(correct ~ groupF*FLE, d)
coef(model4)

# testing independence of group and covariate
summary(lm(FLE ~ groupF, d))
# NS! good!

###

# interpretation complications!

d2 <- read.csv("https://whlevine.hosted.uark.edu/psyc5143/april13.csv")

# actual group means
d2 %>% group_by(group) %>% summarise(M = mean(Y))

# mean-centering covariates
d2 <- d2 %>% 
	mutate(Zc = Z - mean(Z),
				 Zbadc = Zbad - mean(Zbad))

# the typical model
summary(lm(Y ~ X1 + X2, d2))

# the ANCOVA
summary(lm(Y ~ X1 + X2 + Zc, d2))

# getting adjusted means using the effects package
library(effects)

ANCOVA <- lm(Y ~ group + Zc, d2)

effect("group", ANCOVA)

# using the "bad" covariate

# what's bad about it? it's not independent of group
summary(aov(Zbad ~ group, d2))

ANCOVA2 <- lm(Y ~ group + Zbadc, d2)

effect("group", ANCOVA2)

# adjusted means will be far from actual means when the covariate is related to
# group membership

# what causes the problem? adjusted means are based on the predicted score for
# each group for the overall mean of the covariate

# the horizontal lines illustrate Y means for each group
# the verticle lines illustrae Zbad means for each group

ggplot(d2, aes(y = Y, x = Zbad, col = group)) +
	geom_point() +
	geom_hline(yintercept = mean(d2$Y[d2$group == "A1"]), color = "#F8766D") +
	geom_hline(yintercept = mean(d2$Y[d2$group == "A2"]), color = "#00BA38") +
	geom_hline(yintercept = mean(d2$Y[d2$group == "A3"]), color = "#619CFF") +
	geom_vline(xintercept = mean(d2$Zbad[d2$group == "A1"]), color = '#F8766D') +
	geom_vline(xintercept = mean(d2$Zbad[d2$group == "A2"]), color = '#00BA38') +
	geom_vline(xintercept = mean(d2$Zbad[d2$group == "A3"]), color = '#619CFF')

# what are the group means of the covariate?
aggregate(Zbad ~ group, d2, mean)

# the graph generated below illustrates why this is a problem
ggplot(d2, aes(y = Y, x = Zbadc, col = group)) +
	geom_point() +
	geom_abline(aes(intercept = 7.1, slope = 0.2686672), color = "#F8766D") +
	geom_abline(aes(intercept = 19.5, slope = 0.2686672), color = "#00BA38") +
	geom_abline(aes(intercept = 33.4, slope = 0.2688672), color = "#619CFF") +
	geom_vline(xintercept = 0) +
	geom_hline(yintercept = mean(d$Y[d$group == "A1"]), color = "#F8766D") +
	geom_hline(yintercept = mean(d$Y[d$group == "A2"]), color = "#00BA38") +
	geom_hline(yintercept = mean(d$Y[d$group == "A3"]), color = "#619CFF")