# Drill #2: single-categorical predictor models

# thanks to Austin Eubanks for some of this code

library(tidyverse)
library(lmSupport)

df <- read_csv("https://whlevine.hosted.uark.edu/psyc5143/royer.csv")

# id
# gender (two levels: male and female)
# grade (3 through 8)
# addacc (accuracy on addition questions)
# subacc (accuracy on subtraction questions)
# multacc (accuracy on multiplication questions)
# acc_mean (overall accuracy)

# we'll be comparing boys and girls (3rd graders) on the overall accuracy here
# simply to illustrate how to do it

# we'll do this by giving the groups numeric values several ways

df <- df %>% 
  mutate(genderF = as.factor(gender),                      # R's default
         gender01 = ifelse(gender == "Female", 0, 1),      # female = 0, male = 1
         gender55 = ifelse(gender == "Female", -0.5, 0.5), # female = -0.5, male = 0.5
         gender11 = ifelse(gender == "Female", -1, 1))     # female = -1, male = 1

# let's see what the new variables look like
head(df)
tail(df)

# so far, so good; but we need to do more to examine the factor
levels(df$genderF)      # gives verbal labels

contrasts(df$genderF)   # okay; I RECOMMEND USING THIS FUNCTION EVERY SINGLE
                        # TIME YOU HAVE A FACTOR VARIABLE - EVERY SINGLE TIME!!
                        # ASSUME NOTHING!! CHECK ALWAYS!! VERIFY VERIFY VERIFY!!

# some descriptives

df %>% 
  filter(grade == 3) %>% 
  group_by(genderF) %>% 
  summarise(M_acc = mean(acc_mean),
            SD_acc = sd(acc_mean),
            n = length(acc_mean))   # useful little table!

df %>% filter(grade == 3) %>% summarise(overall_M_acc = mean(acc_mean))

# 3rd-grade males (87.6%) score higher than 3rd-grade females (83.8%) by 3.8
# percentage points; the overall mean is 85.6%

# examining data
plot(df$genderF, df$acc_mean)  # base R boxplots

# jittered "scatterplot"
df %>% 
  filter(grade == 3) %>% 
  ggplot(aes(x = genderF, y = acc_mean)) + 
  geom_jitter(width = .1) +
  theme_bw()

# models
modelFactor <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ genderF)

model01 <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ gender01)

model55 <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ gender55)

model11 <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ gender11)

# let's just look at parameters first (significance is less important)
coef(modelFactor)
coef(model01)
coef(model55)
coef(model11)
# all but the the last model reproduce the (approximately) 3.8 percentage point
# difference between groups in their slopes

# two of the models have the female mean (83.8%) as their intercept

# two of the models have the overall mean (85.6%) as their intercept

# the -1, +1 model has half the 3.8 percentage point difference for its slope

# let's look at p-values for the gender slope
summary(modelFactor)
summary(model01)
summary(model55)
summary(model11)
# same p-value in every case!

# we could do the male vs female comparison as a t-test

df %>% 
  filter(grade == 3) %>% 
  t.test(data = ., acc_mean ~ genderF, var.equal = T)

# what's up with the var.equal = T thing? the t.test function defaults to what
# is usually called Welch's t-test, which does NOT assume that the equal
# variance assumption is met; running the t-test function without fixes the
# problem by adjusting df downward (to reduce the chance of a Type I error)

df %>% 
  filter(grade == 3) %>% 
  t.test(data = ., acc_mean ~ genderF)

# the p-value is higher; the df value is fractional! brace yourself!

#BONUS: Another (shorter) way of changing the reference groups/dummy coding

#Let's return to our original code where we can check the dummy coding. 
contrasts(df$genderF) #Again Female = 0 and Male = 1

#What if we wanted Males to be the reference group instead? Instead of creating 
#a new column, we can just use the combine function to update the values
contrasts(df$genderF) <- c(1,0)

#Let's check to see what we changed
contrasts(df$genderF)

#Now let's see how changing the reference group affects our coefficients. First,
#we'll have to create a new model with our updated dummy coding
modelFactorM <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ genderF) 

#Let's check out the parameter values
coef(modelFactorM)
coef(modelFactor) #original model w/ Female as reference group

#what happens if we change the coding again
contrasts(df$genderF) <- c(-.5,.5)

contrasts(df$genderF)

model55_2 <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ genderF) 

coef(model55_2)
coef(model55) #original model with female = -0.5, male = 0.5

#Here you can see the we get the same results as before. These same steps are 
#repeated below for the female = -1, male = 1 coding

contrasts(df$genderF) <- c(-1,1)

contrasts(df$genderF)

model11_2 <- df %>% 
  filter(grade == 3) %>% 
  lm(data = ., acc_mean ~ genderF) 

coef(model11_2)
coef(model11)