# Drill #2: single-categorical predictor models # thanks to Austin Eubanks for some of this code library(tidyverse) library(lmSupport) df <- read_csv("https://whlevine.hosted.uark.edu/psyc5143/royer.csv") # id # gender (two levels: male and female) # grade (3 through 8) # addacc (accuracy on addition questions) # subacc (accuracy on subtraction questions) # multacc (accuracy on multiplication questions) # acc_mean (overall accuracy) # we'll be comparing boys and girls (3rd graders) on the overall accuracy here # simply to illustrate how to do it # we'll do this by giving the groups numeric values several ways df <- df %>% mutate(genderF = as.factor(gender), # R's default gender01 = ifelse(gender == "Female", 0, 1), # female = 0, male = 1 gender55 = ifelse(gender == "Female", -0.5, 0.5), # female = -0.5, male = 0.5 gender11 = ifelse(gender == "Female", -1, 1)) # female = -1, male = 1 # let's see what the new variables look like head(df) tail(df) # so far, so good; but we need to do more to examine the factor levels(df$genderF) # gives verbal labels contrasts(df$genderF) # okay; I RECOMMEND USING THIS FUNCTION EVERY SINGLE # TIME YOU HAVE A FACTOR VARIABLE - EVERY SINGLE TIME!! # ASSUME NOTHING!! CHECK ALWAYS!! VERIFY VERIFY VERIFY!! # some descriptives df %>% filter(grade == 3) %>% group_by(genderF) %>% summarise(M_acc = mean(acc_mean), SD_acc = sd(acc_mean), n = length(acc_mean)) # useful little table! df %>% filter(grade == 3) %>% summarise(overall_M_acc = mean(acc_mean)) # 3rd-grade males (87.6%) score higher than 3rd-grade females (83.8%) by 3.8 # percentage points; the overall mean is 85.6% # examining data plot(df$genderF, df$acc_mean) # base R boxplots # jittered "scatterplot" df %>% filter(grade == 3) %>% ggplot(aes(x = genderF, y = acc_mean)) + geom_jitter(width = .1) + theme_bw() # models modelFactor <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ genderF) model01 <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ gender01) model55 <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ gender55) model11 <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ gender11) # let's just look at parameters first (significance is less important) coef(modelFactor) coef(model01) coef(model55) coef(model11) # all but the the last model reproduce the (approximately) 3.8 percentage point # difference between groups in their slopes # two of the models have the female mean (83.8%) as their intercept # two of the models have the overall mean (85.6%) as their intercept # the -1, +1 model has half the 3.8 percentage point difference for its slope # let's look at p-values for the gender slope summary(modelFactor) summary(model01) summary(model55) summary(model11) # same p-value in every case! # we could do the male vs female comparison as a t-test df %>% filter(grade == 3) %>% t.test(data = ., acc_mean ~ genderF, var.equal = T) # what's up with the var.equal = T thing? the t.test function defaults to what # is usually called Welch's t-test, which does NOT assume that the equal # variance assumption is met; running the t-test function without fixes the # problem by adjusting df downward (to reduce the chance of a Type I error) df %>% filter(grade == 3) %>% t.test(data = ., acc_mean ~ genderF) # the p-value is higher; the df value is fractional! brace yourself! #BONUS: Another (shorter) way of changing the reference groups/dummy coding #Let's return to our original code where we can check the dummy coding. contrasts(df$genderF) #Again Female = 0 and Male = 1 #What if we wanted Males to be the reference group instead? Instead of creating #a new column, we can just use the combine function to update the values contrasts(df$genderF) <- c(1,0) #Let's check to see what we changed contrasts(df$genderF) #Now let's see how changing the reference group affects our coefficients. First, #we'll have to create a new model with our updated dummy coding modelFactorM <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ genderF) #Let's check out the parameter values coef(modelFactorM) coef(modelFactor) #original model w/ Female as reference group #what happens if we change the coding again contrasts(df$genderF) <- c(-.5,.5) contrasts(df$genderF) model55_2 <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ genderF) coef(model55_2) coef(model55) #original model with female = -0.5, male = 0.5 #Here you can see the we get the same results as before. These same steps are #repeated below for the female = -1, male = 1 coding contrasts(df$genderF) <- c(-1,1) contrasts(df$genderF) model11_2 <- df %>% filter(grade == 3) %>% lm(data = ., acc_mean ~ genderF) coef(model11_2) coef(model11)