###############
# 
#  Preparation
# 
###############

library(tidyverse)
library(lme4)

# loading, checking, prepping data
read.csv("http://whlevine.hosted.uark.edu/psyc6343/twoclasses.csv") -> twoclasses
str(twoclasses)

# creating a factor version of classroom
twoclasses$classroom.f <- factor(twoclasses$classroom, 
                                 labels = c("class1", "class2"))

# how does R represent factors with dummy variables?
contrasts(twoclasses$classroom.f)

# dummy-coding classroom (strictly speaking, this isn't necessary)
twoclasses$classroom.d <- 0
twoclasses$classroom.d[twoclasses$classroom == 2] <- 1

#######################
# 
#  Section 1
#  examining the data
# 
#######################

ggplot(twoclasses,
       aes(x = pretest.c, 
           y = posttest, 
           color = classroom.f)) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  geom_point(size = 2) +
  geom_smooth(method = lm, se = F) +
  geom_vline(xintercept = 0)

###########################################################
#
#  comparing various analytic options with multilevel data
#
###########################################################

############################################
# 
#  Section 2
#  separate regressions for each classroom
# 
############################################

model1 <- lm(data = twoclasses[twoclasses$classroom == 1,], posttest ~ pretest.c)
coef(model1)
model2 <- lm(data = twoclasses[twoclasses$classroom == 2,], posttest ~ pretest.c)
coef(model2)

# intercepts are (essentially) DV means when predictors are mean-centered

mean(twoclasses$posttest[twoclasses$classroom == 1])  # same as intercept 1
mean(twoclasses$posttest[twoclasses$classroom == 2])  # same as intercept 2

#################################
# 
#  Section 3
#  collapsing across classrooms
# 
#################################

model3 <- lm(data = twoclasses, posttest ~ pretest.c)
coef(model3)

predicted3 <- c(fitted.values(model3))
residual3 <- c(resid(model3))
to.plot3 <- data.frame(predicted3, residual3, twoclasses$classroom.f)

# visualizing clustering (non-independence) of residuals
ggplot(to.plot3,
       aes(x = predicted3, 
           y = residual3, 
           color = twoclasses.classroom.f)) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  geom_point(size = 2) +
  geom_hline(yintercept = 0)

# quantifying non-independence of residuals
summary(to.plot3$residual3[to.plot3$twoclasses.classroom.f == "class1"])
summary(to.plot3$residual3[to.plot3$twoclasses.classroom.f == "class2"])

# how good are predictions?
summary(model3)$r.squared  # R-squared
summary(model3)$sigma      # SE of the estimate ("average" error of prediction)

######################################
# 
#  Section 4
#  modeling classrooms as a predictor
# 
######################################

model4 <- lm(data = twoclasses, posttest ~ pretest.c + classroom.d)
coef(model4)

  # other information that may be worth a look
  # summary(model4)
  # anova(model4)
  
  # a digression for you to play with on your own time
  # stuff you can pull out of lm()'s output (but not lme's!)
  # model4$coefficients   # betas
  # model4$residuals      # residuals
  # model4$fitted.values  # predicted scores
  # model4$call           # the regression equation
  # model4$model          # the data that were analyzed

predicted4 <- c(fitted.values(model4))
residual4 <- c(resid(model4))
to.plot4 <- data.frame(predicted4, residual4, twoclasses$classroom.f)

# visualizing residuals
ggplot(to.plot4,
       aes(x = predicted4, 
           y = residual4, 
           color = twoclasses.classroom.f)) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  geom_point(size = 2) +
  geom_hline(yintercept = 0)

# quantifying residuals
summary(to.plot4$residual4[to.plot4$twoclasses.classroom.f == "class1"])
summary(to.plot4$residual4[to.plot4$twoclasses.classroom.f == "class2"])

# how good are predictions?
summary(model4)$r.squared  # R-squared
summary(model4)$sigma      # SE of the estimate ("average" error of prediction)

######################################
# 
#  Section 5
#  multilevel modeling
# 
######################################

model5 <- lmer(data = twoclasses, posttest ~ (1 | classroom.d) + pretest.c)
summary(model5)

predicted5 <- c(fitted.values(model5))
residual5 <- c(resid(model5))
to.plot5 <- data.frame(predicted5, residual5, twoclasses$classroom.f)

# visualizing residuals
ggplot(to.plot5,
       aes(x = predicted5, 
           y = residual5, 
           color = twoclasses.classroom.f)) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  geom_point(size = 2) +
  geom_hline(yintercept = 0)

# quantifying residuals
summary(to.plot5$residual5[to.plot5$twoclasses.classroom.f == "class1"])
summary(to.plot5$residual5[to.plot5$twoclasses.classroom.f == "class2"])

# how good are predictions?

# summary(model5)$r.squared  # R-squared is not part of model summary for lmer()
# there are R-squared analogs for MLM (which we'll discuss in detail later)

summary(model5)$sigma      # SE of the estimate ("average" error of prediction)

######################################
#
# probably as far as we'll get today
#
######################################

######################################
# 
#  Preparation for scaling up
# 
######################################

# loading, checking, prepping data

read.csv("http://whlevine.hosted.uark.edu/psyc6343/thirtyclasses.csv") -> data
str(data)

# creating a class ID variable

  # the two parts of the class ID
  data$name <- "class"
  data$number <- as.character(data$classroom)
  
  # paste() the two parts together
  data$class.id <- paste(data$name, data$number)
  
  # coerce it to be a factor
  data$class.id <- as.factor(data$class.id)
  
  # clean-up
  data$name <- NULL
  data$number <- NULL

# other prep stuff
  
  # create an empty data frame for later use
  slope.int <- data.frame()
  
  # grand-mean center pretest scores
  data$pretest.c <- data$pretest - mean(data$pretest)

###############################################
# 
#  Section 6
#  scaling up
#  first, let's do 30 separate regressions!
# 
###############################################

# create a data frame of slopes and intercepts from 30 regressions, one for each
# class
for(x in 1:30)
{
  subset <- data[data$classroom == x,]
  temp <- lm(data = subset, posttest ~ pretest.c)
  slope <- summary(temp)$coefficients[2, 1]
  int <- summary(temp)$coefficients[1, 1]
  current.ab <- data.frame(int, slope, x)
  slope.int <- rbind(slope.int, current.ab)
}

# all the coefficients
slope.int
  
# examining the 30 slopes & intercepts

#install.packages("psych")
library("psych")

describe(slope.int$slope)
hist(slope.int$slope)
plot(density(slope.int$slope))

describe(slope.int$int)
hist(slope.int$int)
plot(density(slope.int$int))

###############################################
# 
#  Section 7
#  modeling classroom as a predictor
# 
###############################################

model6 <- lm(data = data, posttest ~ pretest.c + class.id)
coef(model6)

# crazy!

###############################################
# 
#  Section 8
#  MLM
# 
###############################################

model7 <- lmer(data = data, posttest ~ pretest.c + (1|class.id))

# fixed effects: intercept and any slopes
fixef(model7)
summary(model7)

    # digressions
    # random effects (Level 2 [classes] residuals)
    ranef(model7)
    # could put the random effects in a data frame if you want to play with them
    ranef(model7)$class.id -> randomEffects
    # Level 1 (students) residuals (all 822 of them)
    residuals(model7)
    # put them in a data frame for examination
    as.data.frame(residuals(model7)) -> subjectResiduals