library(foreign)

## download the kidiq dataset and read it into an R dataframe
kidiq_url <- "http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta"
kidiq_file <- "kidiq.dta"
download.file(url = kidiq_url, destfile = kidiq_file, mode="w")

iq <- read.dta(kidiq_file)

###### Using factors for categorical predictors
## Coding categorical predictors as factors can be convenient because
## a) it changes the summmary and print behavior for that predictor,
## b) factors are often handled differently than continuous predictors
## in plotting, and c) R will keep you from treating categorical
## predictors as continous ones (unless you explicitly recode a factor
## as numeric)
iq.f <- iq

## if we want informative names for the levels of the predictor, we
## specify which levels are observed in the data, and how we want to
## label them.  The default is to use the levels themselves as the
## labels.
iq.f$mom_hs <- factor(iq$mom_hs, levels=c(0,1), labels=c("no", "yes"))

## if we don't need special labels, as.factor is a quick conversion
## method.
iq.f$mom_work <- as.factor(iq$mom_work)

## ordered() and as.ordered() are just variations on factor() and
## as.factor(), with the argument ordered=TRUE.  ifelse() can be very
## useful for recoding a variable, but it can get to be hard to read
## if you nest too many of them.  In that case, use either a full set
## of if/else if/else, or use switch().
iq.f$mom_age <- ordered( ifelse( iq$mom_age <= 21, 0,
                                ifelse( iq$mom_age <= 25, 1, 2)),
                        levels = c(0,1,2),
                        labels = c("early", "mid", "late"))

###### Dummy coding (treatment contrasts) for binary predictors
## test the hypothesis that there is a difference between a mom going
## to high school and not
iq.lm <- lm(kid_score ~ mom_hs, data = iq)
library(arm)
display(iq.lm)

## same test, coding mom_hs as a factor
iq.f.lm <- update(iq.lm, data = iq.f, evaluate=TRUE)
display(iq.f.lm)

## why are these the same?  R treats the levels of the factor as 0 and
## 1, the same way they were coded in the original data (dummy or
## treatment coding)
contrasts(iq.f$mom_hs)

## this is controlled by the option called "contrasts".  The first
## value is which contrast matrix to use for unordered factors, the
## second is which contrast matrix to use for ordered factors.
getOption("contrasts")                  

## the default setting for the contrasts option can be explicitly set
## like this
options(contrasts=c("contr.treatment", "contr.poly"))

###### Contast coding (sum-to-zero contrasts) for binary predictors
contrasts(iq.f$mom_hs) <- contr.sum(2)
contrasts(iq.f$mom_hs)
## by default, contr.sum assigns -1 to the level that comes last
## alphabetically.  change this by hand
iq.f$mom_hs <- C(iq.f$mom_hs, c(-1, 1)) # instead of c(1, -1)
contrasts(iq.f$mom_hs)
## or change the internal order of the factor
iq.f$mom_hs <- factor(iq$mom_hs,
                      levels = c(1, 0),
                      labels = c("yes", "no"))
contrasts(iq.f$mom_hs) <- contr.sum(2)
contrasts(iq.f$mom_hs)
## test the hypothesis that a kid whose mom went to high school scores
## higher than "the average kid".  This assumes balanced data, which
## we don't have.
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## for balanced data, the intercept should be the grand mean
iq.f.bal <- rbind(iq.f[iq.f$mom_hs == "no", ][1:90,],
                  iq.f[iq.f$mom_hs == "yes", ][1:90,])
iq.f.bal$mom_hs <- C(iq.f.bal$mom_hs, contr.sum)
mean(iq.f.bal$kid_score)
iq.f.bal.lm <- update(iq.f.lm, data = iq.f.bal)

display(iq.f.bal.lm)

## if we flip the contrasts for balanced data, we flip the sign of the
## coefficient
iq.f.bal$mom_hs <- C(iq.f.bal$mom_hs, c(-1,1))
iq.f.bal.lm <- update(iq.f.bal.lm)

display(iq.f.bal.lm)

###### Centered binary predictors
## For balanced data , contrast coding (sum-to-zero coding) tests the
## hypothesis that a given level of a factor is different than the
## mean value of the factor.  How do we test this same hypothesis for
## unbalanced data?  Center our binary predictors.

## by hand, with numeric coding
iq$mom_hs <- (iq$mom_hs - min(iq$mom_hs)) /
  (max(iq$mom_hs) - min(iq$mom_hs))     # turns everything into 0 or 1
iq$mom_hs <- iq$mom_hs - mean(iq$mom_hs) # subtract the mean to center

unique(iq$mom_hs)                       # we still have two levels,
                                        # but they're centered now

iq.lm <- update(iq.lm)                  # intercept should be the
                                        # grand mean
display(iq.lm)

## to make this work with factors, convert them back to numeric coding
iq.f$mom_hs <- as.factor( ifelse( iq.f$mom_hs == "yes", 1, 0))
iq.f$mom_hs <- as.numeric(levels(iq.f$mom_hs))[iq.f$mom_hs]
iq.f$mom_hs <- (iq.f$mom_hs - min(iq.f$mom_hs)) / (max(iq.f$mom_hs) - min(iq.f$mom_hs))
iq.f$mom_hs <- iq.f$mom_hs - mean(iq.f$mom_hs) # same as
                                               # scale(iq.f$mom_hs,
                                               # center=TRUE,
                                               # scale=FALSE)
unique(iq.f$mom_hs)
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## You can also use the rescale() method from the arm package to
## center a single binary predictor, or the standardize() method to
## center every binary predictor in a model.  Methods from arm tend
## not to work very well with categorical predictors stored as
## factors.  Hopefully this will change soon.

###### Categorical predictors with more than two levels
## Storing a categorical predictor as a factor with more than two
## levels leads to a series of comparisons between levels of the
## factor.  By default, R uses dummy coding (treatment contrasts),
## comparing each level to a reference level
levels(iq.f$mom_work)
contrasts(iq.f$mom_work)
iq.f.lm <- update(iq.f.lm, kid_score ~ mom_work)
display(iq.f.lm)
## right now, each level is compared to the level coded as 1.  We can
## change this
iq.f$mom_work <- relevel(iq.f$mom_work, ref=4)
contrasts(iq.f$mom_work)                # now everything is compared
                                        # to the level coded as 4
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)
## We can also use contrast coding, comparing each level to the mean
## across all levels
contrasts(iq.f$mom_work) <- contr.sum(4)
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## if we want to test the second, third, and fourth levels instead, we
## need to reorder the levels of the factor
levels(iq.f$mom_work)
iq.f$mom_work <- factor(iq.f$mom_work, levels=c(4,3,2,1))
contrasts(iq.f$mom_work) <- contr.sum(4)
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)
contrasts(iq.f$mom_work)
## notice that the names used in the lm() results didn't change, even
## though we're doing different comparisons now.  This can be a pain
## and can make it harder to interpret the comparisons that are
## carried out.  Make sure that you always know which comparison
## corresponds to which label.  If you want to insert your own labels,
## try
dimnames(contrasts(iq.f$mom_work))[[2]] <- c("4", "3", "2")
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)
## Using contrast coding assumes that all of the levels are balanced.
## If this is not true, we get the same problems as in the case of
## binary predictors (intercept is hard to interpret, works poorly in
## interactions, ...).  Ideally, we want to center each pairwise
## comparison.  This can be done by recoding the index variable into a
## set of indicator variables, and centering as shown above.  If
## anyone knows of a function that sets centered contrasts for a
## series of comparisons, please let me know.
iq.f$mom_work1 <- ifelse(iq.f$mom_work == "1", 1, 0)
iq.f$mom_work2 <- ifelse(iq.f$mom_work == "2", 1, 0)
iq.f$mom_work3 <- ifelse(iq.f$mom_work == "3", 1, 0)
iq.f$mom_work4 <- ifelse(iq.f$mom_work == "4", 1, 0)

iq.f$mom_work1 <- scale(iq.f$mom_work1, scale=FALSE)
iq.f$mom_work2 <- scale(iq.f$mom_work2, scale=FALSE)
iq.f$mom_work3 <- scale(iq.f$mom_work3, scale=FALSE)
iq.f$mom_work4 <- scale(iq.f$mom_work4, scale=FALSE)
unique(iq.f$mom_work1)
unique(iq.f$mom_work2)
unique(iq.f$mom_work3)
unique(iq.f$mom_work4)

## Remember that we can only test three of the four levels, because
## one df is use in estimating the mean across all levels.  Also,
## after centering all indicator variables, the intercept should
## correspond to the grand mean
iq.f.lm <- update(iq.f.lm,
                  kid_score ~ mom_work1 + mom_work2 + mom_work3, data = iq.f)
display(iq.f.lm)
## or
## update(iq.f.lm, kid_score ~ mom_work2 + mom_work3 + mom_work4, data = iq.f)

###### Ordered factors
## We can test the hypothesis that moving from one level of an ordered
## categorical predictor to the next level has an effect on the
## dependent variable.  This is different than using a continuous
## predictor, because the levels of the ordered factor aren't
## necessarily equidistant.

## linear predictor.  intercept corresponds to a mother with age 0.
iq.lm <- update(iq.lm, kid_score ~ mom_age, data = iq) # marginal
                                                       # linear effect
## ordered categorical predictor
iq.f.lm <- update(iq.f.lm, kid_score ~ mom_age, data = iq.f) # marginal
                                                             # linear
                                                             # effect,
                                                             # no
                                                             # quadratic
                                                             # effect
display(iq.f.lm)
## by default, ordered predictors use polynomial contrasts
contr.poly(3)
## The column names indicate the polynomial term that is being
## estimated: (L)inear, (Q)uadratic, (C)ubic, ^4, ^5, ...

## the other option that is sometimes used for ordered predictors is
## Helmert contrasts (contr.helmert).

###### Interactions and categorical predictors
## Categorical predictors should be centered before being entered into
## an interaction.  Otherwise, correlations between the levels of the
## factors change the results of the interaction.

## reset the data frame
iq <- read.dta(kidiq_file)
iq.f <- iq
iq.f$mom_hs <- factor(iq$mom_hs,
                      levels = c(0, 1),
                      labels = c("no", "yes"))

iq.f$mom_work <- factor(iq$mom_work,
                        levels = c(1, 2, 3, 4))

iq.f$mom_age <- ordered( ifelse( iq$mom_age <= 21, 0,
                                ifelse( iq$mom_age <= 25, 1, 2)),
                        levels = c(0,1,2),
                        labels = c("early", "mid", "late"))

## compare dummy coding 
contrasts(iq.f$mom_hs)
contrasts(iq.f$mom_work)
iq.f.lm.int1 <- lm(kid_score ~ mom_hs + mom_work + mom_hs:mom_work, iq.f)
display(iq.f.lm.int1)

## to contrast coding
getOption("contrasts")
options(contrasts=c("contr.sum", "contr.poly"))
getOption("contrasts")

contrasts(iq.f$mom_hs)
contrasts(iq.f$mom_work)
iq.f.lm.int2 <- update(iq.f.lm.int1)

## to centered predictors
iq.f$mom_hs <- ifelse(iq.f$mom_hs == "yes", 1, 0)
iq.f$mom_work1 <- ifelse(iq.f$mom_work == "1", 1, 0)
iq.f$mom_work2 <- ifelse(iq.f$mom_work == "2", 1, 0)
iq.f$mom_work3 <- ifelse(iq.f$mom_work == "3", 1, 0)
iq.f$mom_work4 <- ifelse(iq.f$mom_work == "4", 1, 0)

iq.f$mom_hs <- scale(iq.f$mom_hs, scale=FALSE)
iq.f$mom_work1 <- scale(iq.f$mom_work1, scale=FALSE)
iq.f$mom_work2 <- scale(iq.f$mom_work2, scale=FALSE)
iq.f$mom_work3 <- scale(iq.f$mom_work3, scale=FALSE)
iq.f$mom_work4 <- scale(iq.f$mom_work4, scale=FALSE)

iq.f.lm.int3 <- lm( kid_score ~ mom_hs * (mom_work1 + mom_work2 + mom_work3), iq.f)
display(iq.f.lm.int3)
