attachment:contrast-coding.R of HLPMiniCourseSession1

   1 library(foreign)
   2 
   3 ## download the kidiq dataset and read it into an R dataframe
   4 kidiq_url <- "http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta"
   5 kidiq_file <- "kidiq.dta"
   6 download.file(url = kidiq_url, destfile = kidiq_file, mode="w")
   7 
   8 iq <- read.dta(kidiq_file)
   9 
  10 ###### Using factors for categorical predictors
  11 ## Coding categorical predictors as factors can be convenient because
  12 ## a) it changes the summmary and print behavior for that predictor,
  13 ## b) factors are often handled differently than continuous predictors
  14 ## in plotting, and c) R will keep you from treating categorical
  15 ## predictors as continous ones (unless you explicitly recode a factor
  16 ## as numeric)
  17 iq.f <- iq
  18 
  19 ## if we want informative names for the levels of the predictor, we
  20 ## specify which levels are observed in the data, and how we want to
  21 ## label them.  The default is to use the levels themselves as the
  22 ## labels.
  23 iq.f$mom.hs <- factor(iq$mom.hs,
  24                       levels = c(0, 1),
  25                       labels = c("no", "yes"))
  26 
  27 ## if we don't need special labels, as.factor is a quick conversion
  28 ## method.
  29 iq.f$mom.work <- as.factor(iq$mom.work)
  30 
  31 ## ordered() and as.ordered() are just variations on factor() and
  32 ## as.factor(), with the argument ordered=TRUE.  ifelse() can be very
  33 ## useful for recoding a variable, but it can get to be hard to read
  34 ## if you nest too many of them.  In that case, use either a full set
  35 ## of if/else if/else, or use switch().
  36 iq.f$mom.age <- ordered( ifelse( iq$mom.age <= 21, 0,
  37                                 ifelse( iq$mom.age <= 25, 1, 2)),
  38                         levels = c(0,1,2),
  39                         labels = c("early", "mid", "late"))
  40 
  41 ###### Dummy coding (treatment contrasts) for binary predictors
  42 ## test the hypothesis that there is a difference between a mom going
  43 ## to high school and not
  44 iq.lm <- lm(kid.score ~ mom.hs, data = iq)
  45 library(arm)
  46 display(iq.lm)
  47 
  48 ## same test, coding mom.hs as a factor
  49 iq.f.lm <- update(iq.lm, data = iq.f, evaluate=TRUE)
  50 display(iq.f.lm)
  51 
  52 ## why are these the same?  R treats the levels of the factor as 0 and
  53 ## 1, the same way they were coded in the original data (dummy or
  54 ## treatment coding)
  55 contrasts(iq.f$mom.hs)
  56 
  57 ## this is controlled by the option called "contrasts".  The first
  58 ## value is which contrast matrix to use for unordered factors, the
  59 ## second is which contrast matrix to use for ordered factors.
  60 getOption("contrasts")                  
  61 
  62 ## the default setting for the contrasts option can be explicitly set
  63 ## like this
  64 options(contrasts=c("contr.treatment", "contr.poly"))
  65 
  66 ###### Contast coding (sum-to-zero contrasts) for binary predictors
  67 contrasts(iq.f$mom.hs) <- contr.sum(2)
  68 contrasts(iq.f$mom.hs)
  69 ## by default, contr.sum assigns -1 to the level that comes last
  70 ## alphabetically.  change this by hand
  71 iq.f$mom.hs <- C(iq.f$mom.hs, c(-1, 1)) # instead of c(1, -1)
  72 contrasts(iq.f$mom.hs)
  73 ## or change the internal order of the factor
  74 iq.f$mom.hs <- factor(iq$mom.hs,
  75                       levels = c(1, 0),
  76                       labels = c("yes", "no"))
  77 contrasts(iq.f$mom.hs) <- contr.sum(2)
  78 contrasts(iq.f$mom.hs)
  79 ## test the hypothesis that a kid whose mom went to high school scores
  80 ## higher than "the average kid".  This assumes balanced data, which
  81 ## we don't have.
  82 iq.f.lm <- update(iq.f.lm)
  83 display(iq.f.lm)
  84 
  85 ## for balanced data, the intercept should be the grand mean
  86 iq.f.bal <- rbind(iq.f[iq.f$mom.hs == "no", ][1:90,],
  87                   iq.f[iq.f$mom.hs == "yes", ][1:90,])
  88 iq.f.bal$mom.hs <- C(iq.f.bal$mom.hs, contr.sum)
  89 mean(iq.f.bal$kid.score)
  90 iq.f.bal.lm <- update(iq.f.lm, data = iq.f.bal)
  91 
  92 display(iq.f.bal.lm)
  93 
  94 ## if we flip the contrasts for balanced data, we flip the sign of the
  95 ## coefficient
  96 iq.f.bal$mom.hs <- C(iq.f.bal$mom.hs, c(-1,1))
  97 iq.f.bal.lm <- update(iq.f.bal.lm)
  98 
  99 display(iq.f.bal.lm)
 100 
 101 ###### Centered binary predictors
 102 ## For balanced data , contrast coding (sum-to-zero coding) tests the
 103 ## hypothesis that a given level of a factor is different than the
 104 ## mean value of the factor.  How do we test this same hypothesis for
 105 ## unbalanced data?  Center our binary predictors.
 106 
 107 ## by hand, with numeric coding
 108 iq$mom.hs <- (iq$mom.hs - min(iq$mom.hs)) /
 109   (max(iq$mom.hs) - min(iq$mom.hs))     # turns everything into 0 or 1
 110 iq$mom.hs <- iq$mom.hs - mean(iq$mom.hs) # subtract the mean to center
 111 
 112 unique(iq$mom.hs)                       # we still have two levels,
 113                                         # but they're centered now
 114 
 115 iq.lm <- update(iq.lm)                  # intercept should be the
 116                                         # grand mean
 117 display(iq.lm)
 118 
 119 ## to make this work with factors, convert them back to numeric coding
 120 iq.f$mom.hs <- as.factor( ifelse( iq.f$mom.hs == "yes", 1, 0))
 121 iq.f$mom.hs <- as.numeric(levels(iq.f$mom.hs))[iq.f$mom.hs]
 122 iq.f$mom.hs <- (iq.f$mom.hs - min(iq.f$mom.hs)) / (max(iq.f$mom.hs) - min(iq.f$mom.hs))
 123 iq.f$mom.hs <- iq.f$mom.hs - mean(iq.f$mom.hs) # same as
 124                                                # scale(iq.f$mom.hs,
 125                                                # center=TRUE,
 126                                                # scale=FALSE)
 127 unique(iq.f$mom.hs)
 128 iq.f.lm <- update(iq.f.lm)
 129 display(iq.f.lm)
 130 
 131 ## You can also use the rescale() method from the arm package to
 132 ## center a single binary predictor, or the standardize() method to
 133 ## center every binary predictor in a model.  Methods from arm tend
 134 ## not to work very well with categorical predictors stored as
 135 ## factors.  Hopefully this will change soon.
 136 
 137 ###### Categorical predictors with more than two levels
 138 ## Storing a categorical predictor as a factor with more than two
 139 ## levels leads to a series of comparisons between levels of the
 140 ## factor.  By default, R uses dummy coding (treatment contrasts),
 141 ## comparing each level to a reference level
 142 levels(iq.f$mom.work)
 143 contrasts(iq.f$mom.work)
 144 iq.f.lm <- update(iq.f.lm, kid.score ~ mom.work)
 145 display(iq.f.lm)
 146 ## right now, each level is compared to the level coded as 1.  We can
 147 ## change this
 148 iq.f$mom.work <- relevel(iq.f$mom.work, ref=4)
 149 contrasts(iq.f$mom.work)                # now everything is compared
 150                                         # to the level coded as 4
 151 iq.f.lm <- update(iq.f.lm)
 152 display(iq.f.lm)
 153 ## We can also use contrast coding, comparing each level to the mean
 154 ## across all levels
 155 contrasts(iq.f$mom.work) <- contr.sum(4)
 156 iq.f.lm <- update(iq.f.lm)
 157 display(iq.f.lm)
 158 
 159 ## if we want to test the second, third, and fourth levels instead, we
 160 ## need to reorder the levels of the factor
 161 levels(iq.f$mom.work)
 162 iq.f$mom.work <- factor(iq.f$mom.work, levels=c(4,3,2,1))
 163 contrasts(iq.f$mom.work) <- contr.sum(4)
 164 iq.f.lm <- update(iq.f.lm)
 165 display(iq.f.lm)
 166 contrasts(iq.f$mom.work)
 167 ## notice that the names used in the lm() results didn't change, even
 168 ## though we're doing different comparisons now.  This can be a pain
 169 ## and can make it harder to interpret the comparisons that are
 170 ## carried out.  Make sure that you always know which comparison
 171 ## corresponds to which label.  If you want to insert your own labels,
 172 ## try
 173 dimnames(contrasts(iq.f$mom.work))[[2]] <- c("4", "3", "2")
 174 iq.f.lm <- update(iq.f.lm)
 175 display(iq.f.lm)
 176 ## Using contrast coding assumes that all of the levels are balanced.
 177 ## If this is not true, we get the same problems as in the case of
 178 ## binary predictors (intercept is hard to interpret, works poorly in
 179 ## interactions, ...).  Ideally, we want to center each pairwise
 180 ## comparison.  This can be done by recoding the index variable into a
 181 ## set of indicator variables, and centering as shown above.  If
 182 ## anyone knows of a function that sets centered contrasts for a
 183 ## series of comparisons, please let me know.
 184 iq.f$mom.work1 <- ifelse(iq.f$mom.work == "1", 1, 0)
 185 iq.f$mom.work2 <- ifelse(iq.f$mom.work == "2", 1, 0)
 186 iq.f$mom.work3 <- ifelse(iq.f$mom.work == "3", 1, 0)
 187 iq.f$mom.work4 <- ifelse(iq.f$mom.work == "4", 1, 0)
 188 
 189 iq.f$mom.work1 <- scale(iq.f$mom.work1, scale=FALSE)
 190 iq.f$mom.work2 <- scale(iq.f$mom.work2, scale=FALSE)
 191 iq.f$mom.work3 <- scale(iq.f$mom.work3, scale=FALSE)
 192 iq.f$mom.work4 <- scale(iq.f$mom.work4, scale=FALSE)
 193 unique(iq.f$mom.work1)
 194 unique(iq.f$mom.work2)
 195 unique(iq.f$mom.work3)
 196 unique(iq.f$mom.work4)
 197 
 198 ## Remember that we can only test three of the four levels, because
 199 ## one df is use in estimating the mean across all levels.  Also,
 200 ## after centering all indicator variables, the intercept should
 201 ## correspond to the grand mean
 202 iq.f.lm <- update(iq.f.lm,
 203                   kid.score ~ mom.work1 + mom.work2 + mom.work3, data = iq.f)
 204 display(iq.f.lm)
 205 ## or
 206 ## update(iq.f.lm, kid.score ~ mom.work2 + mom.work3 + mom.work4, data = iq.f)
 207 
 208 ###### Ordered factors
 209 ## We can test the hypothesis that moving from one level of an ordered
 210 ## categorical predictor to the next level has an effect on the
 211 ## dependent variable.  This is different than using a continuous
 212 ## predictor, because the levels of the ordered factor aren't
 213 ## necessarily equidistant.
 214 
 215 ## linear predictor.  intercept corresponds to a mother with age 0.
 216 iq.lm <- update(iq.lm, kid.score ~ mom.age, data = iq) # marginal
 217                                                        # linear effect
 218 ## ordered categorical predictor
 219 iq.f.lm <- update(iq.f.lm, kid.score ~ mom.age, data = iq.f) # marginal
 220                                                              # linear
 221                                                              # effect,
 222                                                              # no
 223                                                              # quadratic
 224                                                              # effect
 225 display(iq.f.lm)
 226 ## by default, ordered predictors use polynomial contrasts
 227 contr.poly(3)
 228 ## The column names indicate the polynomial term that is being
 229 ## estimated: (L)inear, (Q)uadratic, (C)ubic, ^4, ^5, ...
 230 
 231 ## the other option that is sometimes used for ordered predictors is
 232 ## Helmert contrasts (contr.helmert).
 233 
 234 ###### Interactions and categorical predictors
 235 ## Categorical predictors should be centered before being entered into
 236 ## an interaction.  Otherwise, correlations between the levels of the
 237 ## factors change the results of the interaction.
 238 
 239 ## reset the data frame
 240 iq <- read.dta(kidiq_file)
 241 iq.f <- iq
 242 iq.f$mom.hs <- factor(iq$mom.hs,
 243                       levels = c(0, 1),
 244                       labels = c("no", "yes"))
 245 
 246 iq.f$mom.work <- factor(iq$mom.work,
 247                         levels = c(1, 2, 3, 4))
 248 
 249 iq.f$mom.age <- ordered( ifelse( iq$mom.age <= 21, 0,
 250                                 ifelse( iq$mom.age <= 25, 1, 2)),
 251                         levels = c(0,1,2),
 252                         labels = c("early", "mid", "late"))
 253 
 254 ## compare dummy coding 
 255 contrasts(iq.f$mom.hs)
 256 contrasts(iq.f$mom.work)
 257 iq.f.lm.int1 <- lm(kid.score ~ mom.hs + mom.work + mom.hs:mom.work, iq.f)
 258 display(iq.f.lm.int1)
 259 
 260 ## to contrast coding
 261 getOption("contrasts")
 262 options(contrasts=c("contr.sum", "contr.poly"))
 263 getOption("contrasts")
 264 
 265 contrasts(iq.f$mom.hs)
 266 contrasts(iq.f$mom.work)
 267 iq.f.lm.int2 <- update(iq.f.lm.int1)
 268 
 269 ## to centered predictors
 270 iq.f$mom.hs <- ifelse(iq.f$mom.hs == "yes", 1, 0)
 271 iq.f$mom.work1 <- ifelse(iq.f$mom.work == "1", 1, 0)
 272 iq.f$mom.work2 <- ifelse(iq.f$mom.work == "2", 1, 0)
 273 iq.f$mom.work3 <- ifelse(iq.f$mom.work == "3", 1, 0)
 274 iq.f$mom.work4 <- ifelse(iq.f$mom.work == "4", 1, 0)
 275 
 276 iq.f$mom.hs <- scale(iq.f$mom.hs, scale=FALSE)
 277 iq.f$mom.work1 <- scale(iq.f$mom.work1, scale=FALSE)
 278 iq.f$mom.work2 <- scale(iq.f$mom.work2, scale=FALSE)
 279 iq.f$mom.work3 <- scale(iq.f$mom.work3, scale=FALSE)
 280 iq.f$mom.work4 <- scale(iq.f$mom.work4, scale=FALSE)
 281 
 282 iq.f.lm.int3 <- lm( kid.score ~ mom.hs * (mom.work1 + mom.work2 + mom.work3), iq.f)
 283 display(iq.f.lm.int3)
Attachment 'contrast-coding.R'

Attached Files