attachment:contrast-coding2.R of HLPMiniCourseSession1

   1 library(foreign)
   2 
   3 ## download the kidiq dataset and read it into an R dataframe
   4 kidiq_url <- "http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta"
   5 kidiq_file <- "kidiq.dta"
   6 download.file(url = kidiq_url, destfile = kidiq_file, mode="w")
   7 
   8 iq <- read.dta(kidiq_file)
   9 
  10 ###### Using factors for categorical predictors
  11 ## Coding categorical predictors as factors can be convenient because
  12 ## a) it changes the summmary and print behavior for that predictor,
  13 ## b) factors are often handled differently than continuous predictors
  14 ## in plotting, and c) R will keep you from treating categorical
  15 ## predictors as continous ones (unless you explicitly recode a factor
  16 ## as numeric)
  17 iq.f <- iq
  18 
  19 ## if we want informative names for the levels of the predictor, we
  20 ## specify which levels are observed in the data, and how we want to
  21 ## label them.  The default is to use the levels themselves as the
  22 ## labels.
  23 iq.f$mom_hs <- factor(iq$mom_hs, levels=c(0,1), labels=c("no", "yes"))
  24 
  25 ## if we don't need special labels, as.factor is a quick conversion
  26 ## method.
  27 iq.f$mom_work <- as.factor(iq$mom_work)
  28 
  29 ## ordered() and as.ordered() are just variations on factor() and
  30 ## as.factor(), with the argument ordered=TRUE.  ifelse() can be very
  31 ## useful for recoding a variable, but it can get to be hard to read
  32 ## if you nest too many of them.  In that case, use either a full set
  33 ## of if/else if/else, or use switch().
  34 iq.f$mom_age <- ordered( ifelse( iq$mom_age <= 21, 0,
  35                                 ifelse( iq$mom_age <= 25, 1, 2)),
  36                         levels = c(0,1,2),
  37                         labels = c("early", "mid", "late"))
  38 
  39 ###### Dummy coding (treatment contrasts) for binary predictors
  40 ## test the hypothesis that there is a difference between a mom going
  41 ## to high school and not
  42 iq.lm <- lm(kid_score ~ mom_hs, data = iq)
  43 library(arm)
  44 display(iq.lm)
  45 
  46 ## same test, coding mom_hs as a factor
  47 iq.f.lm <- update(iq.lm, data = iq.f, evaluate=TRUE)
  48 display(iq.f.lm)
  49 
  50 ## why are these the same?  R treats the levels of the factor as 0 and
  51 ## 1, the same way they were coded in the original data (dummy or
  52 ## treatment coding)
  53 contrasts(iq.f$mom_hs)
  54 
  55 ## this is controlled by the option called "contrasts".  The first
  56 ## value is which contrast matrix to use for unordered factors, the
  57 ## second is which contrast matrix to use for ordered factors.
  58 getOption("contrasts")                  
  59 
  60 ## the default setting for the contrasts option can be explicitly set
  61 ## like this
  62 options(contrasts=c("contr.treatment", "contr.poly"))
  63 
  64 ###### Contast coding (sum-to-zero contrasts) for binary predictors
  65 contrasts(iq.f$mom_hs) <- contr.sum(2)
  66 contrasts(iq.f$mom_hs)
  67 ## by default, contr.sum assigns -1 to the level that comes last
  68 ## alphabetically.  change this by hand
  69 iq.f$mom_hs <- C(iq.f$mom_hs, c(-1, 1)) # instead of c(1, -1)
  70 contrasts(iq.f$mom_hs)
  71 ## or change the internal order of the factor
  72 iq.f$mom_hs <- factor(iq$mom_hs,
  73                       levels = c(1, 0),
  74                       labels = c("yes", "no"))
  75 contrasts(iq.f$mom_hs) <- contr.sum(2)
  76 contrasts(iq.f$mom_hs)
  77 ## test the hypothesis that a kid whose mom went to high school scores
  78 ## higher than "the average kid".  This assumes balanced data, which
  79 ## we don't have.
  80 iq.f.lm <- update(iq.f.lm)
  81 display(iq.f.lm)
  82 
  83 ## for balanced data, the intercept should be the grand mean
  84 iq.f.bal <- rbind(iq.f[iq.f$mom_hs == "no", ][1:90,],
  85                   iq.f[iq.f$mom_hs == "yes", ][1:90,])
  86 iq.f.bal$mom_hs <- C(iq.f.bal$mom_hs, contr.sum)
  87 mean(iq.f.bal$kid_score)
  88 iq.f.bal.lm <- update(iq.f.lm, data = iq.f.bal)
  89 
  90 display(iq.f.bal.lm)
  91 
  92 ## if we flip the contrasts for balanced data, we flip the sign of the
  93 ## coefficient
  94 iq.f.bal$mom_hs <- C(iq.f.bal$mom_hs, c(-1,1))
  95 iq.f.bal.lm <- update(iq.f.bal.lm)
  96 
  97 display(iq.f.bal.lm)
  98 
  99 ###### Centered binary predictors
 100 ## For balanced data , contrast coding (sum-to-zero coding) tests the
 101 ## hypothesis that a given level of a factor is different than the
 102 ## mean value of the factor.  How do we test this same hypothesis for
 103 ## unbalanced data?  Center our binary predictors.
 104 
 105 ## by hand, with numeric coding
 106 iq$mom_hs <- (iq$mom_hs - min(iq$mom_hs)) /
 107   (max(iq$mom_hs) - min(iq$mom_hs))     # turns everything into 0 or 1
 108 iq$mom_hs <- iq$mom_hs - mean(iq$mom_hs) # subtract the mean to center
 109 
 110 unique(iq$mom_hs)                       # we still have two levels,
 111                                         # but they're centered now
 112 
 113 iq.lm <- update(iq.lm)                  # intercept should be the
 114                                         # grand mean
 115 display(iq.lm)
 116 
 117 ## to make this work with factors, convert them back to numeric coding
 118 iq.f$mom_hs <- as.factor( ifelse( iq.f$mom_hs == "yes", 1, 0))
 119 iq.f$mom_hs <- as.numeric(levels(iq.f$mom_hs))[iq.f$mom_hs]
 120 iq.f$mom_hs <- (iq.f$mom_hs - min(iq.f$mom_hs)) / (max(iq.f$mom_hs) - min(iq.f$mom_hs))
 121 iq.f$mom_hs <- iq.f$mom_hs - mean(iq.f$mom_hs) # same as
 122                                                # scale(iq.f$mom_hs,
 123                                                # center=TRUE,
 124                                                # scale=FALSE)
 125 unique(iq.f$mom_hs)
 126 iq.f.lm <- update(iq.f.lm)
 127 display(iq.f.lm)
 128 
 129 ## You can also use the rescale() method from the arm package to
 130 ## center a single binary predictor, or the standardize() method to
 131 ## center every binary predictor in a model.  Methods from arm tend
 132 ## not to work very well with categorical predictors stored as
 133 ## factors.  Hopefully this will change soon.
 134 
 135 ###### Categorical predictors with more than two levels
 136 ## Storing a categorical predictor as a factor with more than two
 137 ## levels leads to a series of comparisons between levels of the
 138 ## factor.  By default, R uses dummy coding (treatment contrasts),
 139 ## comparing each level to a reference level
 140 levels(iq.f$mom_work)
 141 contrasts(iq.f$mom_work)
 142 iq.f.lm <- update(iq.f.lm, kid_score ~ mom_work)
 143 display(iq.f.lm)
 144 ## right now, each level is compared to the level coded as 1.  We can
 145 ## change this
 146 iq.f$mom_work <- relevel(iq.f$mom_work, ref=4)
 147 contrasts(iq.f$mom_work)                # now everything is compared
 148                                         # to the level coded as 4
 149 iq.f.lm <- update(iq.f.lm)
 150 display(iq.f.lm)
 151 ## We can also use contrast coding, comparing each level to the mean
 152 ## across all levels
 153 contrasts(iq.f$mom_work) <- contr.sum(4)
 154 iq.f.lm <- update(iq.f.lm)
 155 display(iq.f.lm)
 156 
 157 ## if we want to test the second, third, and fourth levels instead, we
 158 ## need to reorder the levels of the factor
 159 levels(iq.f$mom_work)
 160 iq.f$mom_work <- factor(iq.f$mom_work, levels=c(4,3,2,1))
 161 contrasts(iq.f$mom_work) <- contr.sum(4)
 162 iq.f.lm <- update(iq.f.lm)
 163 display(iq.f.lm)
 164 contrasts(iq.f$mom_work)
 165 ## notice that the names used in the lm() results didn't change, even
 166 ## though we're doing different comparisons now.  This can be a pain
 167 ## and can make it harder to interpret the comparisons that are
 168 ## carried out.  Make sure that you always know which comparison
 169 ## corresponds to which label.  If you want to insert your own labels,
 170 ## try
 171 dimnames(contrasts(iq.f$mom_work))[[2]] <- c("4", "3", "2")
 172 iq.f.lm <- update(iq.f.lm)
 173 display(iq.f.lm)
 174 ## Using contrast coding assumes that all of the levels are balanced.
 175 ## If this is not true, we get the same problems as in the case of
 176 ## binary predictors (intercept is hard to interpret, works poorly in
 177 ## interactions, ...).  Ideally, we want to center each pairwise
 178 ## comparison.  This can be done by recoding the index variable into a
 179 ## set of indicator variables, and centering as shown above.  If
 180 ## anyone knows of a function that sets centered contrasts for a
 181 ## series of comparisons, please let me know.
 182 iq.f$mom_work1 <- ifelse(iq.f$mom_work == "1", 1, 0)
 183 iq.f$mom_work2 <- ifelse(iq.f$mom_work == "2", 1, 0)
 184 iq.f$mom_work3 <- ifelse(iq.f$mom_work == "3", 1, 0)
 185 iq.f$mom_work4 <- ifelse(iq.f$mom_work == "4", 1, 0)
 186 
 187 iq.f$mom_work1 <- scale(iq.f$mom_work1, scale=FALSE)
 188 iq.f$mom_work2 <- scale(iq.f$mom_work2, scale=FALSE)
 189 iq.f$mom_work3 <- scale(iq.f$mom_work3, scale=FALSE)
 190 iq.f$mom_work4 <- scale(iq.f$mom_work4, scale=FALSE)
 191 unique(iq.f$mom_work1)
 192 unique(iq.f$mom_work2)
 193 unique(iq.f$mom_work3)
 194 unique(iq.f$mom_work4)
 195 
 196 ## Remember that we can only test three of the four levels, because
 197 ## one df is use in estimating the mean across all levels.  Also,
 198 ## after centering all indicator variables, the intercept should
 199 ## correspond to the grand mean
 200 iq.f.lm <- update(iq.f.lm,
 201                   kid_score ~ mom_work1 + mom_work2 + mom_work3, data = iq.f)
 202 display(iq.f.lm)
 203 ## or
 204 ## update(iq.f.lm, kid_score ~ mom_work2 + mom_work3 + mom_work4, data = iq.f)
 205 
 206 ###### Ordered factors
 207 ## We can test the hypothesis that moving from one level of an ordered
 208 ## categorical predictor to the next level has an effect on the
 209 ## dependent variable.  This is different than using a continuous
 210 ## predictor, because the levels of the ordered factor aren't
 211 ## necessarily equidistant.
 212 
 213 ## linear predictor.  intercept corresponds to a mother with age 0.
 214 iq.lm <- update(iq.lm, kid_score ~ mom_age, data = iq) # marginal
 215                                                        # linear effect
 216 ## ordered categorical predictor
 217 iq.f.lm <- update(iq.f.lm, kid_score ~ mom_age, data = iq.f) # marginal
 218                                                              # linear
 219                                                              # effect,
 220                                                              # no
 221                                                              # quadratic
 222                                                              # effect
 223 display(iq.f.lm)
 224 ## by default, ordered predictors use polynomial contrasts
 225 contr.poly(3)
 226 ## The column names indicate the polynomial term that is being
 227 ## estimated: (L)inear, (Q)uadratic, (C)ubic, ^4, ^5, ...
 228 
 229 ## the other option that is sometimes used for ordered predictors is
 230 ## Helmert contrasts (contr.helmert).
 231 
 232 ###### Interactions and categorical predictors
 233 ## Categorical predictors should be centered before being entered into
 234 ## an interaction.  Otherwise, correlations between the levels of the
 235 ## factors change the results of the interaction.
 236 
 237 ## reset the data frame
 238 iq <- read.dta(kidiq_file)
 239 iq.f <- iq
 240 iq.f$mom_hs <- factor(iq$mom_hs,
 241                       levels = c(0, 1),
 242                       labels = c("no", "yes"))
 243 
 244 iq.f$mom_work <- factor(iq$mom_work,
 245                         levels = c(1, 2, 3, 4))
 246 
 247 iq.f$mom_age <- ordered( ifelse( iq$mom_age <= 21, 0,
 248                                 ifelse( iq$mom_age <= 25, 1, 2)),
 249                         levels = c(0,1,2),
 250                         labels = c("early", "mid", "late"))
 251 
 252 ## compare dummy coding 
 253 contrasts(iq.f$mom_hs)
 254 contrasts(iq.f$mom_work)
 255 iq.f.lm.int1 <- lm(kid_score ~ mom_hs + mom_work + mom_hs:mom_work, iq.f)
 256 display(iq.f.lm.int1)
 257 
 258 ## to contrast coding
 259 getOption("contrasts")
 260 options(contrasts=c("contr.sum", "contr.poly"))
 261 getOption("contrasts")
 262 
 263 contrasts(iq.f$mom_hs)
 264 contrasts(iq.f$mom_work)
 265 iq.f.lm.int2 <- update(iq.f.lm.int1)
 266 
 267 ## to centered predictors
 268 iq.f$mom_hs <- ifelse(iq.f$mom_hs == "yes", 1, 0)
 269 iq.f$mom_work1 <- ifelse(iq.f$mom_work == "1", 1, 0)
 270 iq.f$mom_work2 <- ifelse(iq.f$mom_work == "2", 1, 0)
 271 iq.f$mom_work3 <- ifelse(iq.f$mom_work == "3", 1, 0)
 272 iq.f$mom_work4 <- ifelse(iq.f$mom_work == "4", 1, 0)
 273 
 274 iq.f$mom_hs <- scale(iq.f$mom_hs, scale=FALSE)
 275 iq.f$mom_work1 <- scale(iq.f$mom_work1, scale=FALSE)
 276 iq.f$mom_work2 <- scale(iq.f$mom_work2, scale=FALSE)
 277 iq.f$mom_work3 <- scale(iq.f$mom_work3, scale=FALSE)
 278 iq.f$mom_work4 <- scale(iq.f$mom_work4, scale=FALSE)
 279 
 280 iq.f.lm.int3 <- lm( kid_score ~ mom_hs * (mom_work1 + mom_work2 + mom_work3), iq.f)
 281 display(iq.f.lm.int3)
Attachment 'contrast-coding2.R'

Attached Files