Attachment 'contrast-coding2.R'
Download 1 library(foreign)
2
3 ## download the kidiq dataset and read it into an R dataframe
4 kidiq_url <- "http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta"
5 kidiq_file <- "kidiq.dta"
6 download.file(url = kidiq_url, destfile = kidiq_file, mode="w")
7
8 iq <- read.dta(kidiq_file)
9
10 ###### Using factors for categorical predictors
11 ## Coding categorical predictors as factors can be convenient because
12 ## a) it changes the summmary and print behavior for that predictor,
13 ## b) factors are often handled differently than continuous predictors
14 ## in plotting, and c) R will keep you from treating categorical
15 ## predictors as continous ones (unless you explicitly recode a factor
16 ## as numeric)
17 iq.f <- iq
18
19 ## if we want informative names for the levels of the predictor, we
20 ## specify which levels are observed in the data, and how we want to
21 ## label them. The default is to use the levels themselves as the
22 ## labels.
23 iq.f$mom_hs <- factor(iq$mom_hs, levels=c(0,1), labels=c("no", "yes"))
24
25 ## if we don't need special labels, as.factor is a quick conversion
26 ## method.
27 iq.f$mom_work <- as.factor(iq$mom_work)
28
29 ## ordered() and as.ordered() are just variations on factor() and
30 ## as.factor(), with the argument ordered=TRUE. ifelse() can be very
31 ## useful for recoding a variable, but it can get to be hard to read
32 ## if you nest too many of them. In that case, use either a full set
33 ## of if/else if/else, or use switch().
34 iq.f$mom_age <- ordered( ifelse( iq$mom_age <= 21, 0,
35 ifelse( iq$mom_age <= 25, 1, 2)),
36 levels = c(0,1,2),
37 labels = c("early", "mid", "late"))
38
39 ###### Dummy coding (treatment contrasts) for binary predictors
40 ## test the hypothesis that there is a difference between a mom going
41 ## to high school and not
42 iq.lm <- lm(kid_score ~ mom_hs, data = iq)
43 library(arm)
44 display(iq.lm)
45
46 ## same test, coding mom_hs as a factor
47 iq.f.lm <- update(iq.lm, data = iq.f, evaluate=TRUE)
48 display(iq.f.lm)
49
50 ## why are these the same? R treats the levels of the factor as 0 and
51 ## 1, the same way they were coded in the original data (dummy or
52 ## treatment coding)
53 contrasts(iq.f$mom_hs)
54
55 ## this is controlled by the option called "contrasts". The first
56 ## value is which contrast matrix to use for unordered factors, the
57 ## second is which contrast matrix to use for ordered factors.
58 getOption("contrasts")
59
60 ## the default setting for the contrasts option can be explicitly set
61 ## like this
62 options(contrasts=c("contr.treatment", "contr.poly"))
63
64 ###### Contast coding (sum-to-zero contrasts) for binary predictors
65 contrasts(iq.f$mom_hs) <- contr.sum(2)
66 contrasts(iq.f$mom_hs)
67 ## by default, contr.sum assigns -1 to the level that comes last
68 ## alphabetically. change this by hand
69 iq.f$mom_hs <- C(iq.f$mom_hs, c(-1, 1)) # instead of c(1, -1)
70 contrasts(iq.f$mom_hs)
71 ## or change the internal order of the factor
72 iq.f$mom_hs <- factor(iq$mom_hs,
73 levels = c(1, 0),
74 labels = c("yes", "no"))
75 contrasts(iq.f$mom_hs) <- contr.sum(2)
76 contrasts(iq.f$mom_hs)
77 ## test the hypothesis that a kid whose mom went to high school scores
78 ## higher than "the average kid". This assumes balanced data, which
79 ## we don't have.
80 iq.f.lm <- update(iq.f.lm)
81 display(iq.f.lm)
82
83 ## for balanced data, the intercept should be the grand mean
84 iq.f.bal <- rbind(iq.f[iq.f$mom_hs == "no", ][1:90,],
85 iq.f[iq.f$mom_hs == "yes", ][1:90,])
86 iq.f.bal$mom_hs <- C(iq.f.bal$mom_hs, contr.sum)
87 mean(iq.f.bal$kid_score)
88 iq.f.bal.lm <- update(iq.f.lm, data = iq.f.bal)
89
90 display(iq.f.bal.lm)
91
92 ## if we flip the contrasts for balanced data, we flip the sign of the
93 ## coefficient
94 iq.f.bal$mom_hs <- C(iq.f.bal$mom_hs, c(-1,1))
95 iq.f.bal.lm <- update(iq.f.bal.lm)
96
97 display(iq.f.bal.lm)
98
99 ###### Centered binary predictors
100 ## For balanced data , contrast coding (sum-to-zero coding) tests the
101 ## hypothesis that a given level of a factor is different than the
102 ## mean value of the factor. How do we test this same hypothesis for
103 ## unbalanced data? Center our binary predictors.
104
105 ## by hand, with numeric coding
106 iq$mom_hs <- (iq$mom_hs - min(iq$mom_hs)) /
107 (max(iq$mom_hs) - min(iq$mom_hs)) # turns everything into 0 or 1
108 iq$mom_hs <- iq$mom_hs - mean(iq$mom_hs) # subtract the mean to center
109
110 unique(iq$mom_hs) # we still have two levels,
111 # but they're centered now
112
113 iq.lm <- update(iq.lm) # intercept should be the
114 # grand mean
115 display(iq.lm)
116
117 ## to make this work with factors, convert them back to numeric coding
118 iq.f$mom_hs <- as.factor( ifelse( iq.f$mom_hs == "yes", 1, 0))
119 iq.f$mom_hs <- as.numeric(levels(iq.f$mom_hs))[iq.f$mom_hs]
120 iq.f$mom_hs <- (iq.f$mom_hs - min(iq.f$mom_hs)) / (max(iq.f$mom_hs) - min(iq.f$mom_hs))
121 iq.f$mom_hs <- iq.f$mom_hs - mean(iq.f$mom_hs) # same as
122 # scale(iq.f$mom_hs,
123 # center=TRUE,
124 # scale=FALSE)
125 unique(iq.f$mom_hs)
126 iq.f.lm <- update(iq.f.lm)
127 display(iq.f.lm)
128
129 ## You can also use the rescale() method from the arm package to
130 ## center a single binary predictor, or the standardize() method to
131 ## center every binary predictor in a model. Methods from arm tend
132 ## not to work very well with categorical predictors stored as
133 ## factors. Hopefully this will change soon.
134
135 ###### Categorical predictors with more than two levels
136 ## Storing a categorical predictor as a factor with more than two
137 ## levels leads to a series of comparisons between levels of the
138 ## factor. By default, R uses dummy coding (treatment contrasts),
139 ## comparing each level to a reference level
140 levels(iq.f$mom_work)
141 contrasts(iq.f$mom_work)
142 iq.f.lm <- update(iq.f.lm, kid_score ~ mom_work)
143 display(iq.f.lm)
144 ## right now, each level is compared to the level coded as 1. We can
145 ## change this
146 iq.f$mom_work <- relevel(iq.f$mom_work, ref=4)
147 contrasts(iq.f$mom_work) # now everything is compared
148 # to the level coded as 4
149 iq.f.lm <- update(iq.f.lm)
150 display(iq.f.lm)
151 ## We can also use contrast coding, comparing each level to the mean
152 ## across all levels
153 contrasts(iq.f$mom_work) <- contr.sum(4)
154 iq.f.lm <- update(iq.f.lm)
155 display(iq.f.lm)
156
157 ## if we want to test the second, third, and fourth levels instead, we
158 ## need to reorder the levels of the factor
159 levels(iq.f$mom_work)
160 iq.f$mom_work <- factor(iq.f$mom_work, levels=c(4,3,2,1))
161 contrasts(iq.f$mom_work) <- contr.sum(4)
162 iq.f.lm <- update(iq.f.lm)
163 display(iq.f.lm)
164 contrasts(iq.f$mom_work)
165 ## notice that the names used in the lm() results didn't change, even
166 ## though we're doing different comparisons now. This can be a pain
167 ## and can make it harder to interpret the comparisons that are
168 ## carried out. Make sure that you always know which comparison
169 ## corresponds to which label. If you want to insert your own labels,
170 ## try
171 dimnames(contrasts(iq.f$mom_work))[[2]] <- c("4", "3", "2")
172 iq.f.lm <- update(iq.f.lm)
173 display(iq.f.lm)
174 ## Using contrast coding assumes that all of the levels are balanced.
175 ## If this is not true, we get the same problems as in the case of
176 ## binary predictors (intercept is hard to interpret, works poorly in
177 ## interactions, ...). Ideally, we want to center each pairwise
178 ## comparison. This can be done by recoding the index variable into a
179 ## set of indicator variables, and centering as shown above. If
180 ## anyone knows of a function that sets centered contrasts for a
181 ## series of comparisons, please let me know.
182 iq.f$mom_work1 <- ifelse(iq.f$mom_work == "1", 1, 0)
183 iq.f$mom_work2 <- ifelse(iq.f$mom_work == "2", 1, 0)
184 iq.f$mom_work3 <- ifelse(iq.f$mom_work == "3", 1, 0)
185 iq.f$mom_work4 <- ifelse(iq.f$mom_work == "4", 1, 0)
186
187 iq.f$mom_work1 <- scale(iq.f$mom_work1, scale=FALSE)
188 iq.f$mom_work2 <- scale(iq.f$mom_work2, scale=FALSE)
189 iq.f$mom_work3 <- scale(iq.f$mom_work3, scale=FALSE)
190 iq.f$mom_work4 <- scale(iq.f$mom_work4, scale=FALSE)
191 unique(iq.f$mom_work1)
192 unique(iq.f$mom_work2)
193 unique(iq.f$mom_work3)
194 unique(iq.f$mom_work4)
195
196 ## Remember that we can only test three of the four levels, because
197 ## one df is use in estimating the mean across all levels. Also,
198 ## after centering all indicator variables, the intercept should
199 ## correspond to the grand mean
200 iq.f.lm <- update(iq.f.lm,
201 kid_score ~ mom_work1 + mom_work2 + mom_work3, data = iq.f)
202 display(iq.f.lm)
203 ## or
204 ## update(iq.f.lm, kid_score ~ mom_work2 + mom_work3 + mom_work4, data = iq.f)
205
206 ###### Ordered factors
207 ## We can test the hypothesis that moving from one level of an ordered
208 ## categorical predictor to the next level has an effect on the
209 ## dependent variable. This is different than using a continuous
210 ## predictor, because the levels of the ordered factor aren't
211 ## necessarily equidistant.
212
213 ## linear predictor. intercept corresponds to a mother with age 0.
214 iq.lm <- update(iq.lm, kid_score ~ mom_age, data = iq) # marginal
215 # linear effect
216 ## ordered categorical predictor
217 iq.f.lm <- update(iq.f.lm, kid_score ~ mom_age, data = iq.f) # marginal
218 # linear
219 # effect,
220 # no
221 # quadratic
222 # effect
223 display(iq.f.lm)
224 ## by default, ordered predictors use polynomial contrasts
225 contr.poly(3)
226 ## The column names indicate the polynomial term that is being
227 ## estimated: (L)inear, (Q)uadratic, (C)ubic, ^4, ^5, ...
228
229 ## the other option that is sometimes used for ordered predictors is
230 ## Helmert contrasts (contr.helmert).
231
232 ###### Interactions and categorical predictors
233 ## Categorical predictors should be centered before being entered into
234 ## an interaction. Otherwise, correlations between the levels of the
235 ## factors change the results of the interaction.
236
237 ## reset the data frame
238 iq <- read.dta(kidiq_file)
239 iq.f <- iq
240 iq.f$mom_hs <- factor(iq$mom_hs,
241 levels = c(0, 1),
242 labels = c("no", "yes"))
243
244 iq.f$mom_work <- factor(iq$mom_work,
245 levels = c(1, 2, 3, 4))
246
247 iq.f$mom_age <- ordered( ifelse( iq$mom_age <= 21, 0,
248 ifelse( iq$mom_age <= 25, 1, 2)),
249 levels = c(0,1,2),
250 labels = c("early", "mid", "late"))
251
252 ## compare dummy coding
253 contrasts(iq.f$mom_hs)
254 contrasts(iq.f$mom_work)
255 iq.f.lm.int1 <- lm(kid_score ~ mom_hs + mom_work + mom_hs:mom_work, iq.f)
256 display(iq.f.lm.int1)
257
258 ## to contrast coding
259 getOption("contrasts")
260 options(contrasts=c("contr.sum", "contr.poly"))
261 getOption("contrasts")
262
263 contrasts(iq.f$mom_hs)
264 contrasts(iq.f$mom_work)
265 iq.f.lm.int2 <- update(iq.f.lm.int1)
266
267 ## to centered predictors
268 iq.f$mom_hs <- ifelse(iq.f$mom_hs == "yes", 1, 0)
269 iq.f$mom_work1 <- ifelse(iq.f$mom_work == "1", 1, 0)
270 iq.f$mom_work2 <- ifelse(iq.f$mom_work == "2", 1, 0)
271 iq.f$mom_work3 <- ifelse(iq.f$mom_work == "3", 1, 0)
272 iq.f$mom_work4 <- ifelse(iq.f$mom_work == "4", 1, 0)
273
274 iq.f$mom_hs <- scale(iq.f$mom_hs, scale=FALSE)
275 iq.f$mom_work1 <- scale(iq.f$mom_work1, scale=FALSE)
276 iq.f$mom_work2 <- scale(iq.f$mom_work2, scale=FALSE)
277 iq.f$mom_work3 <- scale(iq.f$mom_work3, scale=FALSE)
278 iq.f$mom_work4 <- scale(iq.f$mom_work4, scale=FALSE)
279
280 iq.f.lm.int3 <- lm( kid_score ~ mom_hs * (mom_work1 + mom_work2 + mom_work3), iq.f)
281 display(iq.f.lm.int3)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.