## Download the kidiq dataset and read it into an R data frame.
## NOTE(review): the URL is empty in this copy of the script; set it to
## the location of kidiq.dta before running on a machine without a local
## copy of the file.
kidiq_url <- ""
kidiq_file <- "kidiq.dta"

## Reuse a local copy when there is one; otherwise fail with a clear
## message instead of handing download.file() an empty URL.
if (!file.exists(kidiq_file)) {
  if (!nzchar(kidiq_url)) {
    stop("kidiq_url is empty and '", kidiq_file, "' was not found locally",
         call. = FALSE)
  }
  ## Stata .dta files are binary: mode = "wb" prevents the line-ending
  ## translation that corrupts binary downloads on Windows.
  download.file(url = kidiq_url, destfile = kidiq_file, mode = "wb")
}

## read.dta() comes from the foreign package; the qualified call works
## whether or not library(foreign) was attached earlier in the file.
## The rest of the script uses dotted column names (kid.score, mom.hs,
## ...), so underscores in the Stata variable names are converted
## explicitly rather than relying on the package default.
## NOTE(review): assumes the .dta file stores the names with underscores
## -- harmless if it does not; confirm against the file.
iq <- foreign::read.dta(kidiq_file, convert.underscore = TRUE)
###### Using factors for categorical predictors
## Coding categorical predictors as factors can be convenient because
## a) it changes the summary and print behavior for that predictor,
## b) factors are often handled differently than continuous predictors
##    in plotting, and
## c) R will keep you from treating categorical predictors as continuous
##    ones (unless you explicitly recode a factor as numeric).
iq.f <- iq

## If we want informative names for the levels of the predictor, we
## specify which levels are observed in the data, and how we want to
## label them. The default is to use the levels themselves as the
## labels.
iq.f$mom.hs <- factor(iq$mom.hs,
                      levels = c(0, 1),
                      labels = c("no", "yes"))

## If we don't need special labels, as.factor() is a quick conversion
## method.
## NOTE(review): the column name was missing from this line in the copy
## under review; mom.work is restored from the mom.work1..mom.work4
## indicators that the script builds later -- confirm.
iq.f$mom.work <- as.factor(iq$mom.work)

## ordered() and as.ordered() are just variations on factor() and
## as.factor(), with the argument ordered=TRUE. ifelse() can be very
## useful for recoding a variable, but it can get to be hard to read
## if you nest too many of them. In that case, use either a full set
## of if/else if/else, or use switch().
iq.f$mom.age <- ordered(ifelse(iq$mom.age <= 21, 0,
                               ifelse(iq$mom.age <= 25, 1, 2)),
                        levels = c(0, 1, 2),
                        labels = c("early", "mid", "late"))
###### Dummy coding (treatment contrasts) for binary predictors
## Hypothesis under test: kids' scores differ depending on whether or
## not the mother went to high school. First with the numeric 0/1
## predictor ...
iq.lm <- lm(formula = kid.score ~ mom.hs, data = iq)
library(arm)                            # provides display()
display(iq.lm)

## ... then the same model refitted on the data frame in which mom.hs
## is stored as a factor.
iq.f.lm <- update(iq.lm, data = iq.f, evaluate = TRUE)
display(iq.f.lm)

## Both fits agree because R codes the two factor levels as 0 and 1,
## exactly as in the original numeric column (dummy, a.k.a. treatment,
## coding). The contrast matrix shows this:
contrasts(iq.f$mom.hs)

## Which coding is used is governed by the "contrasts" option: its
## first element names the contrast function for unordered factors,
## its second the one for ordered factors.
getOption("contrasts")

## Setting the option explicitly to R's default values looks like this.
options(contrasts = c("contr.treatment", "contr.poly"))
###### Contrast coding (sum-to-zero contrasts) for binary predictors
contrasts(iq.f$mom.hs) <- contr.sum(2)
contrasts(iq.f$mom.hs)

## contr.sum() gives -1 to the last level of the factor, which by
## default is the one that sorts last alphabetically. Either set the
## codes by hand ...
iq.f$mom.hs <- C(iq.f$mom.hs, c(-1, 1))   # instead of c(1, -1)
contrasts(iq.f$mom.hs)

## ... or change the internal order of the levels and reapply
## contr.sum().
iq.f$mom.hs <- factor(iq$mom.hs, levels = c(1, 0), labels = c("yes", "no"))
contrasts(iq.f$mom.hs) <- contr.sum(2)
contrasts(iq.f$mom.hs)

## Hypothesis under test: a kid whose mom went to high school scores
## higher than "the average kid". This reading assumes balanced data,
## which we don't have.
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## With balanced data the intercept should be the grand mean. Build a
## balanced subset from the first 90 rows of each group and compare
## the mean score with the fitted intercept.
iq.f.bal <- rbind(
  iq.f[iq.f$mom.hs == "no", ][1:90, ],
  iq.f[iq.f$mom.hs == "yes", ][1:90, ]
)
iq.f.bal$mom.hs <- C(iq.f.bal$mom.hs, contr.sum)
mean(iq.f.bal$kid.score)
iq.f.bal.lm <- update(iq.f.lm, data = iq.f.bal)

display(iq.f.bal.lm)

## Flipping the contrast codes on balanced data flips the sign of the
## coefficient.
iq.f.bal$mom.hs <- C(iq.f.bal$mom.hs, c(-1, 1))
iq.f.bal.lm <- update(iq.f.bal.lm)

display(iq.f.bal.lm)
###### Centered binary predictors
## For balanced data, contrast coding (sum-to-zero coding) tests the
## hypothesis that a given level of a factor differs from the mean
## value of the factor. To test the same hypothesis on unbalanced
## data, center the binary predictor.

## By hand, starting from the numeric coding: rescale to 0/1 first ...
iq$mom.hs <- (iq$mom.hs - min(iq$mom.hs)) / (max(iq$mom.hs) - min(iq$mom.hs))
## ... then subtract the mean to center.
iq$mom.hs <- iq$mom.hs - mean(iq$mom.hs)

unique(iq$mom.hs)          # still two distinct values, now centered

iq.lm <- update(iq.lm)     # intercept should be the grand mean
display(iq.lm)

## To do the same for a factor, convert it back to a numeric coding
## first (via the level labels, not the internal integer codes).
iq.f$mom.hs <- as.factor(ifelse(iq.f$mom.hs == "yes", 1, 0))
iq.f$mom.hs <- as.numeric(levels(iq.f$mom.hs))[iq.f$mom.hs]
iq.f$mom.hs <- (iq.f$mom.hs - min(iq.f$mom.hs)) /
  (max(iq.f$mom.hs) - min(iq.f$mom.hs))
## Same as scale(iq.f$mom.hs, center = TRUE, scale = FALSE).
iq.f$mom.hs <- iq.f$mom.hs - mean(iq.f$mom.hs)
unique(iq.f$mom.hs)
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## The arm package also offers rescale() to center a single binary
## predictor and standardize() to center every binary predictor in a
## model. Methods from arm tend not to work very well with categorical
## predictors stored as factors. Hopefully this will change soon.
###### Categorical predictors with more than two levels
## Storing a categorical predictor as a factor with more than two
## levels leads to a series of comparisons between levels of the
## factor. By default, R uses dummy coding (treatment contrasts),
## comparing each level to a reference level.
## NOTE(review): the predictor name was missing throughout this section
## in the copy under review; mom.work is restored from the
## mom.work1..mom.work4 indicators built later in the script -- confirm.
levels(iq.f$mom.work)
contrasts(iq.f$mom.work)
iq.f.lm <- update(iq.f.lm, kid.score ~ mom.work)
display(iq.f.lm)

## Right now, each level is compared to the level coded as 1. We can
## change this. The reference level is given by its label so that the
## call does not depend on the current position of the level.
iq.f$mom.work <- relevel(iq.f$mom.work, ref = "4")
contrasts(iq.f$mom.work)              # now everything is compared
                                      # to the level coded as 4
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## We can also use contrast coding, comparing each level to the mean
## across all levels.
contrasts(iq.f$mom.work) <- contr.sum(4)
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)

## If we want to test the second, third, and fourth levels instead, we
## need to reorder the levels of the factor.
levels(iq.f$mom.work)
iq.f$mom.work <- factor(iq.f$mom.work, levels = c(4, 3, 2, 1))
contrasts(iq.f$mom.work) <- contr.sum(4)
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)
contrasts(iq.f$mom.work)

## Notice that the names used in the lm() results didn't change, even
## though we're doing different comparisons now. This can be a pain
## and can make it harder to interpret the comparisons that are
## carried out. Make sure that you always know which comparison
## corresponds to which label. If you want to insert your own labels,
## name the columns of the contrast matrix:
dimnames(contrasts(iq.f$mom.work))[[2]] <- c("4", "3", "2")
iq.f.lm <- update(iq.f.lm)
display(iq.f.lm)
## Using contrast coding assumes that all of the levels are balanced.
## If this is not true, we get the same problems as in the case of
## binary predictors (intercept is hard to interpret, works poorly in
## interactions, ...). Ideally, we want to center each pairwise
## comparison. This can be done by recoding the index variable into a
## set of indicator variables, and centering as shown above. If
## anyone knows of a function that sets centered contrasts for a
## series of comparisons, please let me know.
## NOTE(review): the source column name was missing from the four
## ifelse() calls in the copy under review; mom.work is restored from
## the names of the indicators they create -- confirm.
iq.f$mom.work1 <- ifelse(iq.f$mom.work == "1", 1, 0)
iq.f$mom.work2 <- ifelse(iq.f$mom.work == "2", 1, 0)
iq.f$mom.work3 <- ifelse(iq.f$mom.work == "3", 1, 0)
iq.f$mom.work4 <- ifelse(iq.f$mom.work == "4", 1, 0)

## Center each indicator (subtract its mean, leave the scale alone).
iq.f$mom.work1 <- scale(iq.f$mom.work1, scale = FALSE)
iq.f$mom.work2 <- scale(iq.f$mom.work2, scale = FALSE)
iq.f$mom.work3 <- scale(iq.f$mom.work3, scale = FALSE)
iq.f$mom.work4 <- scale(iq.f$mom.work4, scale = FALSE)
unique(iq.f$mom.work1)
unique(iq.f$mom.work2)
unique(iq.f$mom.work3)
unique(iq.f$mom.work4)

## Remember that we can only test three of the four levels, because
## one df is used in estimating the mean across all levels. Also,
## after centering all indicator variables, the intercept should
## correspond to the grand mean.
iq.f.lm <- update(iq.f.lm,
                  kid.score ~ mom.work1 + mom.work2 + mom.work3,
                  data = iq.f)
display(iq.f.lm)
## or
## update(iq.f.lm, kid.score ~ mom.work2 + mom.work3 + mom.work4, data = iq.f)
###### Ordered factors
## With an ordered categorical predictor we can test whether moving
## from one level to the next has an effect on the dependent
## variable. This is not the same as using a continuous predictor,
## because the levels of an ordered factor aren't necessarily
## equidistant.

## Numeric (linear) predictor: the intercept corresponds to a mother
## with age 0. Author's note on the result: marginal linear effect.
iq.lm <- update(iq.lm, formula. = kid.score ~ mom.age, data = iq)

## Ordered categorical predictor. Author's note on the result:
## marginal linear effect, no quadratic effect.
iq.f.lm <- update(iq.f.lm, formula. = kid.score ~ mom.age, data = iq.f)
display(iq.f.lm)

## Ordered predictors use polynomial contrasts by default. The column
## names of the contrast matrix indicate the polynomial term being
## estimated: (L)inear, (Q)uadratic, (C)ubic, ^4, ^5, ...
contr.poly(3)

## The other option that is sometimes used for ordered predictors is
## Helmert contrasts (contr.helmert).
###### Interactions and categorical predictors
## Categorical predictors should be centered before being entered into
## an interaction. Otherwise, correlations between the levels of the
## factors change the results of the interaction.

## Reset the data frame. Underscores in the Stata variable names are
## converted explicitly because the script uses dotted column names.
## NOTE(review): assumes the .dta file stores the names with
## underscores -- harmless if it does not; confirm against the file.
iq <- foreign::read.dta(kidiq_file, convert.underscore = TRUE)
iq.f <- iq
iq.f$mom.hs <- factor(iq$mom.hs,
                      levels = c(0, 1),
                      labels = c("no", "yes"))

## NOTE(review): the predictor name was missing throughout this section
## in the copy under review; mom.work is restored from the
## mom.work1..mom.work4 indicators built below -- confirm.
iq.f$mom.work <- factor(iq$mom.work,
                        levels = c(1, 2, 3, 4))

iq.f$mom.age <- ordered(ifelse(iq$mom.age <= 21, 0,
                               ifelse(iq$mom.age <= 25, 1, 2)),
                        levels = c(0, 1, 2),
                        labels = c("early", "mid", "late"))

## Compare dummy coding ...
contrasts(iq.f$mom.hs)
contrasts(iq.f$mom.work)
## NOTE(review): two terms of this formula were missing in the copy
## under review. The main effect mom.work and the interaction
## mom.hs:mom.work are reconstructed from the topic of this section and
## from the centered model at the end of it -- confirm.
iq.f.lm.int1 <- lm(kid.score ~ mom.hs + mom.work + mom.hs:mom.work, iq.f)
display(iq.f.lm.int1)

## ... to contrast coding. Note that this changes the global default
## for every model fitted from here on.
getOption("contrasts")
options(contrasts = c("contr.sum", "contr.poly"))
getOption("contrasts")

contrasts(iq.f$mom.hs)
contrasts(iq.f$mom.work)
iq.f.lm.int2 <- update(iq.f.lm.int1)
## Show this fit as well, so that all three codings can be compared.
display(iq.f.lm.int2)

## ... to centered predictors: recode every level as a 0/1 indicator
## and subtract its mean.
iq.f$mom.hs <- ifelse(iq.f$mom.hs == "yes", 1, 0)
iq.f$mom.work1 <- ifelse(iq.f$mom.work == "1", 1, 0)
iq.f$mom.work2 <- ifelse(iq.f$mom.work == "2", 1, 0)
iq.f$mom.work3 <- ifelse(iq.f$mom.work == "3", 1, 0)
iq.f$mom.work4 <- ifelse(iq.f$mom.work == "4", 1, 0)

iq.f$mom.hs <- scale(iq.f$mom.hs, scale = FALSE)
iq.f$mom.work1 <- scale(iq.f$mom.work1, scale = FALSE)
iq.f$mom.work2 <- scale(iq.f$mom.work2, scale = FALSE)
iq.f$mom.work3 <- scale(iq.f$mom.work3, scale = FALSE)
iq.f$mom.work4 <- scale(iq.f$mom.work4, scale = FALSE)

## Only three of the four indicators enter the model; the fourth is
## determined by the other three.
iq.f.lm.int3 <- lm(kid.score ~ mom.hs * (mom.work1 + mom.work2 + mom.work3),
                   iq.f)
display(iq.f.lm.int3)
