# lexdecRT.R
library(languageR)
data(english)

str(english)
help(english)

# Data from: Balota, D., Cortese, M., Sergent-Marshall, S., Spieler,
# D. and Yap, M. (2004) Visual word recognition for single-syllable
# words, Journal of Experimental Psychology: General, 133, 283-316.
#
# For best model, see Baayen, R.H., Feldman, L. and Schreuder, R. (2006)
# Morphological influences on the recognition of monosyllabic monomorphemic
# words, Journal of Memory and Language, 53, 496-512.

# NOTE(review): 'Design' is the predecessor of the 'rms' package; the
# ols()/datadist() API used below is the same in both.
library(Design)
english.dd <- datadist(english)
options(datadist = "english.dd")
# The data contain average lexical decision times on 2284 English words
# taken from both young and old subjects.

# Question 1: Does age affect RTs?
# Step 1: Understand the age variable.
summary(english$AgeSubject)
with(english, table(AgeSubject, Word))
with(english, table(AgeSubject, paste(Word, WordCategory, sep = ".")))

# We would predict that older subjects have slower RTs.
l1 <- ols(RTlexdec ~ AgeSubject,
          data = english)

# default: test = "F"
anova(l1)
anova(l1, test = "Chisq")

# ok, so Age matters, but how?
print(l1)

# what does this effect translate into?
# i.e. how many msecs faster are young subjects on average?
# what else would we want to say in a summary of this result?

# Age has a strong effect on log RT; what about word frequency?
# We would expect that more frequent words are faster to decide on.
e <- english

# just for entertainment, let's back-transform CELEX frequencies
# to the original counts (we will use the log below)
e$WrittenFrequency <- exp(e$WrittenFrequency)
par(mfrow = c(1, 2))
hist(e$WrittenFrequency, breaks = 20, main = "CELEX frequencies")
hist(log(e$WrittenFrequency), breaks = 20, main = "CELEX log frequencies")
par(mfrow = c(1, 1))

l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
          data = e)
l2

# (1) how would you describe the frequency effect?
#     think about high vs. low values (use quantile())
# (2) how would you summarize the model? What *exactly* do the coefficients mean?

# the frequency coefficient goes in the right direction
# judging from the summary of the model, do you think the
# frequency and age effects are independent of each other?
# or are they collinear?
# (think about l1 vs. l2 and the coefficients)
vif(l2)
# also: kappa(l2, exact = TRUE)

# what influences lexical decision times more, age or frequency?
plot(anova(l2), what = "aic")
plot(anova(l2), what = "partial R2")
plot(anova(l2), what = "remaining R2")
plot(anova(l2), what = "proportion R2")

# plot the partial effects as predicted by the model
par(mfrow = c(4, 3), mar = c(4, 4, 1, 1), oma = rep(1, 4))
plot(l2, adj.subtitle = FALSE, ylim = c(6.4, 7), conf.int = FALSE)
par(mfrow = c(1, 1))

# interaction model: sum-coded age (+1 young, -1 old) crossed with
# centered log frequency.  Since data = e, the formula can refer to
# AgeSubject directly (the original used e$AgeSubject -- same result,
# but referencing the data argument is the idiomatic form).
l2.int <- ols(RTlexdec ~ I(ifelse(AgeSubject == "young", 1, -1)) *
                as.numeric(scale(log(WrittenFrequency), scale = FALSE)),
              data = e)
l2.int

# ---- validate the model
# NOTE: this first call fails on purpose (teaching point):
validate(l2, bw = TRUE, B = 200)
# it creates an error because we need to save the
# x (design matrix) and y (response) when we fit
# the model
l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
          data = e, x = TRUE, y = TRUE)
summary(l2$y)
summary(l2$x)
validate(l2, method = "crossvalidation", bw = TRUE, B = 200)

# plot predicted vs. actual outcome to see where the model is good/bad
plot(calibrate(l2, B = 40))

## ------------------------------------------------------------
# Backing up: You're at a conference and somebody asks you
# whether the log-transform of RTs is justified. How do you
# determine this?
## ------------------------------------------------------------

# (1) comparison of a priori distributions
par(mfrow = c(1, 2))
hist(e$RTlexdec)
hist(exp(e$RTlexdec))
# also use shapiro.test()
par(mfrow = c(1, 1))

# (2) comparison of qualities of models fit to that dependent variable
l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
              data = e, x = TRUE, y = TRUE)
l2$stats["R2"]
l2.exp$stats["R2"]

# (3) comparison of residuals of models fit to that dependent variable
hist(resid(l2))
qqnorm(resid(l2))
qqline(resid(l2), col = "blue")

# (refits the identical l2.exp model; redundant, but kept so this
# section runs on its own)
l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
              data = e, x = TRUE, y = TRUE)
hist(resid(l2.exp))
qqnorm(resid(l2.exp))
qqline(resid(l2.exp), col = "blue")
plot(calibrate(l2.exp, B = 40))

## ------------------------------------------------------------
# Further predictors
#
# Step 1: (building on l2)
# Group 1: Phonology 1 (CV, voice, obstruent, frication)
# Group 2: Phonology 2 (FrequencyInitialDiphoneWord,
#                       FrequencyInitialDiphoneSyllable)
# Group 3: CorrectLexdec and interactions with WrittenFrequency,
#          non-linearity of WrittenFrequency (rcs)

help(english)

# Group 1
l4.1 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
              CV + Voice + Obstruent,
            data = e, x = TRUE, y = TRUE)
l4.1
vif(l4.1)

with(e, table(Voice, Obstruent, Frication))
l4.1.cv <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
                 CV,
               data = e, x = TRUE, y = TRUE)
l4.1.cv
l4.1.cvv <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
                  CV + Voice,
                data = e, x = TRUE, y = TRUE)
l4.1.cvv
anova(l4.1.cvv)
validate(l4.1.cvv, bw = TRUE, B = 200)

l4.1.v <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
                Voice,
              data = e, x = TRUE, y = TRUE)
l4.1.v  # BUGFIX: the original printed l4.1.cv here, not the model just fit

# Group 2
l4.2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
              FrequencyInitialDiphoneWord + FrequencyInitialDiphoneSyllable,
            data = e, x = TRUE, y = TRUE)
l4.2
vif(l4.2)
fastbw(l4.2)

# Group 3
str(e$CorrectLexdec)
l4.3 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
              CorrectLexdec,
            data = e, x = TRUE, y = TRUE)
l4.3
anova(l4.3)

# interaction of centered log frequency with centered CorrectLexdec
l4.3.int <- ols(RTlexdec ~ AgeSubject +
                  as.numeric(scale(log(WrittenFrequency), scale = FALSE)) *
                  as.numeric(scale(CorrectLexdec, scale = FALSE)),
                data = e, x = TRUE, y = TRUE)
l4.3.int
vif(l4.3.int)
anova(l4.3.int)

# new datadist for e, since WrittenFrequency was back-transformed above
dd <- datadist(e)
options(datadist = "dd")
l4.3.rcs <- ols(RTlexdec ~ AgeSubject + rcs(WrittenFrequency, 3) +
                  CorrectLexdec,
                data = e, x = TRUE, y = TRUE)
l4.3.rcs
par(mfrow = c(2, 2))
plot(l4.3.rcs)
par(mfrow = c(1, 1))  # restore layout (the original leaked the 2x2 grid)

# Step 2: (again building on l2)
# Group 1: NounFrequency, VerbFrequency, or both?
# Group 2: FamilySize
# Group 3: DerivationalEntropy, InflectionalEntropy

# Step 3: combine insights from Step 2 and assess to what extent the partial
# effects hold in the full model
# (FIX: the second line above had lost its leading '#', which was a
# syntax error)

# Step 4: (building on output of step 3)
# Group 1: Does context-sensitive probability matter (MeanBigramFrequency)
#          beyond frequency?
# Group 2: Frequency vs. subjective Familiarity

## ------------------------------------------------------------

# 1) understand the predictors, i.e. write/think about how you would
#    write up a description of the variable, its levels/transform, etc.
#    and maybe outstanding distributional properties
#
# 2) enter the predictor(s) into the model and
#    (a) assess the direction and possible significance of their effect
#    (b) assess how they improve the model (partial R-square, AIC
#        associated with removal, etc.)
#    (c) if the predictor is continuous, plot the effects
#    (d) validate the model
#        (d-1) validate(), calibrate()
#        (d-2) influence() to assess influence of individual data points

# dfbetas() works with lm(), not ols()
l2.lm <- lm(RTlexdec ~ AgeSubject + log(WrittenFrequency),
            data = e)
plot(dfbetas(l2.lm))
# interactive: click near points to label them (Esc / right-click to stop)
identify(locator())

help(influence.measures)

# additionally
# 3) for continuous predictors: look at the data to see
#    whether a non-linear effect may be justified. If you applied a
#    transformation, is it justified? (also consider plot(), curve()
#    to plot the regression predictions on top of a scatterplot of the
#    data).
#
# 4) are there possible interactions? Think about how you would code them.
#    Beware: interactions almost always bring up the issue of collinearity

# the complete model

# ---- orthogonalize orthographic consistency measures
# PCA on the young-subject rows only; columns 18:27 are assumed to hold
# the orthographic consistency measures -- TODO confirm via help(english).
items <- english[english$AgeSubject == "young", ]
# FIX: spell out 'scale.' (prcomp's formal argument); the original's
# 'scale = TRUE' only worked via partial argument matching.
items.pca <- prcomp(items[, 18:27], center = TRUE, scale. = TRUE)
x <- as.data.frame(items.pca$rotation[, 1:4])  # loadings, for inspection
items$PC1 <- items.pca$x[, 1]
items$PC2 <- items.pca$x[, 2]
items$PC3 <- items.pca$x[, 3]
items$PC4 <- items.pca$x[, 4]
items2 <- english[english$AgeSubject != "young", ]
# NOTE(review): reusing the young-subject scores here assumes the
# old-subject rows are the same words in the same order -- confirm
# for this data set before trusting PC1..PC4 on items2.
items2$PC1 <- items.pca$x[, 1]
items2$PC2 <- items.pca$x[, 2]
items2$PC3 <- items.pca$x[, 3]
items2$PC4 <- items.pca$x[, 4]
english <- rbind(items, items2)

# ---- add Noun-Verb frequency ratio (+1 smoothing avoids log(0))
english$NVratio <- log(english$NounFrequency + 1) - log(english$VerbFrequency + 1)

english.dd <- datadist(english)
options(datadist = "english.dd")

english.ols <- ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency +
                     rcs(WrittenFrequency, 5) +
                     rcs(WrittenSpokenFrequencyRatio, 3) +
                     NVratio + WordCategory + AgeSubject +
                     rcs(FamilySize, 3) + InflectionalEntropy +
                     NumberComplexSynsets +
                     rcs(WrittenFrequency, 5) : AgeSubject,
                   data = english, x = TRUE, y = TRUE)
# end of lexdecRT.R