Attachment 'lexdecRT.R'
Download 1 library(languageR)
2 data(english)
3
4 str(english)
5 help(english)
6
7 # Data from: Balota, D., Cortese, M., Sergent-Marshall, S., Spieler, D. and Yap, M. (2004) Visual word recognition for single-syllable words, Journal of Experimental Psychology:General, 133, 283-316.
8 #
9 # For best model, see Baayen, R.H., Feldman, L. and Schreuder, R. (2006) Morphological influences on the recognition of monosyllabic monomorphemic words, Journal of Memory and Language, 53, 496-512.
10
11 library(Design)
12 english.dd = datadist(english)
13 options(datadist = 'english.dd')
14
15 # The data contain average lexical decision times on 2284 English word
16 # taken from both young and old subjects
17
18 # Question 1: Does age affect RTs?
19 # Step 1: Understand age variable
20 summary(english$AgeSubject)
21 with(english, table(AgeSubject, Word))
22 with(english, table(AgeSubject, paste(Word, WordCategory, sep=".")))
23
24 # We would predict that older subjects have slower RTs
25 l1 = ols(RTlexdec ~ AgeSubject,
26 data = english)
27
28 # default: test="F"
29 anova(l1)
30 anova(l1, test="Chisq")
31
32 # ok, so Age matters, but how?
33 print(l1)
34
35 # what does this effect translate into?
36 # i.e. how many msecs faster are young subjects on average?
37 # what else would be want to say in a summary of this result?
38
39
40 # Age has a strong effect on log RT, what about word frequency?
41 # We would expect that more frequent words are faster to decide on
42 e <- english
43
44 # just for entertainment, let's back-transform CELEX frequencies
45 # to the original counts (we will use the log below)
46 e$WrittenFrequency <- exp(e$WrittenFrequency)
47 par(mfrow=c(1,2))
48 hist(e$WrittenFrequency, breaks=20, main="CELEX frequencies")
49 hist(log(e$WrittenFrequency), breaks=20, main="CELEX log frequencies")
50 par(mfrow=c(1,1))
51
52 l2 = ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
53 data = e)
54 l2
55
56 # (1) how would you describe the frequency effect?
57 # think about high vs. low values (use quantiles())
58 # (2) how would you summarize the model? What *exactly* do the coefficients mean?
59
60 # the frequency coefficient goes in the right direction
61 # judging from the summary of the model, do you think the
62 # frequency and age effect are independent of each other?
63 # or are they collinear?
64 # (think about l1 vs. l2 and the coefficients)
65 vif(l2)
66 # also:kappa(l2, exact=T)
67
68 # what influences lexical decision times more, age or frequency?
69 plot(anova(l2), what="aic")
70 plot(anova(l2), what="partial R2")
71 plot(anova(l2), what="remaining R2")
72 plot(anova(l2), what="proportion R2")
73
74 # plot the partial effects as predicted by the model
75 par(mfrow = c(4, 3), mar = c(4, 4, 1, 1), oma = rep(1, 4))
76 plot(l2, adj.subtitle = FALSE, ylim = c(6.4, 7), conf.int = FALSE)
77 par(mfrow = c(1, 1))
78
79 # ---- validate the model
80 validate(l2, bw = TRUE, B = 200)
81 # creates an error because we need to save the
82 # x (design matrix) and y (response) when we fit
83 # the model
84 l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
85 data = e, x=T, y=T)
86 summary(l2$y)
87 summary(l2$x)
88 validate(l2, method="crossvalidation", bw = TRUE, B = 200)
89
90 # plot predicted vs. actual outcome to see where the model is good/bad
91 plot(calibrate(l2, B= 40))
92
93
94 ## ------------------------------------------------------------
95 # Backing up: You're at a conference and somebody asks you
96 # whether the log-transform of RTs is justfied. How do you
97 # determine this?
98 ## ------------------------------------------------------------
99
100
101
102
103 # (1) comparison of a priori distribution
104 hist(e$RTlexdec)
105 hist(exp(e$RTlexdec))
106 # also use shapiro.test()
107
108 # (2) comparison of qualities of models fit to that dependent variable
109 l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
110 data = e, x=T, y=T)
111 l2$stats["R2"]
112 l2.exp$stats["R2"]
113
114 # (3) comparison of residuals of models fit to that dependent variable
115 hist(resid(l2))
116 qqnorm(resid(l2))
117 qqline(resid(l2), col="blue")
118
119 l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
120 data = e, x=T, y=T)
121 hist(resid(l2.exp))
122 qqnorm(resid(l2.exp))
123 qqline(resid(l2.exp), col="blue")
124 plot(calibrate(l2.exp, B= 40))
125
126
127 ## ------------------------------------------------------------
128 # Further predictors
129 #
130 # Step 1: (building on l2)
131 # Group 1: Phonology 1 (CV, voice, obstruent, frication)
132 # Group 2: Phonology 2 (FrequencyInitialDiphoneWord, FrequencyInitialDiphoneSyllable)
133 # Group 3: CorrectLexdec and interactions with WrittenFrequency, non-linearity of WrittenFrequency (rcs)
134
135 # Step 2: (again building on l2)
136 # Group 1: NounFrequency, VerbFrequency, or both?
137 # Group 2: FamilySize
138 # Group 3: DerivationalEntropy, InflectionalEntropy
139
140 # Step 3: combine insights from Step2 and assess to what extent the partial
141 effects hold in the full model
142
143 # Step 4: (building on output of step 3)
144 # Group 1: Does context-sensitive probability matter (MeanBigramFrequency) beyond frequency?
145 # Group 2: Frequency vs. subjective Familiarity
146
147 ## ------------------------------------------------------------
148
149 # 1) understand the predictors, i.e. write/think about how you would
150 # write up a description of the variable, its levels/transform, etc.
151 # and maybe outstanding distributional properties
152 #
153 # 2) enter the predictors(s) into the model and
154 # (a) assess the direction and possible significance of their effect
155 # (b) assess how they improve the model (partial R-square, AIC
156 # associated with removal, etc.)
157 # (c) if the predictor is continuous, plot the effects
158 # (d) validate the model
159 # (d-1) validate(), calibrate()
160 # (d-2) influence() to assess influence of individual data points
161
162 # works with lm(), not ols()
163 l2.lm <- lm(RTlexdec ~ AgeSubject + log(WrittenFrequency),
164 data = e)
165 plot(dfbetas(l2.lm))
166 identify(locator())
167
168 help(influence.measures)
169
170 # additionally
171 # 3) for continuous predictors: look at the data to see
172 # whether a non-linear effect may be justfied? If you applied a
173 # transformation, is it justified? (also consider, plot(), curve()
174 # to plot the regression predictions on top of a scatterplot of the
175 # data).
176 #
177 # 4) are their possible interactions? think about how you would code them.
178 # Beware: interactions almost always will bring up the issue of collinearity
179
180
181
182
183
184 e.ols = ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency +
185 rcs(WrittenFrequency, 5) + rcs(WrittenSpokenFrequencyRatio, 3) +
186 NVratio + WordCategory + AgeSubject +
187 rcs(FamilySize, 3) + InflectionalEntropy +
188 NumberComplexSynsets + rcs(WrittenFrequency, 5) : AgeSubject,
189 data = english, x = TRUE, y = TRUE)
190
191 # ---- orthogonalize orthographic consistency measures
192 items = english[english$AgeSubject == "young",]
193 items.pca = prcomp(items[ , c(18:27)], center = TRUE, scale = TRUE)
194 x = as.data.frame(items.pca$rotation[,1:4])
195 items$PC1 = items.pca$x[,1]
196 items$PC2 = items.pca$x[,2]
197 items$PC3 = items.pca$x[,3]
198 items$PC4 = items.pca$x[,4]
199 items2 = english[english$AgeSubject != "young", ]
200 items2$PC1 = items.pca$x[,1]
201 items2$PC2 = items.pca$x[,2]
202 items2$PC3 = items.pca$x[,3]
203 items2$PC4 = items.pca$x[,4]
204 english = rbind(items, items2)
205
206 # ---- add Noun-Verb frequency ratio
207 english$NVratio = log(english$NounFrequency+1)-log(english$VerbFrequency+1)
208
209 english.dd = datadist(english)
210 options(datadist = 'english.dd')
211
212 english.ols = ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency +
213 rcs(WrittenFrequency, 5) + rcs(WrittenSpokenFrequencyRatio, 3) +
214 NVratio + WordCategory + AgeSubject +
215 rcs(FamilySize, 3) + InflectionalEntropy +
216 NumberComplexSynsets + rcs(WrittenFrequency, 5) : AgeSubject,
217 data = english, x = TRUE, y = TRUE)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.