# lexdecRT.R
library(languageR)
data(english)

str(english)
help(english)

# Data from: Balota, D., Cortese, M., Sergent-Marshall, S., Spieler,
# D. and Yap, M. (2004) Visual word recognition for single-syllable
# words, Journal of Experimental Psychology: General, 133, 283-316.
#
# For best model, see Baayen, R.H., Feldman, L. and Schreuder, R. (2006)
# Morphological influences on the recognition of monosyllabic monomorphemic
# words, Journal of Memory and Language, 53, 496-512.

# NOTE(review): 'Design' is the predecessor of the 'rms' package; the
# ols()/datadist() API used below is the same in both.
library(Design)
english.dd <- datadist(english)
options(datadist = "english.dd")
# The data contain average lexical decision times on 2284 English words
# taken from both young and old subjects.

# Question 1: Does age affect RTs?
# Step 1: Understand the age variable.
summary(english$AgeSubject)
with(english, table(AgeSubject, Word))
with(english, table(AgeSubject, paste(Word, WordCategory, sep = ".")))

# We would predict that older subjects have slower RTs.
l1 <- ols(RTlexdec ~ AgeSubject,
          data = english)

# default: test = "F"
anova(l1)
anova(l1, test = "Chisq")

# ok, so Age matters, but how?
print(l1)

# what does this effect translate into?
# i.e. how many msecs faster are young subjects on average?
# what else would we want to say in a summary of this result?

# Age has a strong effect on log RT; what about word frequency?
# We would expect that more frequent words are faster to decide on.
e <- english

# just for entertainment, let's back-transform CELEX frequencies
# to the original counts (we will use the log below)
e$WrittenFrequency <- exp(e$WrittenFrequency)
par(mfrow = c(1, 2))
hist(e$WrittenFrequency, breaks = 20, main = "CELEX frequencies")
hist(log(e$WrittenFrequency), breaks = 20, main = "CELEX log frequencies")
par(mfrow = c(1, 1))

l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
          data = e)
l2

# (1) how would you describe the frequency effect?
#     think about high vs. low values (use quantile())
# (2) how would you summarize the model? What *exactly* do the coefficients mean?

# the frequency coefficient goes in the right direction
# judging from the summary of the model, do you think the
# frequency and age effects are independent of each other?
# or are they collinear?
# (think about l1 vs. l2 and the coefficients)
vif(l2)
# also: kappa(l2, exact = TRUE)

# what influences lexical decision times more, age or frequency?
plot(anova(l2), what = "aic")
plot(anova(l2), what = "partial R2")
plot(anova(l2), what = "remaining R2")
plot(anova(l2), what = "proportion R2")

# plot the partial effects as predicted by the model
par(mfrow = c(4, 3), mar = c(4, 4, 1, 1), oma = rep(1, 4))
plot(l2, adj.subtitle = FALSE, ylim = c(6.4, 7), conf.int = FALSE)
par(mfrow = c(1, 1))

# interaction model: sum-coded age (+1 young, -1 old) crossed with
# centered log frequency.  Since data = e, the formula can refer to
# AgeSubject directly (the original used e$AgeSubject -- same result,
# but referencing the data argument is the idiomatic form).
l2.int <- ols(RTlexdec ~ I(ifelse(AgeSubject == "young", 1, -1)) *
                as.numeric(scale(log(WrittenFrequency), scale = FALSE)),
              data = e)
l2.int

# ---- validate the model
# NOTE: this first call fails on purpose (teaching point):
validate(l2, bw = TRUE, B = 200)
# it creates an error because we need to save the
# x (design matrix) and y (response) when we fit
# the model
l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
          data = e, x = TRUE, y = TRUE)
summary(l2$y)
summary(l2$x)
validate(l2, method = "crossvalidation", bw = TRUE, B = 200)

# plot predicted vs. actual outcome to see where the model is good/bad
plot(calibrate(l2, B = 40))

## ------------------------------------------------------------
# Backing up: You're at a conference and somebody asks you
# whether the log-transform of RTs is justified. How do you
# determine this?
## ------------------------------------------------------------

# (1) comparison of a priori distributions
par(mfrow = c(1, 2))
hist(e$RTlexdec)
hist(exp(e$RTlexdec))
# also use shapiro.test()
par(mfrow = c(1, 1))

# (2) comparison of qualities of models fit to that dependent variable
l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
              data = e, x = TRUE, y = TRUE)
l2$stats["R2"]
l2.exp$stats["R2"]

# (3) comparison of residuals of models fit to that dependent variable
hist(resid(l2))
qqnorm(resid(l2))
qqline(resid(l2), col = "blue")

# (refits the identical l2.exp model; redundant, but kept so this
# section runs on its own)
l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
              data = e, x = TRUE, y = TRUE)
hist(resid(l2.exp))
qqnorm(resid(l2.exp))
qqline(resid(l2.exp), col = "blue")
plot(calibrate(l2.exp, B = 40))

## ------------------------------------------------------------
# Further predictors
#
# Step 1: (building on l2)
# Group 1: Phonology 1 (CV, voice, obstruent, frication)
# Group 2: Phonology 2 (FrequencyInitialDiphoneWord,
#                       FrequencyInitialDiphoneSyllable)
# Group 3: CorrectLexdec and interactions with WrittenFrequency,
#          non-linearity of WrittenFrequency (rcs)

help(english)

# Group 1
l4.1 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
              CV + Voice + Obstruent,
            data = e, x = TRUE, y = TRUE)
l4.1
vif(l4.1)

with(e, table(Voice, Obstruent, Frication))
l4.1.cv <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
                 CV,
               data = e, x = TRUE, y = TRUE)
l4.1.cv
l4.1.cvv <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
                  CV + Voice,
                data = e, x = TRUE, y = TRUE)
l4.1.cvv
anova(l4.1.cvv)
validate(l4.1.cvv, bw = TRUE, B = 200)

l4.1.v <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
                Voice,
              data = e, x = TRUE, y = TRUE)
l4.1.v  # BUGFIX: the original printed l4.1.cv here, not the model just fit

# Group 2
l4.2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
              FrequencyInitialDiphoneWord + FrequencyInitialDiphoneSyllable,
            data = e, x = TRUE, y = TRUE)
l4.2
vif(l4.2)
fastbw(l4.2)

# Group 3
str(e$CorrectLexdec)
l4.3 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
              CorrectLexdec,
            data = e, x = TRUE, y = TRUE)
l4.3
anova(l4.3)

# interaction of centered log frequency with centered CorrectLexdec
l4.3.int <- ols(RTlexdec ~ AgeSubject +
                  as.numeric(scale(log(WrittenFrequency), scale = FALSE)) *
                  as.numeric(scale(CorrectLexdec, scale = FALSE)),
                data = e, x = TRUE, y = TRUE)
l4.3.int
vif(l4.3.int)
anova(l4.3.int)

# new datadist for e, since WrittenFrequency was back-transformed above
dd <- datadist(e)
options(datadist = "dd")
l4.3.rcs <- ols(RTlexdec ~ AgeSubject + rcs(WrittenFrequency, 3) +
                  CorrectLexdec,
                data = e, x = TRUE, y = TRUE)
l4.3.rcs
par(mfrow = c(2, 2))
plot(l4.3.rcs)
par(mfrow = c(1, 1))  # restore layout (the original leaked the 2x2 grid)

# Step 2: (again building on l2)
# Group 1: NounFrequency, VerbFrequency, or both?
# Group 2: FamilySize
# Group 3: DerivationalEntropy, InflectionalEntropy

# Step 3: combine insights from Step 2 and assess to what extent the partial
# effects hold in the full model
# (FIX: the second line above had lost its leading '#', which was a
# syntax error)

# Step 4: (building on output of step 3)
# Group 1: Does context-sensitive probability matter (MeanBigramFrequency)
#          beyond frequency?
# Group 2: Frequency vs. subjective Familiarity

## ------------------------------------------------------------

# 1) understand the predictors, i.e. write/think about how you would
#    write up a description of the variable, its levels/transform, etc.
#    and maybe outstanding distributional properties
#
# 2) enter the predictor(s) into the model and
#    (a) assess the direction and possible significance of their effect
#    (b) assess how they improve the model (partial R-square, AIC
#        associated with removal, etc.)
#    (c) if the predictor is continuous, plot the effects
#    (d) validate the model
#        (d-1) validate(), calibrate()
#        (d-2) influence() to assess influence of individual data points

# dfbetas() works with lm(), not ols()
l2.lm <- lm(RTlexdec ~ AgeSubject + log(WrittenFrequency),
            data = e)
plot(dfbetas(l2.lm))
# interactive: click near points to label them (Esc / right-click to stop)
identify(locator())

help(influence.measures)

# additionally
# 3) for continuous predictors: look at the data to see
#    whether a non-linear effect may be justified. If you applied a
#    transformation, is it justified? (also consider plot(), curve()
#    to plot the regression predictions on top of a scatterplot of the
#    data).
#
# 4) are there possible interactions? Think about how you would code them.
#    Beware: interactions almost always bring up the issue of collinearity

# the complete model

# ---- orthogonalize orthographic consistency measures
# PCA on the young-subject rows only; columns 18:27 are assumed to hold
# the orthographic consistency measures -- TODO confirm via help(english).
items <- english[english$AgeSubject == "young", ]
# FIX: spell out 'scale.' (prcomp's formal argument); the original's
# 'scale = TRUE' only worked via partial argument matching.
items.pca <- prcomp(items[, 18:27], center = TRUE, scale. = TRUE)
x <- as.data.frame(items.pca$rotation[, 1:4])  # loadings, for inspection
items$PC1 <- items.pca$x[, 1]
items$PC2 <- items.pca$x[, 2]
items$PC3 <- items.pca$x[, 3]
items$PC4 <- items.pca$x[, 4]
items2 <- english[english$AgeSubject != "young", ]
# NOTE(review): reusing the young-subject scores here assumes the
# old-subject rows are the same words in the same order -- confirm
# for this data set before trusting PC1..PC4 on items2.
items2$PC1 <- items.pca$x[, 1]
items2$PC2 <- items.pca$x[, 2]
items2$PC3 <- items.pca$x[, 3]
items2$PC4 <- items.pca$x[, 4]
english <- rbind(items, items2)

# ---- add Noun-Verb frequency ratio (+1 smoothing avoids log(0))
english$NVratio <- log(english$NounFrequency + 1) - log(english$VerbFrequency + 1)

english.dd <- datadist(english)
options(datadist = "english.dd")

english.ols <- ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency +
                     rcs(WrittenFrequency, 5) +
                     rcs(WrittenSpokenFrequencyRatio, 3) +
                     NVratio + WordCategory + AgeSubject +
                     rcs(FamilySize, 3) + InflectionalEntropy +
                     NumberComplexSynsets +
                     rcs(WrittenFrequency, 5) : AgeSubject,
                   data = english, x = TRUE, y = TRUE)
# end of lexdecRT.R