attachment:lexdecRT.R of HLPMiniCourseSession2

Attachment 'lexdecRT.R'

   1 library(languageR)
   2 data(english)
   3 
   4 str(english)
   5 help(english)
   6 
   7 # Data from: Balota, D., Cortese, M., Sergent-Marshall, S., Spieler, D. and Yap, M. (2004) Visual word recognition for single-syllable words, Journal of Experimental Psychology:General, 133, 283-316. 
   8 #
   9 # For best model, see Baayen, R.H., Feldman, L. and Schreuder, R. (2006) Morphological influences on the recognition of monosyllabic monomorphemic words, Journal of Memory and Language, 53, 496-512. 
  10 
  11 library(Design)
  12 english.dd = datadist(english)
  13 options(datadist = 'english.dd')
  14 
  15 # The data contain average lexical decision times on 2284 English word
  16 # taken from both young and old subjects
  17 
  18 # Question 1: Does age affect RTs?
  19 #     Step 1: Understand age variable
  20 summary(english$AgeSubject)
  21 with(english, table(AgeSubject, Word))
  22 with(english, table(AgeSubject, paste(Word, WordCategory, sep=".")))
  23 
  24 # We would predict that older subjects have slower RTs
  25 l1 = ols(RTlexdec ~ AgeSubject,
  26    data = english)
  27 
  28 # default: test="F"
  29 anova(l1) 
  30 anova(l1, test="Chisq")
  31 
  32 # ok, so Age matters, but how?
  33 print(l1)
  34 
  35 # what does this effect translate into? 
  36 # i.e. how many msecs faster are young subjects on average?
  37 # what else would be want to say in a summary of this result?
  38 
  39 
  40 # Age has a strong effect on log RT, what about word frequency?
  41 # We would expect that more frequent words are faster to decide on
  42 e <- english
  43 
  44 # just for entertainment, let's back-transform CELEX frequencies
  45 # to the original counts (we will use the log below)
  46 e$WrittenFrequency <- exp(e$WrittenFrequency)
  47 par(mfrow=c(1,2))
  48 hist(e$WrittenFrequency, breaks=20, main="CELEX frequencies")
  49 hist(log(e$WrittenFrequency), breaks=20, main="CELEX log frequencies")
  50 par(mfrow=c(1,1))
  51 
  52 l2 = ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
  53    data = e)
  54 l2
  55 
  56 # (1) how would you describe the frequency effect? 
  57 #     think about high vs. low values (use quantiles())
  58 # (2) how would you summarize the model? What *exactly* do the coefficients mean?
  59 
  60 # the frequency coefficient goes in the right direction
  61 # judging from the summary of the model, do you think the 
  62 # frequency and age effect are independent of each other?
  63 # or are they collinear? 
  64 # (think about l1 vs. l2 and the coefficients)
  65 vif(l2)
  66 # also:kappa(l2, exact=T)
  67 
  68 # what influences lexical decision times more, age or frequency?
  69 plot(anova(l2), what="aic")
  70 plot(anova(l2), what="partial R2")
  71 plot(anova(l2), what="remaining R2")
  72 plot(anova(l2), what="proportion R2")
  73 
  74 # plot the partial effects as predicted by the model
  75 par(mfrow = c(4, 3), mar = c(4, 4, 1, 1), oma = rep(1, 4))
  76 plot(l2, adj.subtitle = FALSE, ylim = c(6.4, 7), conf.int = FALSE)
  77 par(mfrow = c(1, 1))
  78 
  79 # ---- validate the model
  80 validate(l2, bw = TRUE, B = 200)
  81 # creates an error because we need to save the 
  82 # x (design matrix) and y (response) when we fit 
  83 # the model
  84 l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
  85    data = e, x=T, y=T)
  86 summary(l2$y)
  87 summary(l2$x)
  88 validate(l2, method="crossvalidation", bw = TRUE, B = 200)
  89 
  90 # plot predicted vs. actual outcome to see where the model is good/bad
  91 plot(calibrate(l2, B= 40))
  92 
  93 
  94 ## ------------------------------------------------------------
  95 # Backing up: You're at a conference and somebody asks you 
  96 # whether the log-transform of RTs is justfied. How do you 
  97 # determine this?
  98 ## ------------------------------------------------------------
  99 
 100 
 101 
 102 
 103 # (1) comparison of a priori distribution
 104 hist(e$RTlexdec)
 105 hist(exp(e$RTlexdec))
 106 # also use shapiro.test()
 107 
 108 # (2) comparison of qualities of models fit to that dependent variable
 109 l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
 110    data = e, x=T, y=T)
 111 l2$stats["R2"]
 112 l2.exp$stats["R2"]
 113 
 114 # (3) comparison of residuals of models fit to that dependent variable
 115 hist(resid(l2))
 116 qqnorm(resid(l2))
 117 qqline(resid(l2), col="blue")
 118 
 119 l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
 120    data = e, x=T, y=T)
 121 hist(resid(l2.exp))
 122 qqnorm(resid(l2.exp))
 123 qqline(resid(l2.exp), col="blue")
 124 plot(calibrate(l2.exp, B= 40))
 125 
 126 
 127 ## ------------------------------------------------------------
 128 # Further predictors
 129 # 
 130 # Step 1: (building on l2)
 131 # Group 1: Phonology 1 (CV, voice, obstruent, frication)
 132 # Group 2: Phonology 2 (FrequencyInitialDiphoneWord, FrequencyInitialDiphoneSyllable)
 133 # Group 3: CorrectLexdec and interactions with WrittenFrequency, non-linearity of WrittenFrequency (rcs)
 134 
 135 # Step 2: (again building on l2)
 136 # Group 1: NounFrequency, VerbFrequency, or both?
 137 # Group 2: FamilySize
 138 # Group 3: DerivationalEntropy, InflectionalEntropy
 139 
 140 # Step 3: combine insights from Step2 and assess to what extent the partial
 141           effects hold in the full model
 142 
 143 # Step 4: (building on output of step 3)
 144 # Group 1: Does context-sensitive probability matter (MeanBigramFrequency) beyond frequency?
 145 # Group 2: Frequency vs. subjective Familiarity
 146 
 147 ## ------------------------------------------------------------
 148 
 149 # 1) understand the predictors, i.e. write/think about how you would 
 150 #    write up a description of the variable, its levels/transform, etc.
 151 #    and maybe outstanding distributional properties
 152 #
 153 # 2) enter the predictors(s) into the model and
 154 #    (a) assess the direction and possible significance of their effect
 155 #    (b) assess how they improve the model (partial R-square, AIC 
 156 #        associated with removal, etc.)
 157 #    (c) if the predictor is continuous, plot the effects 
 158 #    (d) validate the model 
 159 #    (d-1) validate(), calibrate()
 160 #    (d-2) influence() to assess influence of individual data points
 161 
 162 # works with lm(), not ols()
 163 l2.lm <- lm(RTlexdec ~ AgeSubject + log(WrittenFrequency),
 164    data = e)
 165 plot(dfbetas(l2.lm))
 166 identify(locator())
 167 
 168 help(influence.measures)
 169 
 170 # additionally 
 171 # 3) for continuous predictors: look at the data to see 
 172 #    whether a non-linear effect may be justfied? If you applied a 
 173 #    transformation, is it justified? (also consider, plot(), curve()
 174 #    to plot the regression predictions on top of a scatterplot of the
 175 #    data).
 176 #
 177 # 4) are their possible interactions? think about how you would code them.
 178 #    Beware: interactions almost always will bring up the issue of collinearity
 179 
 180 
 181 
 182 
 183 
 184 e.ols = ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency + 
 185    rcs(WrittenFrequency, 5) + rcs(WrittenSpokenFrequencyRatio, 3) + 
 186    NVratio + WordCategory + AgeSubject +
 187    rcs(FamilySize, 3) + InflectionalEntropy + 
 188    NumberComplexSynsets + rcs(WrittenFrequency, 5) : AgeSubject,
 189    data = english, x = TRUE, y = TRUE)
 190 
 191 # ---- orthogonalize orthographic consistency measures
 192 items = english[english$AgeSubject == "young",]
 193 items.pca = prcomp(items[ , c(18:27)], center = TRUE, scale = TRUE)
 194 x = as.data.frame(items.pca$rotation[,1:4])
 195 items$PC1 =  items.pca$x[,1]
 196 items$PC2 =  items.pca$x[,2]
 197 items$PC3 =  items.pca$x[,3]
 198 items$PC4 =  items.pca$x[,4]
 199 items2 = english[english$AgeSubject != "young", ]
 200 items2$PC1 =  items.pca$x[,1]
 201 items2$PC2 =  items.pca$x[,2]
 202 items2$PC3 =  items.pca$x[,3]
 203 items2$PC4 =  items.pca$x[,4]
 204 english = rbind(items, items2) 
 205 
 206 # ---- add Noun-Verb frequency ratio
 207 english$NVratio = log(english$NounFrequency+1)-log(english$VerbFrequency+1)
 208 
 209 english.dd = datadist(english)
 210 options(datadist = 'english.dd')
 211 
 212 english.ols = ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency + 
 213    rcs(WrittenFrequency, 5) + rcs(WrittenSpokenFrequencyRatio, 3) + 
 214    NVratio + WordCategory + AgeSubject +
 215    rcs(FamilySize, 3) + InflectionalEntropy + 
 216    NumberComplexSynsets + rcs(WrittenFrequency, 5) : AgeSubject,
 217    data = english, x = TRUE, y = TRUE)
Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
You are not allowed to attach a file to this page.