# Load languageR, which ships the `english` lexical-decision data set
# used throughout this script.
library(languageR)
data(english)

# Inspect structure and documentation of the data set.
str(english)
help(english)

# Data from: Balota, D., Cortese, M., Sergent-Marshall, S., Spieler, 
# D. and Yap, M. (2004) Visual word recognition for single-syllable 
# words, Journal of Experimental Psychology:General, 133, 283-316. 
#
# For best model, see Baayen, R.H., Feldman, L. and Schreuder, R. (2006) 
# Morphological influences on the recognition of monosyllabic monomorphemic 
# words, Journal of Memory and Language, 53, 496-512. 

# NOTE(review): the Design package has been archived on CRAN and superseded
# by `rms` (same author; ols/datadist/validate/calibrate live there now).
# Consider migrating, but beware that some plot() methods differ in rms.
library(Design)
# datadist() records each predictor's distribution so Design's
# plot()/summary() methods know sensible ranges; it must be registered
# globally via options(datadist = ...).
english.dd = datadist(english)
options(datadist = 'english.dd')

# The data contain average lexical decision times on 2284 English words
# taken from both young and old subjects

# Question 1: Does age affect RTs?
#     Step 1: Understand the age variable and how it crosses with items
summary(english$AgeSubject)
with(english, table(AgeSubject, Word))
with(english, table(AgeSubject, paste(Word, WordCategory, sep = ".")))

# We would predict that older subjects have slower RTs
l1 <- ols(RTlexdec ~ AgeSubject, data = english)

# default: test="F"
anova(l1)
anova(l1, test = "Chisq")

# ok, so Age matters, but how?
print(l1)

# What does this effect translate into?
# i.e. how many msecs faster are young subjects on average?
# What else would we want to say in a summary of this result?


# Age has a strong effect on log RT, what about word frequency?
# We would expect that more frequent words are faster to decide on
e <- english

# Just for entertainment: back-transform the CELEX log frequencies to the
# original counts (the models below will take the log again)
e$WrittenFrequency <- exp(e$WrittenFrequency)

# Compare the raw and log-scale frequency distributions side by side
par(mfrow = c(1, 2))
hist(e$WrittenFrequency, breaks = 20, main = "CELEX frequencies")
hist(log(e$WrittenFrequency), breaks = 20, main = "CELEX log frequencies")
par(mfrow = c(1, 1))

# Add log frequency to the age-only model
l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency), data = e)
print(l2)

# (1) how would you describe the frequency effect? 
#     think about high vs. low values (use quantile())
# (2) how would you summarize the model? What *exactly* do the coefficients mean?

# the frequency coefficient goes in the right direction
# judging from the summary of the model, do you think the 
# frequency and age effect are independent of each other?
# or are they collinear? 
# (think about l1 vs. l2 and the coefficients)
# variance inflation factors: values near 1 mean the two predictors are
# essentially uncorrelated in this design
vif(l2)
# also: kappa(l2, exact = TRUE) -- condition number of the design matrix,
# another collinearity diagnostic

# what influences lexical decision times more, age or frequency?
# plot() on the anova object ranks model terms by several criteria:
plot(anova(l2), what="aic")
plot(anova(l2), what="partial R2")
plot(anova(l2), what="remaining R2")
plot(anova(l2), what="proportion R2")

# plot the partial effects as predicted by the model
# (one panel per predictor; the y-axis is log RT)
par(mfrow = c(4, 3), mar = c(4, 4, 1, 1), oma = rep(1, 4))
plot(l2, adj.subtitle = FALSE, ylim = c(6.4, 7), conf.int = FALSE)
par(mfrow = c(1, 1))

# Interaction model: sum-coded age (+1 = young, -1 = old) crossed with
# mean-centered log frequency, so the main effects stay interpretable and
# the interaction term is less collinear with them.
# FIX: the original wrote e$AgeSubject inside the formula; formula variables
# should be resolved from `data` so the model stays valid under subsetting
# and prediction (and so datadist-based methods work).
l2.int <- ols(RTlexdec ~ I(ifelse(AgeSubject == "young", 1, -1)) *
    as.numeric(scale(log(WrittenFrequency), scale = FALSE)),
  data = e)
l2.int


# ---- validate the model
# This first call fails ON PURPOSE (pedagogical): validate() needs the
# design matrix and response stored inside the fit object.
validate(l2, bw = TRUE, B = 200)
# creates an error because we need to save the 
# x (design matrix) and y (response) when we fit 
# the model
# FIX: spell out TRUE instead of the reassignable shorthand T.
l2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency),
  data = e, x = TRUE, y = TRUE)
summary(l2$y)  # the stored response (log RTs)
summary(l2$x)  # the stored design matrix columns
validate(l2, method = "crossvalidation", bw = TRUE, B = 200)

# plot predicted vs. actual outcome to see where the model is good/bad
plot(calibrate(l2, B = 40))


## ------------------------------------------------------------
# Backing up: You're at a conference and somebody asks you
# whether the log-transform of RTs is justified. How do you
# determine this?
## ------------------------------------------------------------


# (1) comparison of a priori distribution
par(mfrow = c(1, 2))
hist(e$RTlexdec)       # log RTs (roughly symmetric)
hist(exp(e$RTlexdec))  # raw RTs (right-skewed)
# also use shapiro.test()
par(mfrow = c(1, 1))

# (2) comparison of qualities of models fit to that dependent variable
l2.exp <- ols(exp(RTlexdec) ~ AgeSubject + log(WrittenFrequency),
  data = e, x = TRUE, y = TRUE)
l2$stats["R2"]
l2.exp$stats["R2"]

# (3) comparison of residuals of models fit to that dependent variable
hist(resid(l2))
qqnorm(resid(l2))
qqline(resid(l2), col = "blue")

# FIX: the original refit the identical l2.exp model here; it is already
# fitted above, so just inspect its residuals.
hist(resid(l2.exp))
qqnorm(resid(l2.exp))
qqline(resid(l2.exp), col = "blue")
plot(calibrate(l2.exp, B = 40))


## ------------------------------------------------------------
# Further predictors
# 
# Step 1: (building on l2)
# Group 1: Phonology 1 (CV, voice, obstruent, frication)
# Group 2: Phonology 2 (FrequencyInitialDiphoneWord, FrequencyInitialDiphoneSyllable)
# Group 3: CorrectLexdec and interactions with WrittenFrequency, non-linearity of WrittenFrequency (rcs)

help(english)

# Group 1: onset phonology predictors
l4.1 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
	CV + Voice + Obstruent
	, data = e, x = TRUE, y = TRUE)
l4.1
vif(l4.1)

# cross-tabulate the three phonology factors to see how entangled they are
with(e, table(Voice, Obstruent, Frication))
l4.1.cv <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
	CV
	, data = e, x = TRUE, y = TRUE)
l4.1.cv
l4.1.cvv <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
	CV + Voice
	, data = e, x = TRUE, y = TRUE)
l4.1.cvv
anova(l4.1.cvv)
validate(l4.1.cvv, bw = TRUE, B = 200)

l4.1.v <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
	Voice
	, data = e, x = TRUE, y = TRUE)
# FIX: the original printed l4.1.cv again here (copy-paste slip);
# print the model that was just fitted.
l4.1.v

# Group 2: initial-diphone frequency measures
l4.2 <- ols(RTlexdec ~ AgeSubject + log(WrittenFrequency) +
	FrequencyInitialDiphoneWord + FrequencyInitialDiphoneSyllable
	, data = e, x = TRUE, y = TRUE)
l4.2
vif(l4.2)     # the two diphone measures are likely collinear
fastbw(l4.2)  # fast backward elimination of redundant terms

# Group 3: accuracy (CorrectLexdec) and non-linear frequency
str(e$CorrectLexdec)
l4.3 <- ols(RTlexdec ~ (AgeSubject + log(WrittenFrequency) +
	CorrectLexdec)
	, data = e, x = TRUE, y = TRUE)
l4.3
anova(l4.3)

# center both continuous predictors before forming the interaction,
# which reduces collinearity with the main effects
l4.3.int <- ols(RTlexdec ~ AgeSubject + as.numeric(scale(log(WrittenFrequency), scale = FALSE)) *
	as.numeric(scale(CorrectLexdec, scale = FALSE))
	, data = e, x = TRUE, y = TRUE)
l4.3.int
vif(l4.3.int)
anova(l4.3.int)

# rcs() plotting needs a datadist for e so predictor ranges are known
dd <- datadist(e)
options(datadist = 'dd')
l4.3.rcs <- ols(RTlexdec ~ AgeSubject + rcs(WrittenFrequency, 3) +
	CorrectLexdec
	, data = e, x = TRUE, y = TRUE)
l4.3.rcs
par(mfrow = c(2, 2))
plot(l4.3.rcs)
# FIX: the original left the device in 2x2 layout; reset it like every
# other plotting section in this script does.
par(mfrow = c(1, 1))

# Step 2: (again building on l2)
# Group 1: NounFrequency, VerbFrequency, or both?
# Group 2: FamilySize
# Group 3: DerivationalEntropy, InflectionalEntropy

# Step 3: combine insights from Step 2 and assess to what extent the partial
#         effects hold in the full model

# Step 4: (building on output of step 3)
# Group 1: Does context-sensitive probability matter (MeanBigramFrequency) beyond frequency?
# Group 2: Frequency vs. subjective Familiarity

## ------------------------------------------------------------

# 1) understand the predictors, i.e. write/think about how you would 
#    write up a description of the variable, its levels/transform, etc.
#    and maybe outstanding distributional properties
#
# 2) enter the predictors(s) into the model and
#    (a) assess the direction and possible significance of their effect
#    (b) assess how they improve the model (partial R-square, AIC 
#        associated with removal, etc.)
#    (c) if the predictor is continuous, plot the effects 
#    (d) validate the model 
#    (d-1) validate(), calibrate()
#    (d-2) influence() to assess influence of individual data points

# works with lm(), not ols()
# (dfbetas() needs an lm-class fit, so refit the same model with lm())
l2.lm <- lm(RTlexdec ~ AgeSubject + log(WrittenFrequency),
   data = e)
# one dfbetas column per coefficient: how much each observation shifts
# each estimate, in standard-error units
plot(dfbetas(l2.lm))
# interactive: click on the plot to label points
# NOTE(review): identify(locator()) is unusual -- identify() is normally
# called on the plotted coordinates directly; confirm this labels the
# intended observations before relying on it
identify(locator())

help(influence.measures)

# additionally 
# 3) for continuous predictors: look at the data to see 
#    whether a non-linear effect may be justified? If you applied a 
#    transformation, is it justified? (also consider plot(), curve()
#    to plot the regression predictions on top of a scatterplot of the
#    data).
#
# 4) are there possible interactions? think about how you would code them.
#    Beware: interactions almost always will bring up the issue of collinearity

# the complete model

# ---- orthogonalize orthographic consistency measures
# PCA over the ten orthographic consistency columns (18:27), fitted on the
# "young" rows only (each word appears once per age group, so this is one
# row per word).
items = english[english$AgeSubject == "young",]
# center/scale = TRUE: PCA on the correlation matrix of the 10 measures
items.pca = prcomp(items[ , c(18:27)], center = TRUE, scale = TRUE)
# loadings of the first four components (kept for inspection; unused below)
x = as.data.frame(items.pca$rotation[,1:4])
# attach the first four component scores as new item-level predictors
items$PC1 =  items.pca$x[,1]
items$PC2 =  items.pca$x[,2]
items$PC3 =  items.pca$x[,3]
items$PC4 =  items.pca$x[,4]
# the "old" half gets the same per-word scores
# NOTE(review): this assumes the old-subject rows are in exactly the same
# word order as the young-subject rows -- verify before reusing this code
items2 = english[english$AgeSubject != "young", ]
items2$PC1 =  items.pca$x[,1]
items2$PC2 =  items.pca$x[,2]
items2$PC3 =  items.pca$x[,3]
items2$PC4 =  items.pca$x[,4]
# recombine; note this reorders rows (young block first) and OVERWRITES
# the original `english` data frame for the rest of the session
english = rbind(items, items2) 

# ---- add Noun-Verb frequency ratio
# +1 smoothing avoids log(0) for words unattested as noun or as verb
english$NVratio = log(english$NounFrequency+1)-log(english$VerbFrequency+1)

# refresh the datadist so the new columns (PC1-PC4, NVratio) are known
# to Design's plotting/summary machinery
english.dd = datadist(english)
options(datadist = 'english.dd')

# Final model: restricted cubic splines (rcs) for non-linear continuous
# predictors, plus a frequency-by-age interaction; x/y stored for
# validate()/calibrate().
english.ols = ols(RTlexdec ~ Voice + PC1 + MeanBigramFrequency + 
   rcs(WrittenFrequency, 5) + rcs(WrittenSpokenFrequencyRatio, 3) + 
   NVratio + WordCategory + AgeSubject +
   rcs(FamilySize, 3) + InflectionalEntropy + 
   NumberComplexSynsets + rcs(WrittenFrequency, 5) : AgeSubject,
   data = english, x = TRUE, y = TRUE)
