## 	Benjamin Van Durme, vandurme@cs.rochester.edu,  4 Jun 2008
## 	Time-stamp: <2008-06-06 15:19:16 vandurme>

## setup environment
library(languageR)
library(Design)

########################################
## Baayen 6.1

data(english)

## Create english2 as according to the text

items = english[english$AgeSubject=="young",]
items.pca = prcomp(items[,c(18:27)],center=T,scale=T)
x= as.data.frame(items.pca$rotation[,1:4])
items$PC1 = items.pca$x[,1] 
items$PC2 = items.pca$x[,2] 
items$PC3 = items.pca$x[,3] 
items$PC4 = items.pca$x[,4] 
items2 = english[english$AgeSubject != "young", ] 
items2$PC1 = items.pca$x[,1] 
items2$PC2 = items.pca$x[,2] 
items2$PC3 = items.pca$x[,3] 
items2$PC4 = items.pca$x[,4] 
english2 = rbind(items, items2)

# english2$NVratio = log(english2$NounFrequency+1) - log(english2$VerbFrequency+1) 

## Create datadist object to inform Design of ranges for plots

english2.dd = datadist(english2)
options(datadist = "english2.dd")

## Look at WrittenFrequency compared to lexdec

plot(english2$RTlexdec, english2$WrittenFrequency)

## It looks there are two distributions there, perhaps old vs young?

with(english2[english2$AgeSubject == "old",],plot(WrittenFrequency, RTlexdec,ylim=c(6,7.5)))
with(english2[english2$AgeSubject == "young",],points(WrittenFrequency, RTlexdec, col="blue"))
 

## Fit a model as according to the instructions

model1.0 = ols( RTlexdec ~ AgeSubject + WrittenFrequency + PC1, data = english2)
model2 = ols( RTnaming ~ AgeSubject + WrittenFrequency + PC1, data = english2)

## interaction between WrittenFrequency and AgeSubject
model1.1 = ols( RTlexdec ~ AgeSubject + WrittenFrequency + PC1 + WrittenFrequency * AgeSubject, data = english2)

par(mfrow=c(2,2))
plot(model1.1)
par(mfrow=c(1,1))

## add restricted cubic spline on WrittenFrequency
model1.2 = ols( RTlexdec ~  AgeSubject * (rcs(WrittenFrequency, 3) + PC1), data = english2)

par(mfrow=c(2,2))
plot(model1.2)
par(mfrow=c(1,1))

## PC1 is not significant in the above model, why? The assignment suggests a
## 3knot rcs to make it non-linear

model1.3 = ols( RTlexdec ~  AgeSubject * (rcs(WrittenFrequency, 3) + rcs(PC1,3)), data = english2)

par(mfrow=c(2,2))
plot(model1.3)
par(mfrow=c(1,1))


## As compared to model1.2, the fit using an rcs on PC1 is better, making PC1 a
## more significant predictor. Based only on help(english), I'm not clear what
## the orthographic variables were that we performed PCA on, so I have little
## intuition on why PC1 provides for a better fit when allowed to be non-linear.


############################################################
## Baayen 6.8

data(imaging)
str(imaging)

imaging.dd = datadist(imaging)
options(datadist = "imaging.dd")

plot(imaging$BehavioralScore, imaging$FilteredSignal, ylim=c(0,100))

model6.8.1 = ols( FilteredSignal ~ BehavioralScore, data = imaging)
plot(model6.8.1)

## The above plot is worrisome; the confidence intervals are wide enough to
## allow for a non-increasing mapping between behavioralscore and
## filteredsignal.