###########################################################
# 
# Below, empty lines correspond to slide breaks in the 
# presentation. I've also marked the major sections breaks.
#
# For questions, contact Florian Jaeger, 
#                tiflo@bcs.rochester.edu
#
# Also consider, subscribing to R-lang, an email list for
# language researchers working in R 
# (http://pidgin.ucsd.edu/mailman/listinfo/r-lang)
#
# you can get help on any command by typing ?command into
# the R GUI.
#
# Lines of the script are executed with CTRL-R (Windows) or
# APPLE-ENTER (Mac OS)
#
###########################################################

## --------------------------------------------------------
# Functions (run first)
## --------------------------------------------------------
center <- function(x) { return(x - mean(x, na.rm=T)) }

## --------------------------------------------------------
# Change to your working directory
## --------------------------------------------------------
setwd("C:\\Documents and Settings\\florian\\My Documents\\My LabMeetings, Tutorials, & Teaching\\Regression Workshops\\PennState\\figs")

## --------------------------------------------------------
# Lexical decision time data
## --------------------------------------------------------
library(languageR)
head(lexdec[,c(1,2,5,10,11)])

?lexdec
summary(lexdec)
str(lexdec)
names(lexdec)


## --------------------------------------------------------
# Ordinary Linear Regression - An example
## --------------------------------------------------------
l = glm(RT ~ 1 + Frequency, data=lexdec, 
	family="gaussian")
summary(l)
sqrt(summary(l)[["dispersion"]])

## --------------------------------------------------------
# Ordinary Linear Regression - Geometric intuitions
## --------------------------------------------------------
library(languageR)
library(Design)

l.lexdec0 = lm(RT ~ 1, data=lexdec)
summary(l.lexdec0)
plot(x=lexdec$RT, 
	xlab="Case Index", 
	ylab="Response latency (in log-transformed msecs)", 
	main="Predicting Lexical Decision RTs")
abline(coef(l.lexdec0)[1], 0, col="orange", lwd=3)

l.lexdec1 = lm(RT ~ 1 + Frequency, data=lexdec)
plot(x=lexdec$Frequency, y=lexdec$RT, 
	xlab="Word Frequency (log-transformed)", 
	ylab="Response latency (in log-transformed msecs)", 
	main="Predicting Lexical Decision RTs")
abline(coef(l.lexdec1), col="orange", lwd=3)

plot(x=exp(lexdec$Frequency), y=exp(lexdec$RT), 
	xlab="Word Frequency", 
	ylab="Response latency (in msecs)", 
	main="Predicting Lexical Decision RTs")
curve(exp(coef(l.lexdec1)[1] + 
	coef(l.lexdec1)[2]*log(x)), 
	col="orange", lwd=3, add=T)
# see also the plot.Design function
# for which you have to fit the model with ols()

library(scatterplot3d)
l.lexdec2 = lm(RT ~ 1 + Frequency + FamilySize, data=lexdec)
summary(l.lexdec2)

s = scatterplot3d(x=lexdec$Frequency, y=lexdec$FamilySize, 
	z=lexdec$RT,
	xlab="Word Frequency (log-transformed)", 
	ylab="Number of morph. family members (log-transformed)",
	zlab="Response latency (in log-transformed msecs)", 
	main="Predicting Lexical Decision RTs")
s$plane3d(l.lexdec2, col= "orange", lwd=2, lty.box = "solid")

l.lexdec3 = lm(RT ~ 1 + Frequency + FamilySize + 
	NativeLanguage, data=lexdec)
s = scatterplot3d(x=lexdec$Frequency, y=lexdec$FamilySize, 
	z=lexdec$RT,
	xlab="Word Frequency (log-transformed)", 
	ylab="Number of morph. family members (log-transformed)",
	zlab="Response latency (in log-transformed msecs)", 
	main="Predicting Lexical Decision RTs",
	sub="Native Speakers (red) and\nNon-Native Speakers (blue)")
s$plane3d(coef(l.lexdec3)[1], 
	coef(l.lexdec3)[2], coef(l.lexdec3)[3], 
	col= "red", lwd=2, 
	lty.box = "solid")
s$plane3d(coef(l.lexdec3)[1] + coef(l.lexdec3)[4], 
	coef(l.lexdec3)[2], coef(l.lexdec3)[3], 
	col= "orange", lwd=2, 
	lty.box = "solid")

l.lexdec4 = lm(RT ~ 1 + Frequency + FamilySize + 
	NativeLanguage + NativeLanguage:Frequency, 
	data=lexdec)
summary(l.lexdec4)
s = scatterplot3d(x=lexdec$Frequency, y=lexdec$FamilySize, 
	z=lexdec$RT,
	xlab="Word Frequency (log-transformed)", 
	ylab="Number of morph. family members (log-transformed)",
	zlab="Response latency (in log-transformed msecs)", 
	main="Predicting Lexical Decision RTs",
	sub="Interaction with Native Speakers (red) and\nNon-Native Speakers (blue)")
s$plane3d(coef(l.lexdec4)[1], 
	coef(l.lexdec4)[2], coef(l.lexdec4)[3], 
	col= "red", lwd=2, 
	lty.box = "solid")
s$plane3d(coef(l.lexdec4)[1] + coef(l.lexdec4)[4], 
	coef(l.lexdec4)[2] + coef(l.lexdec4)[5], 
	coef(l.lexdec4)[3], 
	col= "orange", lwd=2, 
	lty.box = "solid")

## --------------------------------------------------------
# Random Effects
## --------------------------------------------------------
setwd("C:\\Documents and Settings\\florian\\My Documents\\My LabMeetings, Tutorials, & Teaching\\Regression Workshops\\PennState\\figs")
library(languageR)

o = exp(as.vector(tapply(lexdec$RT, lexdec$Subject, mean)))
plot(o, type = "n", xlab = "Subject Index", 
	ylab = "Response latency in msecs", 
	main = "Subject as random effect")
text(o, as.character(unique(lexdec$Subject)), col = "blue")
abline(mean(exp(lexdec$RT)),0, col = "gray", lwd=2)

lmer.lexdec0 = lmer(RT ~ 1 + Frequency + 
	(1 | Subject), data=lexdec)

cor(fitted(lmer.lexdec0), lexdec$RT)^2

pvals.fnc(lmer.lexdec0, nsim = 10000)

ranef(lmer.lexdec0)

p = exp(as.vector(unlist(coef(lme.lexdec0)$Subject)))
text(p, as.character(unique(lexdec$Subject)), col = "red")
legend(x=2, y=850, legend=c("Predicted", "Observed"), 
	col=c("blue", "red"), pch=1)

lmer.lexdec1 = lmer(RT ~ 1 + (1 | Subject) + (1 | Word), 
	data=lexdec)
o = exp(as.vector(tapply(lexdec$RT, lexdec$Word, mean, na.rm=T)))
p = exp(as.vector(unlist(coef(lmer.lexdec1)$Word)))
plot(o, type = "n", xlab = "Word Index", 
	ylab = "Response latency in msecs", 
	main = "Word as random effect")
abline(mean(exp(lexdec$RT)),0, col = "gray", lwd=2)
text(o, as.character(unique(lexdec$Word)), col = "blue")
segments(x0=1:length(o), y0=o, x1=1:length(p), 
	y1=p, col="red", lwd=2)
points(p, pch=ifelse(p > o, "^", "v"), col="red", lwd=2)

lmer.lexdec2 = lmer(RT ~ 1 + Frequency +
	(1 | Subject) + (0 + Frequency | Subject) + 
	(1 | Word), 
	data=lexdec)

ranef(lmer.lexdec2)

lmer.lexdec3 = lmer(RT ~ 1 + Frequency +
	(1 + Frequency | Subject) + (1 | Word), 
	data=lexdec)

plot(ranef(lmer.lexdec3)$Subject, 
	main="Random Effect Correlation")


# for all models
dotplot(ranef(lmer.lexdec3))
qqmath(ranef(lmer.lexdec3))

# currently only for models with relatively
# simple random effects structures
dotplot(ranef(lmer.lexdec3, postVar=TRUE))
dotplot(ranef(lmer.lexdec3, postVar=TRUE), 
	scales = list(x = 
		list(relation = 'free')))[["Subject"]]

qqmath(ranef(lmer.lexdec3, postVar=TRUE))

# To see what the postVar=TRUE option does, see ?ranef, or look at the
# code returned by getMethod(ranef, "mer").  There are also some examples
# on the ranef help page.

anova(lmer.lexdec3, lmer.lexdec2)
anova(lmer.lexdec3, lmer.lexdec3.simple = 
	lmer(RT ~ 1 + Frequency +
	(1 | Subject) + (1 | Word), 
	data=lexdec)
)

# center variables 
lexdec$cNativeEnglish <- center(ifelse(
	lexdec$NativeLanguage == "English", 1,0))
lexdec$cFrequency <- center(lexdec$Frequency)
lexdec$cFamilySize <- center(lexdec$FamilySize)
lexdec$cSynsetCount <- center(lexdec$SynsetCount)
lexdec$cPlant <- center(ifelse(
	lexdec$Class == "plant", 1,0))

lmer.lexdec4 = lmer(RT ~ 1 + cNativeEnglish * (
	cFrequency + cFamilySize +	cSynsetCount + 
	cPlant) + 
	(1 + Frequency | Subject) + (1 | Word), 
	data=lexdec)
print(lmer.lexdec4, corr=F)

# for printing with plotLMER.fnc, let's use the
# uncentered variables.
lmer.lexdec4b = lmer(RT ~ 1 + NativeLanguage * (
	Frequency + FamilySize + SynsetCount + 
	Class) + 
	(1 | Subject) + (1 | Word), 
	data=lexdec)
p.lmer.lexdec4b = pvals.fnc(lmer.lexdec4b, 
	nsim=10000, withMCMC=T)
p.lmer.lexdec
plotLMER.fnc(lmer.lexdec4b, pred="Frequency", 
	mcmcMat=p.lmer.lexdec4b$mcmc,
	intr=list("NativeLanguage", c("English", "Other"),
		"end", list(c("red", "orange"), 
		c(1,1))), cex=1.3, lwd=2)		

## --------------------------------------------------------
# Ordinary Logistic Regression - An example
## --------------------------------------------------------
library(languageR)
lmer.lexdec.answer4 = lmer(Correct == "correct" ~ 
	1 + cNativeEnglish * (
	cFrequency + cFamilySize +	cSynsetCount + 
	cPlant) + 
	(1 + Frequency | Subject) + (1 | Word), 
	data=lexdec, family="binomial")

lmer.lexdec.answer4b = lmer(Correct == "correct" ~ 
	1 + NativeLanguage * (
	Frequency + FamilySize + SynsetCount + 
	Class) + 
	(1 + Frequency | Subject) + (1 | Word), 
	data=lexdec, family="binomial")
plotLMER.fnc(lmer.lexdec.answer4b, pred="Frequency", 
	intr=list("NativeLanguage", c("English", "Other"),
		"end", list(c("red", "orange"), 
		c(1,1))), cex=1.3, lwd=2, fun=I)	
plotLMER.fnc(lmer.lexdec.answer4b, pred="Frequency", 
	intr=list("NativeLanguage", c("English", "Other"),
		"end", list(c("red", "orange"), 
		c(1,1))), cex=1.3, lwd=2)		





setwd("C:\\Documents and Settings\\florian\\My Documents\\_Papers\\GillespieJaeger_OCP\\data\\data")

d= as.data.frame(read.csv(file="results.txt", sep="\t",
	header=F))
names(d) = c('Sequence', 'CountThat', 'CountNoThat')

d$ProportionThat = d$CountThat / (d$CountThat + d$CountNoThat)
d$HeadNoun = as.factor(sub("^([a-zA-Z]+)\\s.+$", "\\1", d$Sequence))
d$RcDet = as.factor(sub("^[a-zA-Z]+\\sthat\\s([a-zA-Z]+)\\s.+$", "\\1", d$Sequence))
d$RcNoun = as.factor(sub("^.*\\s([a-zA-Z]+)\\s[a-zA-Z]+$", "\\1", d$Sequence))
d$RcVerb = as.factor(sub("^.*\\s([a-zA-Z]+)$", "\\1", d$Sequence))

poisson.ocp1 = glm(CountThat ~ RcDet, data= d, family= "poisson")
summary(poisson.ocp1)

d.that = d
d.that$That = 1
d.nothat = d
d.nothat$That = 0
dd = rbind(d.that, d.nothat)
rm(d.that, d.nothat)

dd$Weight = ifelse(dd$That == 1, dd$CountThat, dd$CountNoThat)
table(dd$That, is.na(dd$Weight))
dd = subset(dd, !is.na(CountThat) & !is.na(CountNoThat))
summary(dd$That == 1)
dd$CountThat = NULL
dd$CountNoThat = NULL
dd$HeadNounCount = table(dd$HeadNoun)[as.character(dd$HeadNoun)]
dd$RcNounCount = table(dd$RcNoun)[as.character(dd$RcNoun)]
dd$RcVerbCount = table(dd$RcVerb)[as.character(dd$RcVerb)]

library(Design)
dd.dist = datadist(dd)
options(datadist = 'dd.dist')
logistic.ocp1 = lrm(That ~ RcDet, data= dd, weights= Weight, 
	subset= RcDet %in% c('this','that'))

logistic.ocp1 = lrm(That ~ RcDet, data= dd, weight= Weight)
plot(logistic.ocp1, xlab= "Determiner at RC onset", 
	ylab="log-odds of relativizer")
plot(logistic.ocp1, xlab= "Determiner at RC onset", 
	ylab="probability of relativizer", fun= plogis)

library(lme4)
dd.sub = subset(dd, HeadNounCount > 10 &
	RcNounCount > 10)
nrow(dd.sub)
lmer.ocp1 = lmer(That ~ RcDet + (1 + RcDet | RcNoun) + (1 | HeadNoun), data= dd.sub, 
	subset= RcDet %in% c('this','that'),
	weights= Weight, family="binomial")