## -----------------
# functions
## -----------------
# Center a numeric vector at its mean (mean computed ignoring NAs, so a
# fully-observed vector centers to mean 0; NAs in x stay NA in the output).
# Fix: use TRUE rather than T (T/F are ordinary variables and can be reassigned).
myCenter = function(x) x - mean(x, na.rm=TRUE)
# Empirical logit: log-odds of a proportion p observed out of totalN
# observations, with 0.5 added to the success and failure counts so the
# result is finite even when p is exactly 0 or 1.
emplog = function(p, totalN) {
	hits = p * totalN
	misses = totalN - hits
	log((hits + 0.5) / (misses + 0.5))
}
# Regression weights to accompany emplog(): the reciprocal of the
# approximate variance of the empirical logit,
# 1 / (1/(hits + 0.5) + 1/(misses + 0.5)).
emplogweight = function(p, totalN) {
	hits = p * totalN
	misses = totalN - hits
	1 / (1 / (hits + 0.5) + 1 / (misses + 0.5))
}


## -----------------
# load data
## -----------------
# NOTE(review): machine-specific absolute path -- anyone else running this
# script must edit it (a relative path or a project-root helper would be
# more portable; setwd() in scripts is generally discouraged).
setwd("C:\\Users\\tiflo\\Documents\\My LabMeetings, Tutorials, & Teaching\\LSA13\\latex\\scripts")
# One row per observation. Columns used below: Subj, Item,
# CondWordFrequency, CondCompetitors, LooksToTarget, Bin -- confirm via str().
d = read.csv(file = "eye-tracking-sample.csv")
str(d)

## -----------------
# analysis 1: ANOVA (collapsing over time)
## -----------------
# Collapse over time: mean proportion of looks to target per subject in
# each of the 2x2 condition cells (input to the by-subjects "F1" ANOVA).
d.agg = aggregate(d$LooksToTarget,
	by = list(
		Subj = d$Subj,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = mean
)
# aggregate() names the aggregated column "x"; give it a meaningful name
names(d.agg)[ncol(d.agg)] = "LooksToTarget"
str(d.agg)

# visualize: mean looks to target by competitor condition, one line per
# word-frequency level (colors), averaged over the by-subject cell means.
# Fix: FALSE spelled out instead of F (F is a reassignable variable).
with(d.agg, interaction.plot(CondCompetitors, CondWordFrequency, LooksToTarget,  
	type="b", 
	col=c(1:2), 
	legend = FALSE,   # suppress the built-in legend; a custom one is drawn below
#  	leg.bty="o", leg.bg="beige", 
	lwd=2, 
	pch=c(18,24),	
	ylim= c(0,1),
   	xlab="Number of competitors", 
   	ylab="Proportion looks to target", 
   	main="Interaction Plot")
)
# custom legend, placed inside the plot region at (x=1.5, y=1)
legend(1.5, 1, title= "Word Frequency", 
	legend = c("high", "low"),
	col=c(1:2), 
  	bty="o", 
	bg="beige", 
	lwd=2, 
	pch=c(18,24)
)

# see: http://www.statmethods.net/stats/anova.html
# F1 ("by subjects") ANOVA: repeated-measures ANOVA on the by-subject cell
# means, with both conditions and their interaction treated as
# within-subject factors via the Error() stratum.
m.F1 = aov(LooksToTarget ~ 1 + CondWordFrequency * CondCompetitors + 
	Error(Subj/(CondWordFrequency * CondCompetitors)), 
	data = d.agg)
summary(m.F1)

# Re-aggregate: by-item cell means (this overwrites the by-subject d.agg).
d.agg = aggregate(d[,c('LooksToTarget')], by= list(
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = mean
)
# aggregate() names the aggregated column "x"; rename it
names(d.agg)[length(names(d.agg))] = "LooksToTarget"
str(d.agg)

# F2 ("by items") ANOVA: same model, with Item as the error stratum.
m.F2 = aov(LooksToTarget ~ 1 + CondWordFrequency * CondCompetitors + 
	Error(Item/(CondWordFrequency * CondCompetitors)), 
	data = d.agg)
summary(m.F2)

## -----------------
# analysis 2: ANOVA (collapsing over time) - arcsine
## -----------------
# Same F1/F2 ANOVAs as analysis 1, but on arcsine-square-root transformed
# cell means -- the classic variance-stabilizing transform for proportions.
d.agg = aggregate(d[,c('LooksToTarget')], by= list(
		Subj = d$Subj,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = function(x) { asin(sqrt(mean(x))) }
)
names(d.agg)[length(names(d.agg))] = "LooksToTarget"
str(d.agg)

# F1 (by subjects) on the transformed cell means
m.F1 = aov(LooksToTarget ~ 1 + CondWordFrequency * CondCompetitors + 
	Error(Subj/(CondWordFrequency * CondCompetitors)), 
	data = d.agg)
summary(m.F1)

# by-item cell means, arcsine-transformed (overwrites d.agg again)
d.agg = aggregate(d[,c('LooksToTarget')], by= list(
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = function(x) { asin(sqrt(mean(x))) }
)
names(d.agg)[length(names(d.agg))] = "LooksToTarget"
str(d.agg) 

# F2 (by items) on the transformed cell means
m.F2 = aov(LooksToTarget ~ 1 + CondWordFrequency * CondCompetitors + 
	Error(Item/(CondWordFrequency * CondCompetitors)), 
	data = d.agg)
summary(m.F2)


## -----------------
# analysis 3: mixed linear model
# 
# note: aggregation over subject and items combined
## -----------------
# One row per Subj x Item x condition cell: mean proportion of looks.
d.agg = aggregate(d[,c('LooksToTarget')], by= list(
		Subj = d$Subj,
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = mean
)
names(d.agg)[length(names(d.agg))] = "LooksToTarget"

# Center the 0/1-coded conditions, so main effects are estimated at the
# mean of the other predictor (ANOVA-style interpretation of effects).
d.agg$cCondWordFrequencyHigh = myCenter(ifelse(d.agg$CondWordFrequency == "high", 1, 0))
d.agg$cCondCompetitorsTwo = myCenter(ifelse(d.agg$CondCompetitors == "two", 1, 0))

# model with full random effect structure ("maximal": random intercepts
# plus random slopes for both conditions and their interaction, by
# subjects and by items)
library(lme4)
m.full = lmer(LooksToTarget ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Subj) + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Item), 
	d.agg
)
m.full


## -----------------
# analysis 5: empirical logit
# 
# note: aggregation over subject and items combined
## -----------------
# Mean proportion of target looks per Subj x Item x condition cell.
d.agg = aggregate(d[,c('LooksToTarget')], by= list(
		Subj = d$Subj,
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = mean
)
names(d.agg)[length(names(d.agg))] = "LooksToTarget"

# Number of observations behind each cell mean (FUN = length). Because this
# aggregate() uses exactly the same by= grouping as the one above, its rows
# line up one-to-one with d.agg, so TotalLooks$x can be used as a parallel
# vector inside the emplog()/emplogweight() calls in the models below.
TotalLooks = aggregate(d[,c('LooksToTarget')], by= list(
		Subj = d$Subj,
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors
	),
	FUN = length
)

# center the 0/1 condition codes (mean-centered contrasts)
d.agg$cCondWordFrequencyHigh = 
    myCenter(ifelse(d.agg$CondWordFrequency == "high", 1, 0))
d.agg$cCondCompetitorsTwo = 
    myCenter(ifelse(d.agg$CondCompetitors == "two", 1, 0))

# model with full random effect structure ("maximal" by-subject and by-item
# slopes), fit to the empirical logits with inverse-variance weights from
# emplogweight(). Fix: `weights` is spelled out in full -- the original
# `weight` relied on R's partial argument matching.
m.full = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Subj) + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)
m.full
# deviance 1191ish

# model with only random intercepts
m.simple = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 | Subj) + 
	(1 | Item), 
	d.agg,
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)
m.simple
anova(m.full,m.simple)
# deviance 1209, so there is room for improvement beyond the intercepts

# model with full random effect structure for item; the by-subject random
# slopes are reduced stepwise: both main-effect slopes, then each alone.
# Fix: `weights` spelled out in full (original used partially-matched `weight`).
m.full.item.1 = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondWordFrequencyHigh + cCondCompetitorsTwo | Subj) + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)

# only a word-frequency slope for subjects
m.full.item.2 = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 +  cCondWordFrequencyHigh | Subj) + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)

# only a competitor slope for subjects
m.full.item.3 = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondCompetitorsTwo | Subj) + 
	(1 + cCondWordFrequencyHigh * cCondCompetitorsTwo | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)

# comparison of the above models in terms of their deviance suggests
# that the only justified random slope for subjects is one for 
# competitors. The resulting model is just as good as the full model.

# Now reduce the by-item structure (keeping the by-subject competitor
# slope): first drop the by-item interaction slope ...
# Fix: `weights` spelled out in full (original used partially-matched `weight`).
m.1.3 = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondCompetitorsTwo | Subj) + 
	(1 + cCondWordFrequencyHigh + cCondCompetitorsTwo | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)
anova(m.full.item.3, m.1.3)


# ... then keep only the competitor slope for items ...
m.2.3 = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondCompetitorsTwo | Subj) + 
	(1 + cCondCompetitorsTwo  | Item), 
	d.agg,
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)
anova(m.full.item.3, m.2.3, m.1.3)

# ... or only the word-frequency slope for items
m.3.3 = lmer(emplog(LooksToTarget, TotalLooks$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo + 
	(1 + cCondCompetitorsTwo | Subj) + 
	(1 + cCondWordFrequencyHigh | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooks$x)
)
anova(m.full.item.3, m.3.3, m.1.3)

# m.1.3 seems the best model


## -----------------
# analysis 4: logit
# 
# note: no aggregation. This analysis would yield inflated confidence. 
# It is hence anti-conservative
## -----------------


# BUG FIX(review): the glmer below is fit on the raw trial-level data `d`,
# but the original centered the predictors on the aggregated `d.agg`, so
# `cCondWordFrequencyHigh` / `cCondCompetitorsTwo` did not exist in `d`
# (nor in the global environment) and model fitting would fail with
# "object ... not found". Center on `d` instead, exactly as the later
# time-course section (analysis of d with pol(cTime, 2)) does.
d$cCondWordFrequencyHigh = 
    myCenter(ifelse(d$CondWordFrequency == "high", 1, 0))
d$cCondCompetitorsTwo = 
    myCenter(ifelse(d$CondCompetitors == "two", 1, 0))

# full model (do not run, because it takes a long time to fit
# since the data were artificially created in a way that means 
# that the full model is unjustified)
# m.full = lmer(LooksToTarget ~ CondWordFrequency * CondCompetitors +
#	(1 + CondWordFrequency * CondCompetitors | Subj) +
#	(1 + CondWordFrequency * CondCompetitors | Item),
#	data = d,
#	family = "binomial"
# )
# m.full
#
# Deviance: 39968
# save(m.full, file = "eye-tracking-full-model.Rdata", compress=T)


# reducing random effects: intercepts only, logistic mixed model on the
# trial-level binary outcome
m.simple = glmer(LooksToTarget ~ cCondWordFrequencyHigh * cCondCompetitorsTwo +
	(1 | Subj) +
	(1 | Item),
	data = d,
	family = "binomial"
)
m.simple




## -----------------
# Now let's take time into consideration:
#
# There are two common ways to analyze this type of 
# binomial time series data:
#
# 1) weighted linear regression over empirical logits
# 2) binomial logit regression
#
# Binned data (for Barr type emp. logit analysis)
## -----------------

# Mean looks to target per Subj x Item x condition x time-bin cell.
# Fix: the stray `nfrequency = T` argument in the original was passed on to
# mean() via aggregate()'s `...` and silently ignored; removed.
d.agg = aggregate(d[,c('LooksToTarget')], by= list(
		Subj = d$Subj,
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors,
		Bin = d$Bin
	),
	FUN = mean
)
names(d.agg)[length(names(d.agg))] = "LooksToTarget"
# centered time-bin predictor for the growth-curve models below
d.agg$cBin = myCenter(d.agg$Bin)

# plot
library(ggplot2)
# rms provides pol(), used in the geom_smooth formula below
library(rms)
# Time course of looks to target: a default smoother plus a quadratic fit,
# colored by word frequency, line type coding number of competitors.
ggplot(d.agg, aes(x=Bin, 
		y= LooksToTarget, 
		col= as.factor(CondWordFrequency), 
		linetype=as.factor(CondCompetitors)
	    )
	) +
	# thin line: geom_smooth's default smoother
	geom_smooth(size=1) +
	# thick line: quadratic polynomial fit
	# NOTE(review): `family="binomial"` does nothing with method=lm -- it
	# looks like method="glm" with a binomial family was intended; confirm.
	geom_smooth(method=lm, 
		formula= y ~ pol(x, 2),
		family="binomial",
		size = 2) +
	xlab("Time bin") +
	ylab("Proportion of looks to the target") +
	scale_color_discrete("Word Frequency") +
	scale_linetype_discrete("Competitors") 

# Same plot, additionally showing the (heavily overplotted) raw cell means
# as semi-transparent points, with point shape also coding competitors.
ggplot(d.agg, aes(x=Bin, 
		y= LooksToTarget, 
		col= as.factor(CondWordFrequency), 
		shape= as.factor(CondCompetitors),
		linetype=as.factor(CondCompetitors)
	    )
	) +
	geom_point(alpha=.05) +
	geom_smooth(size=1) +
	geom_smooth(method=lm, 
		formula= y ~ pol(x, 2),
		family="binomial",
		size = 2) +
	xlab("Time bin") +
	ylab("Proportion of looks to the target") +
	scale_color_discrete("Word Frequency") +
	scale_linetype_discrete("Competitors") +
	scale_shape_discrete("Competitors")


# center the 0/1 condition codes on the binned data
d.agg$cCondWordFrequencyHigh = 
    myCenter(ifelse(d.agg$CondWordFrequency == "high", 1, 0))
d.agg$cCondCompetitorsTwo = 
    myCenter(ifelse(d.agg$CondCompetitors == "two", 1, 0))

# BUG FIX(review): the earlier TotalLooks was aggregated WITHOUT Bin, so it
# has one row per Subj x Item x condition cell and does not line up with the
# binned d.agg (wrong length; values would be recycled). Recompute the
# observation counts per cell x bin, using the identical by= grouping as the
# binned aggregate so the rows align one-to-one with d.agg.
TotalLooksBin = aggregate(d[,c('LooksToTarget')], by= list(
		Subj = d$Subj,
		Item = d$Item,
		CondWordFrequency = d$CondWordFrequency,
		CondCompetitors = d$CondCompetitors,
		Bin = d$Bin
	),
	FUN = length
)

# weighted linear regression over empirical logits, linear effect of bin
# (also: `weights` spelled out; original used partially-matched `weight`)
lmer(emplog(LooksToTarget, TotalLooksBin$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo * 
	cBin + 
	(1 | Subj) + (1 | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooksBin$x)
)

# with quadratic effect of bin (pol() from the rms package)
lmer(emplog(LooksToTarget, TotalLooksBin$x) ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo * 
	pol(cBin, 2) + 
	(1 | Subj) + (1 | Item), 
	d.agg, 
	weights = emplogweight(LooksToTarget, TotalLooksBin$x)
)






#  2) logit
# center the predictors on the raw (unaggregated) trial-level data
d$cCondWordFrequencyHigh = 
    myCenter(ifelse(d$CondWordFrequency == "high", 1, 0))
d$cCondCompetitorsTwo = 
    myCenter(ifelse(d$CondCompetitors == "two", 1, 0))


# with quadratic effect of time
# NOTE(review): `cTime` is never created anywhere in this script --
# presumably a line like d$cTime = myCenter(d$Time) (or myCenter(d$Bin))
# is missing; as written this call should fail with
# "object 'cTime' not found". Confirm against the data file's columns.
glmer(LooksToTarget ~ 
	cCondWordFrequencyHigh * cCondCompetitorsTwo * 
	pol(cTime, 2) + 
	(1 | Subj) + (1 | Item), 
	d, family = "binomial"
)


