# load packages and libraries you'll need.  I like to keep them all up here.  Here are some that I use a lot.
library(languageR) # this has some cool databases in it to play with and some nice functions related to multilevel modeling that we'll use at some point
library(plotrix) # this has a function in it called "std.error" that, you guessed it, computes standard errors for vectors of data.  
library(lattice) # this has some cool graphics stuff in it, like xyplot()
library(stats) # attached by default in R, but listed here for completeness
library(lme4) # provides lmer(), which we'll use below for the length-correction regression (older versions of languageR attach it automatically, but it doesn't hurt to be explicit)


# set the working directory.  this is where all the files related to your analysis should be saved.  Similarly, anything you create and save will be saved here by default.  *This place will be different on every person's computer*.
setwd("/Users/afine/Dropbox/RegressionTutorial/")

#### Example:  self-paced reading data

# general info on this tutorial can be found here:  http://wiki.bcs.rochester.edu:2525/HlpLab/StatsCourses/Feb2011Regression#preview



read.table("fixedpre12-19-10.rtm")->d
read.table("verbs.txt", header=T)->verbs
read.table("subjects.txt", header=T)->subjects

# first, we can inspect the dataset to make sure nothing weird is going on with the "summary()" command

summary(d)
summary(verbs)
summary(subjects)

# the summary command gives basic descriptive statistics for numeric variables, counts for each level of factor variables, and the number of observations for character variables.  factor variables are variables that can take on a discrete set of values.  so if you're going to do a 2x2 anova at some point, for instance, you should have two variables in the dataframe (i.e. 2 columns) corresponding to factor variables with 2 levels each.


# you can also use the head() function to see first few rows for each variable
head(d)

# name the columns
colnames(d) <- c("Expt", "Cond", "Item", "Subj", "ListPos", "WordPos", "Word", "Region", "RawRT", "RawZ", "ResidRT", "zResidRT", "Correct")
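# now that the columns have names, str() is a quick way to double-check how R is treating each one (text columns should come back as factors, numeric columns as int/num):
str(d)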

# subset the data to get rid of practice trials
d <- subset(d, Expt != "practice")

# We can also get rid of trials for which subjects incorrectly answered the comprehension question (for a given trial, every word in that trial will have the same value for "Correct")
d <- subset(d, Correct == 100)

# look at weirdly high or low RTs 
WeirdRTs <- subset(d, RawRT < 100 | RawRT > 2000) # save weird rts as a dataframe
nrow(WeirdRTs)/nrow(d) # what proportion of the dataset has weird rts?
summary(WeirdRTs) # are any subjects/items particularly over-represented?  could be basis for subject/item removal
	xtabs(~ Subj, data=WeirdRTs)
	xtabs(~ Item, data=WeirdRTs)
	xtabs(~ WordPos, data=WeirdRTs)
	xtabs(~ Subj + Item, data=WeirdRTs)
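	# sorting those counts makes it easier to spot subjects or items that contribute a disproportionate share of the weird RTs:
	sort(xtabs(~ Subj, data=WeirdRTs), decreasing=TRUE)
	sort(xtabs(~ Item, data=WeirdRTs), decreasing=TRUE)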

# get rid of the weird data
d <- subset(d, RawRT >= 100 & RawRT <= 2000) # i.e. keep everything that was NOT flagged as weird above


### This data set is extremely impoverished, given the goals of my analysis (I need a lot more variables).

# I want to add a variable that gives the length of each word in characters.  here, the argument "type" tells the function HOW it should measure the number of characters (the default is number of characters, but it can also do, e.g., number of bytes needed to encode the character string)
d$Length <- nchar(as.character(d$Word), type = "chars")

# Now we can use this to compute length-corrected reading times, a standard measure used in reading studies.  "Length-corrected reading times" are often explained in a way that disguises the fact that they are fundamentally based on linear regression.  Specifically, the reading times are "length-corrected" in that they are the variance in reading times that is "left over" after accounting for all the variance in reading times that is due to word length in characters.  so you have a regression:

lmer(RawRT ~ Length + (1+Length|Subj), data=d) -> length.regression

# now you can take the "leftover variance" or the "residuals" of this model, using the "resid()" function.  Residuals will come back later in the tutorial (probably) and it's generally good to know what they are, as they're often a very useful construct

resid(length.regression)->resids
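# quick sanity check: there should be one residual per row of d.  (if RawRT or Length contained NAs, lmer() would silently drop those rows, the lengths wouldn't match, and the assignment below would fail.)
length(resids) == nrow(d)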

# you can then add this to the dataframe

d$resids <- resids

# I prefer to use the following more concise notation (notice that functions can be embedded inside of functions.  The only thing that needs to be true is that "lmer()" outputs an object of the kind that "resid()" requires):

d$resids <- resid(lmer(RawRT ~ Length + (1 + Length | Subj), data = d))

# now that we've done this, let's get rid of fillers

d <- subset(d, Expt == "adapt")

# after subsetting, factor columns still carry the levels of the rows we threw away (e.g. "practice" is still a level of Expt).  this little function drops those unused levels, either for a single factor or for every factor column of a dataframe/matrix:
myFactorCleanup <- function(x) {
	if (is.factor(x)) {
		# single factor: round-trip through character to drop unused levels
		return(as.factor(as.character(x)))
	}
	if (is.data.frame(x) || is.matrix(x)) {
		# dataframe or matrix: apply the same trick to every factor column
		for (i in 1:ncol(x)) {
			if(is.factor(x[,i])) { x[,i] <- as.factor(as.character(x[,i])) }
		}
		return(as.data.frame(x))
	}
	x # anything else is returned untouched
}

myFactorCleanup(d) -> d
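# (side note: recent versions of R -- 2.12 and up -- have a built-in droplevels() that does the same job for a whole dataframe in one call, i.e. d <- droplevels(d))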
# Now I'd like to add some verb- and subject-specific variables from the dataframes "verbs" and "subjects"

# do this because in the variable additions that follow, the row names of the dataframes "verbs" and "subjects" need to match the values of d$Item and d$Subj, so that we can look rows up by name
row.names(verbs)<-verbs$Item
row.names(subjects)<-subjects$Subject

d$SCBiasBin <- verbs[as.character(d$Item),]$Bias # whether verb is SC-, DO-, or EQ-biased based on norming study
d$numbias <- verbs[as.character(d$Item),]$SCbias # p(SC|v) based on norming study
d$VerbFreq <- verbs[as.character(d$Item),]$Frequency # frequency of the verb
d$ThatPref <- verbs[as.character(d$Item),]$ThatPref # p("that"|SC, v) based on norming study

d$List <- subjects[as.character(d$Subj),]$List
d$Gender <- subjects[as.character(d$Subj),]$Gender
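# (side note: merge() is another way to pull in variables from a second dataframe, e.g. merge(d, verbs, by = "Item"), but it brings in every column and does not preserve row order, so the row-name indexing above gives you more control)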

# run the summary command and make sure everything makes sense
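summary(d)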

# so what we want to ultimately do is run a regression to analyze the effect of complementizer presence/absence and SCBiasBin on residual reading times at the point of disambiguation.  first, let's make sure we have a more or less balanced design:

xtabs(~ Cond + SCBiasBin, data=d)

# now let's inspect the dependent variable, "resids" or "RawRT", to see if it (more or less) follows a normal distribution

hist(d$resids)
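# a Q-Q plot is another standard way to eyeball normality (points should fall roughly on the line):
qqnorm(d$resids)
qqline(d$resids)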

# as with any reaction time data, not even close.  we can improve the situation by removing or replacing outliers.

# outlier removal:  the way you remove outliers will vary from field to field, and from analysis type to analysis type.  for reading data, a standard way to do outlier removal is to compute the mean length-corrected reading time for each subject at each sentence region, and then replace or remove data points by subject and region that fall 2.5 standard deviations above or below the mean.

	# first, I'm going to create a composite variable that identifies unique subject-region combinations:
	d$SubjReg <- paste(d$Subj, d$Region)
	d$SubjReg <- as.factor(d$SubjReg)
	
	# now I'm going to create a table with the mean length-corrected reading time for each level of "SubjReg"
	
	unlist(							# "unlist()" turns the list that lapply() returns back into a plain named vector
		lapply(						# "lapply()" takes the output of "split" and applies the function "mean()" to it	
			split(d$resids, d$SubjReg), # "split()" gives you ALL the residual reading times at each region for each subject
			mean, na.rm=T))  -> mean.resids # save it as a vector
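	# (side note: tapply() collapses the split/lapply/unlist steps into a single call -- tapply(d$resids, d$SubjReg, mean, na.rm=TRUE) should give you the same named vector)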
			
# do the same thing to find the standard deviation by subject and by region (exact same syntax, just compressed)

sd.resids <- unlist(lapply(split(d$resids, d$SubjReg), sd, na.rm=T))
# combine the mean and sd vectors into a single dataframe
subject.resids <- as.data.frame(cbind(mean.resids, sd.resids), row.names = levels(d$SubjReg))

# now let's just get the outliers so we can inspect them:
outliers <- subset(d, abs((d$resids - subject.resids[d$SubjReg,]$mean.resids)/subject.resids[d$SubjReg,]$sd.resids) > 2.5) # note we index by SubjReg, since the rows of subject.resids correspond to subject-region combinations

# this is a lot to parse, but all you're doing is (x - mean)/sd, which is a z-score, and figuring out if it has an absolute value greater than 2.5.  if it does, it goes in the dataframe "outliers"
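# it's also worth checking what fraction of the data the 2.5 SD cutoff flags:
nrow(outliers)/nrow(d)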

# you can also compute, for every length-corrected RT in the dataframe, its z-score relative to its subject-region mean and SD:
d$z.score <- (d$resids - subject.resids[d$SubjReg,]$mean.resids)/subject.resids[d$SubjReg,]$sd.resids

# now you can remove outliers:
d <- subset(d, abs(z.score) < 2.5)


# OR!  here's some code that *replaces* outliers with the +/- 2.5 z-score cutoff value, in case you're doing something like anova where you need (roughly) equal number of observations in each cell of the design.  the way I did this was by setting z = 2.5, and solving the equation z = (x-mean)/sd for x, and replacing the offending value with x.  you can ask me about this later or play with it on your own.
d$new.resids <- ifelse(d$z.score > 2.5,   # if the z score exceeds 2.5, substitute 
					2.5 * subject.resids[d$SubjReg,]$sd.resids + subject.resids[d$SubjReg,]$mean.resids, 
					ifelse(d$z.score < -2.5, -2.5 * subject.resids[d$SubjReg,]$sd.resids + subject.resids[d$SubjReg,]$mean.resids, d$resids))
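# (equivalently, you can think of this as clamping the z-score at +/- 2.5 and converting back to the RT scale; pmin()/pmax() express the same computation a bit more compactly:
# d$new.resids <- pmax(pmin(d$z.score, 2.5), -2.5) * subject.resids[d$SubjReg,]$sd.resids + subject.resids[d$SubjReg,]$mean.resids )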

# re-inspect:

hist(d$resids) # (or hist(d$new.resids), if you went the replacement route instead of removal)

# much better.

##### simple regression analysis:

# let's see if SCBias and Cond are good predictors of reading times at the point of disambiguation (i.e. where Region = 4)

# because SC-bias is a probability and therefore bounded below by 0, it's good to log it, so I'm adding a new variable:
d$lnumbias <- log(d$numbias, base=2)

# I also want to mean-center this variable (we'll get to this later).  this means that I want to set the mean of the variable equal to 0.
d$clnumbias <- d$lnumbias - mean(d$lnumbias, na.rm=T)
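# quick check that the centering worked -- this should be (essentially) zero:
mean(d$clnumbias, na.rm=T)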

r.model <- lm(resids ~ # dependent variable
				lnumbias * # log sc-bias (clnumbias, the centered version created above, could be swapped in here)
				Cond, # presence or absence of complementizer
				data=subset(d, Region == "4")
)

summary(r.model)
# bias is more strongly correlated with reading times when the complementizer is absent
str(r.model)		# tells you how you can grab stuff out of the lm() object.  also provides useful sanity checks (e.g. that lnumbias is stored as a numeric predictor)
r.model$coefficients[2:4] -> coefs # 2-4 b/c i don't want the intercept

# can this be treated as a vector?

is.vector(r.model$coefficients)

# plot 'em

coef.plot <- barplot(coefs, 
	beside=TRUE, 
	ylab="coefficient", 
	axes=TRUE, 
	names.arg = c("log bias", "that", "interaction"),
	ylim = c(-5, 5))			

# standard errors:
coef.se <- c(.94, 3.36, 1.34)
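# rather than typing the standard errors in by hand, you can pull them straight out of the model object (the hard-coded values above are just a snapshot from one run).  here's a minimal sketch that extracts them and adds +/- 1 SE error bars to the barplot using base R's arrows(); it relies on the fact that barplot() returns the x-positions of the bars, which we saved in "coef.plot" above (plotrix's plotCI() is another option for this kind of plot):
coef.se <- summary(r.model)$coefficients[2:4, "Std. Error"] # same rows as the coefficients we plotted
arrows(coef.plot, coefs - coef.se, coef.plot, coefs + coef.se, angle = 90, code = 3, length = 0.05) # error bars capped at both ends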
