# Gelman, exercise 4.4 (p. 76)

library(foreign)
library(Design)
pollution.dd=datadist(pollution)
options(datadist='pollution.dd')

pollution<-read.dta("~/Desktop/pollution.dta.txt")
names(pollution)
str(pollution)


# 4.4a

plot(pollution$nox,pollution$mort,xlab="Relative nitric oxide pollution potential",ylab="Total age-adjusted mortality rate per 100,000")

# linear regression would probably not fit these data well; there are some outliers with very high NOX rates

p1 <- ols(mort ~ nox, data=pollution)
p1
abline(p1)

# terrible fit! the outliers screw up the regression
# interestingly, R^2 is negative!

plot(density(p1$residuals))
abline(v=0)

# the distribution of the residuals appears somewhat asymmetric, and there's a nasty lil' bump in the right tail--the outliers


# 4.4b

plot(density(pollution$nox))
plot(density(log(pollution$nox)))

p2 <- ols(mort ~ log(nox), data=pollution)
p2

plot(log(pollution$nox),pollution$mort,xlab="Log relative nitric oxide pollution potential",ylab="Total age-adjusted mortality rate per 100,000")
abline(p2)

# this looks better than the previous model

plot(density(p2$residuals))
abline(v=0)

# the residuals look more normally distributed than for model p1, but now there's a 'lil bump in the left tail.  in fact, I'm not really sure how much the log transformation improved things for the residuals.


# 4.4c

p2

# there is a significant effect of relative nitric oxide pollution potential on mortality rate.  for each one unit increase in log nitric oxide pollution potential, there is an expected increase in mortality rate of 15.3.


# 4.4d

plot(density(pollution$hc))
plot(density(pollution$so2))

# both need to be log transformed

p3 <- ols(mort ~ log(nox) + log(so2) + log (hc), data=pollution)
p3
plot(p3)

# this is not showing the whole model, hm ...

# interpretation of coefficients:
# for each one unit increase in log(nox), mortality rate is expected to increase by 58.3
# for each one unit increase in log(so2), mortality rate is expected to increase by 11.8
# for each one unit increase in log(hc), mortallity rate is expected to decrease by 57.3


# 4.4.e

p.firsthalf=pollution[1:30,]
p.secondhalf=pollution[31:60,]

p4 <- ols(mort ~ log(nox) + log(so2) + log (hc), data=p.firsthalf)
p4

p.secondhalf.predict<-predict(p4,p.secondhalf)
plot(density(p.secondhalf.predict-p.secondhalf$mort))

# there is a nasty outlier hump in the left field.  this may be because there are outliers in the second half of the data


# Baayen, exercise 1 (p. 260)

# first, recreate their "english2" dataset

items=english[english$AgeSubject=="young",]
items.pca=prcomp(items[,c(18:27)],center=T,scale=T)
summary(items.pca)
sum((items.pca$sdev^2/sum(items.pca$sdev^2))[1:4])
x=as.data.frame(items.pca$rotation[,1:4])
x[order(x$PC4), ]

items$PC1=items.pca$x[,1]
items$PC2=items.pca$x[,2]
items$PC3=items.pca$x[,3]
items$PC4=items.pca$x[,4]
items2=english[english$AgeSubject!="young",]
items2$PC1=items.pca$x[,1]
items2$PC2=items.pca$x[,2]
items2$PC3=items.pca$x[,3]
items2$PC4=items.pca$x[,4]
english2=rbind(items,items2)

str(english2)

# add data distribution object
library(Design)
english2.dd=datadist(english2)
options(datadist='english2.dd')

# THIS DOESN'T WORK.  NOT SURE WHY!
# english2.ols <- ols(RTlexdec~AgeSubject+rcs(WrittenFrequency,3)+PC1+AgeSubject*WrittenFrequency,data=english2)


# Baayen, exercise 8 (p. 262)

imaging
imaging.lm <- ols(FilteredSignal ~ BehavioralScore + Condition + BehavioralScore:Condition,data=imaging)
imaging.lm

# at the surface, it looks like there is a significant interaction of BehavioralScore by Condition, with a coefficient of 0.78

# however ...

plot(density(imaging.lm$residuals))

# but this is no good!  the residuals are not normally distributed

min(predict(imaging.lm,imaging)-imaging$FilteredSignal)
max(predict(imaging.lm,imaging)-imaging$FilteredSignal)

# this is the outlier score

imaging[19,]

# let's try the model again with the outliers removed

imaging.2 <- subset(imaging,imaging$FilteredSignal>50)
imaging.lm2 <- ols(FilteredSignal ~ BehavioralScore + Condition + BehavioralScore:Condition,data=imaging.2)
imaging.lm2

# ouch.  no interaction after all!