## 	Benjamin Van Durme, vandurme@cs.rochester.edu, 30 May 2008
## 	Time-stamp: <2008-05-30 15:45:33 vandurme>


## Baayen Section 4.7, Problem 3

names(durationsGe)
str(durationsGe)
summary(durationsGe)
attach(durationsGe)

## Required for histogram
library(lattice)
## Required for ols
library(Design)

## Look at duration
histogram( DurationOfPrefix )

## The following suggests women overall have shorter duration than men, on
## average.
histogram(~ DurationOfPrefix | Sex)

## Tests for normality reject the null hypothesis that duration is normally
## distributed amongst the sub-populations partitioned by sex.
shapiro.test(durationsGe[durationsGe$Sex == "female",]$DurationOfPrefix)
shapiro.test(durationsGe[durationsGe$Sex == "male",]$DurationOfPrefix)

## Look at each of the other (potential) predictors
histogram( YearOfBirth )
histogram( SpeechRate )
## Zipf
histogram( Frequency )

## Since the study took place in 2005 (?), then we transform YearOfBirth into
## an approximate Age by subtracting from the year of the study
durationsGe$Age = 2005 - durationsGe$YearOfBirth
attach(durationsGe)
histogram(Age)

## This shows a spike in the last chart. Either an outlier, or a severe
## lengthening of duration at a particular point of old age, or sparse data.
histogram( ~ DurationOfPrefix | round(Age / 10))

histogram( ~ DurationOfPrefix | Age > 70)
histogram( ~ DurationOfPrefix | Age > 80)


## This shows that frequency has a few extreme points, which we saw earlier in
## the histogram.
summary(Frequency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.00   18.00  125.40   59.25 8104.00 

## What are these high frequency values?
rev(sort(Frequency))[1:20]
## [1] 8104 4970 4121 2774 1839 1605 1301 1155 1113 1065 946 904 742 736 701
## [16] 623 619 594 531 526

## The second is more linear than the first
plot(sort(Frequency))
plot(log(sort(Frequency)) + 1)

## This gives a more symmetric looking distribution
densityplot(rev(sort(DurationOfPrefix))[-(1:10)])

## This again shows the highest values under duration being potential outliers
boxplot(DurationOfPrefix)

big = durationsGe[durationsGe$DurationOfPrefix >  0.246479,]
cbind(big$Frequency, big$DurationOfPrefix)
##       [,1]     [,2]
##  [1,]   88 0.278665
##  [2,]   20 0.248249
##  [3,]   36 0.254832
##  [4,]    1 0.311227
##  [5,]   33 0.263485
##  [6,]   17 0.311799
##  [7,]   13 0.250762
##  [8,]    5 0.276309
##  [9,]   71 0.289032

## Which all leaves me still uncertain of when we are justified in deleting data
## points.

## A basic model
fit.1 = lm( DurationOfPrefix ~  Frequency  + Sex  + SpeechRate + NumberSegmentsOnset + Age)

summary(fit.1)

## Call:
## lm(formula = DurationOfPrefix ~ Frequency + Sex + SpeechRate + 
##     NumberSegmentsOnset + Age)

## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0954003 -0.0318011 -0.0008356  0.0243314  0.1822600 

## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.895e-01  1.364e-02  13.891  < 2e-16 ***
## Frequency           -1.108e-05  4.027e-06  -2.751  0.00619 ** 
## Sexmale              4.182e-03  4.458e-03   0.938  0.34878    
## SpeechRate          -9.646e-03  1.626e-03  -5.934 6.22e-09 ***
## NumberSegmentsOnset -7.143e-03  3.804e-03  -1.878  0.06110 .  
## Age                 -3.327e-05  1.378e-04  -0.241  0.80940    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

## Residual standard error: 0.0453 on 419 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared: 0.1004,	Adjusted R-squared: 0.08965 
## F-statistic: 9.351 on 5 and 419 DF,  p-value: 1.848e-08 


## Log transform Frequency
fit.1.1 = lm( DurationOfPrefix ~  log(Frequency)  + Sex  + SpeechRate + NumberSegmentsOnset + Age)
summary(fit.1.1)

## Call:
## lm(formula = DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate + 
##     NumberSegmentsOnset + Age)

## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.101688 -0.031895 -0.001376  0.026118  0.182423 

## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.993e-01  1.407e-02  14.172  < 2e-16 ***
## log(Frequency)      -4.241e-03  1.281e-03  -3.312  0.00101 ** 
## Sexmale              3.867e-03  4.435e-03   0.872  0.38378    
## SpeechRate          -9.407e-03  1.621e-03  -5.802 1.29e-08 ***
## NumberSegmentsOnset -7.678e-03  3.798e-03  -2.022  0.04384 *  
## Age                 -7.172e-06  1.371e-04  -0.052  0.95831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

## Residual standard error: 0.04512 on 419 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared: 0.1075,	Adjusted R-squared: 0.09685 
## F-statistic: 10.09 on 5 and 419 DF,  p-value: 3.867e-09 


## A model without the biggest duration points
durationsGe.clipped = durationsGe[durationsGe$DurationOfPrefix < 0.27,]
fit.2 = lm( DurationOfPrefix ~  durationsGe.clipped$Frequency  + Sex  + SpeechRate + NumberSegmentsOnset + Age, data = durationsGe.clipped)

summary(fit.2)

## Call:
## lm(formula = DurationOfPrefix ~ Frequency + Sex + SpeechRate + 
##     NumberSegmentsOnset + Age, data = durationsGe.clipped)

## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.093180 -0.030800 -0.000976  0.024868  0.129151 

## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.743e-01  1.307e-02  13.340  < 2e-16 ***
## Frequency           -1.066e-05  3.771e-06  -2.826  0.00494 ** 
## Sexmale              5.745e-03  4.193e-03   1.370  0.17136    
## SpeechRate          -7.798e-03  1.547e-03  -5.039    7e-07 ***
## NumberSegmentsOnset -6.045e-03  3.577e-03  -1.690  0.09179 .  
## Age                 -1.266e-05  1.306e-04  -0.097  0.92280    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

## Residual standard error: 0.04241 on 414 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared: 0.08289,	Adjusted R-squared: 0.07181 
## F-statistic: 7.483 on 5 and 414 DF,  p-value: 9.724e-07 


## Log transform frequency
fit.2.1 = lm( DurationOfPrefix ~  log(Frequency)  + Sex  + SpeechRate + NumberSegmentsOnset + Age, data = durationsGe.clipped)


## Call:
## lm(formula = DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate + 
##     NumberSegmentsOnset + Age, data = durationsGe.clipped)

## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0991713 -0.0311861 -0.0002442  0.0268042  0.1318496 

## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.839e-01  1.348e-02  13.646  < 2e-16 ***
## log(Frequency)      -4.069e-03  1.206e-03  -3.373 0.000812 ***
## Sexmale              5.469e-03  4.171e-03   1.311 0.190480    
## SpeechRate          -7.573e-03  1.543e-03  -4.909 1.32e-06 ***
## NumberSegmentsOnset -6.562e-03  3.571e-03  -1.837 0.066887 .  
## Age                  1.055e-05  1.299e-04   0.081 0.935300    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

## Residual standard error: 0.04224 on 414 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared: 0.0902,	Adjusted R-squared: 0.07921 
## F-statistic: 8.209 on 5 and 414 DF,  p-value: 2.089e-07