attachment:BenVanDurme-hw1.R of HLPMiniCourseSession1

   1 ## 	Benjamin Van Durme, vandurme@cs.rochester.edu, 30 May 2008
   2 ## 	Time-stamp: <2008-05-30 15:45:33 vandurme>
   3 
   4 
   5 ## Baayen Section 4.7, Problem 3
   6 
   7 names(durationsGe)  ## column names; NOTE(review): no visible library()/data() call provides durationsGe -- presumably it comes from the languageR package accompanying Baayen's book, confirm
   8 str(durationsGe)  ## compact structure: type and leading values of every column
   9 summary(durationsGe)  ## per-column summary statistics
  10 attach(durationsGe)  ## put the columns on the search path so later calls can use bare names such as DurationOfPrefix
  11 
  12 ## Required for histogram
  13 library(lattice)
  14 ## Required for ols
  15 library(Design)
  16 
  17 ## Look at the overall distribution of prefix duration
  18 histogram( DurationOfPrefix )
  19 
  20 ## The following suggests women overall have shorter duration than men, on
  21 ## average.
  22 histogram(~ DurationOfPrefix | Sex)  ## same histogram, one panel per level of Sex
  23 
  24 ## Tests for normality reject the null hypothesis that duration is normally
  25 ## distributed amongst the sub-populations partitioned by sex.
  26 shapiro.test(durationsGe[durationsGe$Sex == "female",]$DurationOfPrefix)  ## Shapiro-Wilk test on the rows with Sex == "female"
  27 shapiro.test(durationsGe[durationsGe$Sex == "male",]$DurationOfPrefix)  ## same test on the rows with Sex == "male"
  28 
  29 ## Look at each of the other (potential) predictors
  30 histogram( YearOfBirth )
  31 histogram( SpeechRate )
  32 ## Frequency is Zipf-like: a few very large values (see summary(Frequency) below)
  33 histogram( Frequency )
  34 
  35 ## Assuming the study took place in 2005 (?), approximate Age as 2005 - YearOfBirth.
  36 durationsGe$Age = 2005 - durationsGe$YearOfBirth
  37 detach(durationsGe)  ## drop the copy attached earlier: it lacks Age and would otherwise linger, masked, on the search path
  38 attach(durationsGe)  ## re-attach so that Age (and every other column) is visible by bare name
  39 histogram(Age)
  40 
  41 ## This shows a spike in the last chart. Either an outlier, or a severe
  42 ## lengthening of duration at a particular point of old age, or sparse data.
  43 histogram( ~ DurationOfPrefix | round(Age / 10))  ## conditioned on Age / 10 rounded to a whole number, i.e. Age to the nearest ten years
  44 
  45 histogram( ~ DurationOfPrefix | Age > 70)  ## two-way split: Age > 70 versus not
  46 histogram( ~ DurationOfPrefix | Age > 80)  ## same split with the cutoff at 80
  47 
  48 
  49 ## This shows that frequency has a few extreme points, which we saw earlier in
  50 ## the histogram.
  51 summary(Frequency)  ## min, quartiles, median, mean and max; the recorded output follows
  52 ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  53 ##    1.00    6.00   18.00  125.40   59.25 8104.00 
  54 
  55 ## Which values make up the high-frequency tail? List the 20 largest.
  56 sort(Frequency, decreasing = TRUE)[1:20]
  57 ## [1] 8104 4970 4121 2774 1839 1605 1301 1155 1113 1065 946 904 742 736 701
  58 ## [16] 623 619 594 531 526
  59 
  60 ## The second is more linear than the first
  61 plot(sort(Frequency))  ## sorted raw frequencies against their index in sorted order
  62 plot(log(sort(Frequency)) + 1)  ## NOTE(review): the + 1 is added after taking the log, so it only shifts the curve up by 1; if log(Frequency + 1) was meant, move it inside log() -- confirm
  63 
  64 ## This gives a more symmetric looking distribution
  65 densityplot(rev(sort(DurationOfPrefix))[-(1:10)])  ## density of the durations after dropping the 10 largest values
  66 
  67 ## This again shows the highest values under duration being potential outliers
  68 boxplot(DurationOfPrefix)
  69 
  70 big = durationsGe[durationsGe$DurationOfPrefix >  0.246479,]  ## rows above a hard-coded cutoff; presumably read off the boxplot above (upper whisker?) -- TODO confirm
  71 cbind(big$Frequency, big$DurationOfPrefix)  ## Frequency (column 1) next to DurationOfPrefix (column 2) for those rows
  72 ##       [,1]     [,2]
  73 ##  [1,]   88 0.278665
  74 ##  [2,]   20 0.248249
  75 ##  [3,]   36 0.254832
  76 ##  [4,]    1 0.311227
  77 ##  [5,]   33 0.263485
  78 ##  [6,]   17 0.311799
  79 ##  [7,]   13 0.250762
  80 ##  [8,]    5 0.276309
  81 ##  [9,]   71 0.289032
  82 
  83 ## Which all leaves me still uncertain of when we are justified in deleting data
  84 ## points.
  85 
  86 ## A basic model: linear regression of prefix duration on the five predictors below
  87 fit.1 = lm( DurationOfPrefix ~  Frequency  + Sex  + SpeechRate + NumberSegmentsOnset + Age)  ## no data= argument: the variables are found through the attach()ed durationsGe
  88 
  89 summary(fit.1)  ## the recorded output follows
  90 
  91 ## Call:
  92 ## lm(formula = DurationOfPrefix ~ Frequency + Sex + SpeechRate + 
  93 ##     NumberSegmentsOnset + Age)
  94 
  95 ## Residuals:
  96 ##        Min         1Q     Median         3Q        Max 
  97 ## -0.0954003 -0.0318011 -0.0008356  0.0243314  0.1822600 
  98 
  99 ## Coefficients:
 100 ##                       Estimate Std. Error t value Pr(>|t|)    
 101 ## (Intercept)          1.895e-01  1.364e-02  13.891  < 2e-16 ***
 102 ## Frequency           -1.108e-05  4.027e-06  -2.751  0.00619 ** 
 103 ## Sexmale              4.182e-03  4.458e-03   0.938  0.34878    
 104 ## SpeechRate          -9.646e-03  1.626e-03  -5.934 6.22e-09 ***
 105 ## NumberSegmentsOnset -7.143e-03  3.804e-03  -1.878  0.06110 .  
 106 ## Age                 -3.327e-05  1.378e-04  -0.241  0.80940    
 107 ## ---
 108 ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
 109 
 110 ## Residual standard error: 0.0453 on 419 degrees of freedom
 111 ##   (3 observations deleted due to missingness)
 112 ## Multiple R-squared: 0.1004,	Adjusted R-squared: 0.08965 
 113 ## F-statistic: 9.351 on 5 and 419 DF,  p-value: 1.848e-08 
 114 
 115 
 116 ## Log transform Frequency
 117 fit.1.1 = lm( DurationOfPrefix ~  log(Frequency)  + Sex  + SpeechRate + NumberSegmentsOnset + Age)  ## same terms as fit.1, with log(Frequency) in place of raw Frequency
 118 summary(fit.1.1)  ## the recorded output follows
 119 
 120 ## Call:
 121 ## lm(formula = DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate + 
 122 ##     NumberSegmentsOnset + Age)
 123 
 124 ## Residuals:
 125 ##       Min        1Q    Median        3Q       Max 
 126 ## -0.101688 -0.031895 -0.001376  0.026118  0.182423 
 127 
 128 ## Coefficients:
 129 ##                       Estimate Std. Error t value Pr(>|t|)    
 130 ## (Intercept)          1.993e-01  1.407e-02  14.172  < 2e-16 ***
 131 ## log(Frequency)      -4.241e-03  1.281e-03  -3.312  0.00101 ** 
 132 ## Sexmale              3.867e-03  4.435e-03   0.872  0.38378    
 133 ## SpeechRate          -9.407e-03  1.621e-03  -5.802 1.29e-08 ***
 134 ## NumberSegmentsOnset -7.678e-03  3.798e-03  -2.022  0.04384 *  
 135 ## Age                 -7.172e-06  1.371e-04  -0.052  0.95831    
 136 ## ---
 137 ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
 138 
 139 ## Residual standard error: 0.04512 on 419 degrees of freedom
 140 ##   (3 observations deleted due to missingness)
 141 ## Multiple R-squared: 0.1075,	Adjusted R-squared: 0.09685 
 142 ## F-statistic: 10.09 on 5 and 419 DF,  p-value: 3.867e-09 
 143 
 144 
 145 
 146 ## A model without the biggest duration points (only rows with DurationOfPrefix < 0.27 are kept)
 147 durationsGe.clipped = durationsGe[durationsGe$DurationOfPrefix < 0.27,]
 148 fit.2 = lm( DurationOfPrefix ~  Frequency  + Sex  + SpeechRate + NumberSegmentsOnset + Age, data = durationsGe.clipped)  ## every term is a bare column name resolved in data=, so the coefficient is labelled Frequency as in the recorded output
 149 
 150 summary(fit.2)
 151 
 152 ## Call:
 153 ## lm(formula = DurationOfPrefix ~ Frequency + Sex + SpeechRate + 
 154 ##     NumberSegmentsOnset + Age, data = durationsGe.clipped)
 155 
 156 ## Residuals:
 157 ##       Min        1Q    Median        3Q       Max 
 158 ## -0.093180 -0.030800 -0.000976  0.024868  0.129151 
 159 
 160 ## Coefficients:
 161 ##                       Estimate Std. Error t value Pr(>|t|)    
 162 ## (Intercept)          1.743e-01  1.307e-02  13.340  < 2e-16 ***
 163 ## Frequency           -1.066e-05  3.771e-06  -2.826  0.00494 ** 
 164 ## Sexmale              5.745e-03  4.193e-03   1.370  0.17136    
 165 ## SpeechRate          -7.798e-03  1.547e-03  -5.039    7e-07 ***
 166 ## NumberSegmentsOnset -6.045e-03  3.577e-03  -1.690  0.09179 .  
 167 ## Age                 -1.266e-05  1.306e-04  -0.097  0.92280    
 168 ## ---
 169 ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
 170 
 171 ## Residual standard error: 0.04241 on 414 degrees of freedom
 172 ##   (3 observations deleted due to missingness)
 173 ## Multiple R-squared: 0.08289,	Adjusted R-squared: 0.07181 
 174 ## F-statistic: 7.483 on 5 and 414 DF,  p-value: 9.724e-07 
 175 
 176 
 177 ## Log transform frequency (same clipped data as fit.2)
 178 fit.2.1 = lm( DurationOfPrefix ~  log(Frequency)  + Sex  + SpeechRate + NumberSegmentsOnset + Age, data = durationsGe.clipped)
 179 summary(fit.2.1)  ## produces the output recorded below
 180 
 181 ## Call:
 182 ## lm(formula = DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate + 
 183 ##     NumberSegmentsOnset + Age, data = durationsGe.clipped)
 184 
 185 ## Residuals:
 186 ##        Min         1Q     Median         3Q        Max 
 187 ## -0.0991713 -0.0311861 -0.0002442  0.0268042  0.1318496 
 188 
 189 ## Coefficients:
 190 ##                       Estimate Std. Error t value Pr(>|t|)    
 191 ## (Intercept)          1.839e-01  1.348e-02  13.646  < 2e-16 ***
 192 ## log(Frequency)      -4.069e-03  1.206e-03  -3.373 0.000812 ***
 193 ## Sexmale              5.469e-03  4.171e-03   1.311 0.190480    
 194 ## SpeechRate          -7.573e-03  1.543e-03  -4.909 1.32e-06 ***
 195 ## NumberSegmentsOnset -6.562e-03  3.571e-03  -1.837 0.066887 .  
 196 ## Age                  1.055e-05  1.299e-04   0.081 0.935300    
 197 ## ---
 198 ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
 199 
 200 ## Residual standard error: 0.04224 on 414 degrees of freedom
 201 ##   (3 observations deleted due to missingness)
 202 ## Multiple R-squared: 0.0902,	Adjusted R-squared: 0.07921 
 203 ## F-statistic: 8.209 on 5 and 414 DF,  p-value: 2.089e-07
Attachment 'BenVanDurme-hw1.R'

Attached Files