Attachment 'BenVanDurme-hw1.R'
Download 1 ## Benjamin Van Durme, vandurme@cs.rochester.edu, 30 May 2008
2 ## Time-stamp: <2008-05-30 15:45:33 vandurme>
3
4
5 ## Baayen Section 4.7, Problem 3
6
7 names(durationsGe)  # column names of the data set
8 str(durationsGe)  # type and first few values of each column
9 summary(durationsGe)  # per-column summary statistics
10 attach(durationsGe)  # make the columns visible by bare name; NOTE(review): durationsGe is not created in this file, presumably it is loaded beforehand (e.g. from the languageR package) - confirm
11
12 ## Required for histogram
13 library(lattice)
14 ## Required for ols
15 library(Design)
16
17 ## Look at the overall distribution of prefix duration
18 histogram( DurationOfPrefix )
19
20 ## The following suggests that women have, on average, shorter durations than
21 ## men.
22 histogram(~ DurationOfPrefix | Sex)  # one panel per level of Sex
23
24 ## Tests for normality, run separately on each sex, reject the null hypothesis
25 ## that duration is normally distributed within that sub-population.
26 shapiro.test(durationsGe[durationsGe$Sex == "female",]$DurationOfPrefix)
27 shapiro.test(durationsGe[durationsGe$Sex == "male",]$DurationOfPrefix)
28
29 ## Look at each of the other (potential) predictors
30 histogram( YearOfBirth )
31 histogram( SpeechRate )
32 ## Frequency: presumably Zipf-like (a few very frequent items, many rare ones)
33 histogram( Frequency )
34
35 ## Assuming the study took place in 2005 (?), we transform YearOfBirth into
36 ## an approximate Age by subtracting it from the year of the study
37 durationsGe$Age <- 2005 - durationsGe$YearOfBirth
38 detach(durationsGe); attach(durationsGe)  # drop the stale attached copy (which lacks Age) before re-attaching, so the columns are not on the search path twice
39 histogram(Age)
40
41 ## This shows a spike in the last chart. Either an outlier, or a severe
42 ## lengthening of duration at a particular point of old age, or sparse data.
43 histogram( ~ DurationOfPrefix | round(Age / 10))  # one panel per value of Age / 10 rounded to the nearest integer
44
45 histogram( ~ DurationOfPrefix | Age > 70)  # two panels: Age above 70 versus not
46 histogram( ~ DurationOfPrefix | Age > 80)  # two panels: Age above 80 versus not
47
48
49 ## Frequency has a handful of extreme values, as the earlier histogram of
50 ## Frequency already suggested.
51 summary(Frequency)
52 ## Min. 1st Qu. Median Mean 3rd Qu. Max.
53 ## 1.00 6.00 18.00 125.40 59.25 8104.00
54
55 ## List the 20 largest frequency values
56 sort(Frequency, decreasing = TRUE)[1:20]
57 ## [1] 8104 4970 4121 2774 1839 1605 1301 1155 1113 1065 946 904 742 736 701
58 ## [16] 623 619 594 531 526
59
60 ## The second plot is more linear than the first
61 plot(sort(Frequency))  # raw frequencies in ascending order
62 plot(log(sort(Frequency)) + 1)  # log of the sorted frequencies; NOTE(review): the + 1 is added after taking the log, so it only shifts the curve up - confirm log(sort(Frequency) + 1) was not intended
63
64 ## This gives a more symmetric looking distribution
65 densityplot(rev(sort(DurationOfPrefix))[-(1:10)])  # density of the durations with the 10 largest values dropped
66
67 ## This again shows the highest duration values as potential outliers
68 boxplot(DurationOfPrefix)
69
70 big = durationsGe[durationsGe$DurationOfPrefix > 0.246479,]  # rows with the longest durations; NOTE(review): 0.246479 is presumably the upper whisker of the boxplot above - confirm
71 cbind(big$Frequency, big$DurationOfPrefix)  # column 1 = Frequency, column 2 = DurationOfPrefix
72 ## [,1] [,2]
73 ## [1,] 88 0.278665
74 ## [2,] 20 0.248249
75 ## [3,] 36 0.254832
76 ## [4,] 1 0.311227
77 ## [5,] 33 0.263485
78 ## [6,] 17 0.311799
79 ## [7,] 13 0.250762
80 ## [8,] 5 0.276309
81 ## [9,] 71 0.289032
82
83 ## Which all leaves me still uncertain of when we are justified in deleting data
84 ## points.
85
86 ## Baseline linear model: duration regressed on all candidate predictors
87 fit.1 <- lm(DurationOfPrefix ~ Frequency + Sex + SpeechRate + NumberSegmentsOnset + Age)  # variables are resolved through the attached durationsGe
88
89 summary(fit.1)
90
91 ## Call:
92 ## lm(formula = DurationOfPrefix ~ Frequency + Sex + SpeechRate +
93 ## NumberSegmentsOnset + Age)
94
95 ## Residuals:
96 ## Min 1Q Median 3Q Max
97 ## -0.0954003 -0.0318011 -0.0008356 0.0243314 0.1822600
98
99 ## Coefficients:
100 ## Estimate Std. Error t value Pr(>|t|)
101 ## (Intercept) 1.895e-01 1.364e-02 13.891 < 2e-16 ***
102 ## Frequency -1.108e-05 4.027e-06 -2.751 0.00619 **
103 ## Sexmale 4.182e-03 4.458e-03 0.938 0.34878
104 ## SpeechRate -9.646e-03 1.626e-03 -5.934 6.22e-09 ***
105 ## NumberSegmentsOnset -7.143e-03 3.804e-03 -1.878 0.06110 .
106 ## Age -3.327e-05 1.378e-04 -0.241 0.80940
107 ## ---
108 ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
109
110 ## Residual standard error: 0.0453 on 419 degrees of freedom
111 ## (3 observations deleted due to missingness)
112 ## Multiple R-squared: 0.1004, Adjusted R-squared: 0.08965
113 ## F-statistic: 9.351 on 5 and 419 DF, p-value: 1.848e-08
114
115
116 ## Same model, with Frequency entered on a log scale
117 fit.1.1 <- lm(DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate + NumberSegmentsOnset + Age)  # variables are resolved through the attached durationsGe
118 summary(fit.1.1)
119
120 ## Call:
121 ## lm(formula = DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate +
122 ## NumberSegmentsOnset + Age)
123
124 ## Residuals:
125 ## Min 1Q Median 3Q Max
126 ## -0.101688 -0.031895 -0.001376 0.026118 0.182423
127
128 ## Coefficients:
129 ## Estimate Std. Error t value Pr(>|t|)
130 ## (Intercept) 1.993e-01 1.407e-02 14.172 < 2e-16 ***
131 ## log(Frequency) -4.241e-03 1.281e-03 -3.312 0.00101 **
132 ## Sexmale 3.867e-03 4.435e-03 0.872 0.38378
133 ## SpeechRate -9.407e-03 1.621e-03 -5.802 1.29e-08 ***
134 ## NumberSegmentsOnset -7.678e-03 3.798e-03 -2.022 0.04384 *
135 ## Age -7.172e-06 1.371e-04 -0.052 0.95831
136 ## ---
137 ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
138
139 ## Residual standard error: 0.04512 on 419 degrees of freedom
140 ## (3 observations deleted due to missingness)
141 ## Multiple R-squared: 0.1075, Adjusted R-squared: 0.09685
142 ## F-statistic: 10.09 on 5 and 419 DF, p-value: 3.867e-09
143
144
145
146 ## A model without the biggest duration points (only rows with a duration below 0.27 are kept)
147 durationsGe.clipped <- durationsGe[durationsGe$DurationOfPrefix < 0.27,]
148 fit.2 <- lm(DurationOfPrefix ~ Frequency + Sex + SpeechRate + NumberSegmentsOnset + Age, data = durationsGe.clipped)  # Frequency is named bare so it is looked up in `data` like the other terms and the coefficient is labelled "Frequency"
149
150 summary(fit.2)
151
152 ## Call:
153 ## lm(formula = DurationOfPrefix ~ Frequency + Sex + SpeechRate +
154 ## NumberSegmentsOnset + Age, data = durationsGe.clipped)
155
156 ## Residuals:
157 ## Min 1Q Median 3Q Max
158 ## -0.093180 -0.030800 -0.000976 0.024868 0.129151
159
160 ## Coefficients:
161 ## Estimate Std. Error t value Pr(>|t|)
162 ## (Intercept) 1.743e-01 1.307e-02 13.340 < 2e-16 ***
163 ## Frequency -1.066e-05 3.771e-06 -2.826 0.00494 **
164 ## Sexmale 5.745e-03 4.193e-03 1.370 0.17136
165 ## SpeechRate -7.798e-03 1.547e-03 -5.039 7e-07 ***
166 ## NumberSegmentsOnset -6.045e-03 3.577e-03 -1.690 0.09179 .
167 ## Age -1.266e-05 1.306e-04 -0.097 0.92280
168 ## ---
169 ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
170
171 ## Residual standard error: 0.04241 on 414 degrees of freedom
172 ## (3 observations deleted due to missingness)
173 ## Multiple R-squared: 0.08289, Adjusted R-squared: 0.07181
174 ## F-statistic: 7.483 on 5 and 414 DF, p-value: 9.724e-07
175
176
177 ## Log transform frequency (fitted on the clipped data)
178 fit.2.1 <- lm(DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate + NumberSegmentsOnset + Age, data = durationsGe.clipped)
179 summary(fit.2.1)  # print the fit; the output recorded below comes from this call
180
181 ## Call:
182 ## lm(formula = DurationOfPrefix ~ log(Frequency) + Sex + SpeechRate +
183 ## NumberSegmentsOnset + Age, data = durationsGe.clipped)
184
185 ## Residuals:
186 ## Min 1Q Median 3Q Max
187 ## -0.0991713 -0.0311861 -0.0002442 0.0268042 0.1318496
188
189 ## Coefficients:
190 ## Estimate Std. Error t value Pr(>|t|)
191 ## (Intercept) 1.839e-01 1.348e-02 13.646 < 2e-16 ***
192 ## log(Frequency) -4.069e-03 1.206e-03 -3.373 0.000812 ***
193 ## Sexmale 5.469e-03 4.171e-03 1.311 0.190480
194 ## SpeechRate -7.573e-03 1.543e-03 -4.909 1.32e-06 ***
195 ## NumberSegmentsOnset -6.562e-03 3.571e-03 -1.837 0.066887 .
196 ## Age 1.055e-05 1.299e-04 0.081 0.935300
197 ## ---
198 ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
199
200 ## Residual standard error: 0.04224 on 414 degrees of freedom
201 ## (3 observations deleted due to missingness)
202 ## Multiple R-squared: 0.0902, Adjusted R-squared: 0.07921
203 ## F-statistic: 8.209 on 5 and 414 DF, p-value: 2.089e-07
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.