Attachment 'TingQian-hw1.R'
Download 1 ##############################
2 # HLP Lab - Summer Stats Class
3 # HW #1
4 # Ting Qian
5 ##############################
6
7 #############################
8 # G&H07 3.9 (pp. 50-51) #
9 # Exercises 3 and 5 #
10 #############################
11
12 ### 3(a) ###
13 var1 <- rnorm(1000,0,1)
14 var2 <- rnorm(1000,0,1)
15
16 ### fit a linear model ###
17 fit <- lm(var1 ~ var2)
18 summary(fit)
19
20 ### output: ###
21 #Coefficients:
22 # Estimate Std. Error t value Pr(>|t|)
23 #(Intercept) 0.0460 0.0309 1.49 0.14
24 #var2 0.0520 0.0321 1.62 0.11
25
26 ### answer: no. because p(var2) > 0.05 ###
27
28 #############
29 ### 3(b) ####
30
31 ### load library "arm" ###
32 library(arm)
33
34 ### Run the loop as provided ###
35 z.scores <- rep(NA,100)
36 for (k in 1:100) {
37 var1 <- rnorm(1000,0,1)
38 var2 <- rnorm(1000,0,1)
39 fit <- lm(var2 ~ var1)
40 z.scores[k] <- coef(fit)[2]/se.coef(fit)[2]
41 }
42
43 ### display the number of z.scores > 2 ###
44 length(z.scores[z.scores>2])
45
46 ### output: [1] 2###
47 ### Answer: Two of them are significant ###
48
49 ############
50 ### 5(a) ####
51
52 beauty <- read.csv("ProfEvaltnsBeautyPublic.csv", sep=",", header=T)
53 attach(beauty)
54 plot(btystdave, courseevaluation)
55
56 # fit a model, controling for gender
57 fit.b <- lm(courseevaluation ~ btystdave + female)
58 summary(fit.b)
59
60 # output
61 # Coefficients:
62 # Estimate Std. Error t value Pr(>|t|)
63 # (Intercept) 4.0947 0.0333 123.03 < 2e-16 ***
64 # btystdave 0.1486 0.0320 4.65 4.3e-06 ***
65 # --an increase of 1 in beauty score leads to an increase of 0.15 in evaluation score
66 # female -0.1978 0.0510 -3.88 0.00012 ***
67 # --evaluation scores of male professors are 0.20 higher than female on average
68 #
69 # Residual standard error: 0.54 on 460 degrees of freedom
70 # --this pariticular model can predict evaluation scores to an accuracy of 0.54 points
71 # --which can be good in the sense gender and beauty do not determine students' opinion entirely
72
73 # graphical display
74 plot(fit.b)
75
76 # graphical display showing differences in gender
77 colors <- ifelse(female==1, "blue", "red")
78 plot(btystdave, courseevaluation, xlab="beauty score", ylab="eval score", col=colors)
79
80 abline(lm(courseevaluation ~ btystdave, data=beauty[female==0,]), col="red", lty=1)
81 abline(lm(courseevaluation ~ btystdave, data=beauty[female==1,]), col="blue", lty=1)
82 abline(lm(courseevaluation ~ btystdave, data=beauty), lty=1)
83 legend("bottomright", c("female", "male", "all"), fill=c("blue", "red", "black"), title="gender groups",ncol=3)
84
85 # plot residuals vs. fitted values
86 plot(fitted.values(fit.b), residuals(fit.b), xlab="fitted values - lm(courseevaluation ~ btystdave + female)", ylab="residuals", main="residuals vs. fitted")
87
88 ### 5(b) ###
89
90 # #1
91 # fit a model, controlling for age, gender, and nationality
92 fit.b <- lm(courseevaluation ~ btystdave + female + nonenglish + age)
93 summary(fit.b)
94
95 # output
96 # Coefficients:
97 # Estimate Std. Error t value Pr(>|t|)
98 # (Intercept) 4.24446 0.14153 29.99 < 2e-16 ***
99 # btystdave 0.14103 0.03291 4.28 2.2e-05 ***
100 # --an increase of 1 in beauty score leads to an increase of 0.14 in course evaluation for both genders, different ages, and nationalities.
101 # female -0.21030 0.05230 -4.02 6.8e-05 ***
102 # --evaluation scores of male professors are 0.21 higher than female on average
103 # nonenglish -0.33223 0.10374 -3.20 0.0015 **
104 # --nonenglish(non-native speakers?) professors are less preferred by students by a decrease of 0.33 in evaluation score
105 # age -0.00259 0.00274 -0.94 0.3459
106 # --this is an insignificant effect. the coefficient is also close to 0, meaning age does not affect evaluation
107
108 # #2
109 # It is possible that gender and beauty interacts: ;-)
110 # fit a model, controlling for age and nationality
111 fit.b1 <- lm(courseevaluation ~ (btystdave * female) + nonenglish + age)
112 summary(fit.b1)
113
114 # output
115 # Coefficients:
116 # Estimate Std. Error t value Pr(>|t|)
117 # (Intercept) 4.22998 0.14142 29.91 < 2e-16 ***
118 # btystdave 0.19513 0.04454 4.38 1.5e-05 ***
119 # female -0.21527 0.05225 -4.12 4.5e-05 ***
120 # nonenglish -0.33813 0.10354 -3.27 0.0012 **
121 # age -0.00211 0.00275 -0.77 0.4429
122 # btystdave:female -0.11444 0.06367 -1.80 0.0729 .
123
124 # after allowing interaction between beauty and gender, we see some interesting results:
125 # 1) non-native English speaking professors still have scores lower than native speakers by 0.33 on average
126 # 2) age of professor is still an irrelavant factor in course evaluation
127 # 3) for professors who get a beauty score of 0, female professors do get even worse evaluation scores than male professors by 0.21
128 # 4) among male professors, being ranked as more beautiful (cute/handsome?) helps to boost evaluation score by 0.19 for every increase of 1 in beautifulness.
129 # 5) the interaction between beauty and gender is marginally significant (p = 0.07)
130 # this means, perhaps, beautifulness is less of an influential factor for students to determine the quality of the course for female professors than male professors.
131 # (i.e. the slope of btystdave for female is smaller than that of male)
132
133
134 #############################
135 # Baa08 4.7 (p. 126) #
136 # Exercises 3 and 7 #
137 #############################
138
139 # Exercise 3 #
140
141 library(languageR)
142
143 # peek at the data
144 summary(durationsGe)
145 attach(durationsGe)
146 plot(log(Frequency), DurationOfPrefix)
147
148 # transform the frequency (predictor) values to log z scores
149 durationsGe$z.log.Frequency <- as.numeric(scale(log(Frequency)))
150
151 # look at the new distribution
152 attach(durationsGe)
153 histogram(z.log.Frequency)
154
155 # the distribution of log(frequency) should be roughly normal #
156 # raw frequency data can't be normal; instead, it should be a zipf distribution #
157 shapiro.test(z.log.Frequency)
158
159 # Do the same thing to the dependent variable: DurationOfPrefix #
160 durationsGe$z.DurationOfPrefix <- as.numeric(scale(DurationOfPrefix))
161 attach(durationsGe)
162 histogram(z.DurationOfPrefix)
163
164 #### remove outliers ###
165 d <- subset(durationsGe, abs(z.log.Frequency) < 2 & abs(z.DurationOfPrefix) < 2)
166 nrow(d)
167 # output: 397. we filtered out 31 data points
168
169 # scatterplot
170 plot(log(d$Frequency), d$DurationOfPrefix)
171 # a trend is almost perceivable
172 abline(lm(DurationOfPrefix ~ log(Frequency), d))
173
174 #### model fitting ###
175 # here we use raw data - but filtered according to z scores
176 fit.ge <- lm(DurationOfPrefix ~ log(Frequency), d)
177 summary(fit.ge)
178
179 # output: Coefficients:
180 # Estimate Std. Error t value Pr(>|t|)
181 # (Intercept) 0.13178 0.00417 31.61 <2e-16 ***
182 # log(Frequency) -0.00365 0.00128 -2.85 0.0046 **
183 #
184 # Residual standard error: 0.039 on 395 degrees of freedom
185 # Multiple R-squared: 0.0201, Adjusted R-squared: 0.0176
186 # F-statistic: 8.1 on 1 and 395 DF, p-value: 0.00465
187
188 # therefore, it takes less time to pronounce the "ge" prefix for high-frequency words.
189 # specifically, an increase of one frequency count decreases pronunciation time by 3.6/1000 fold.
190
191
192 # Exercise 7 #
193
194 # as instruced, remove data where the nasal was absent
195 d.ont <- durationsOnt[durationsOnt$DurationPrefixNasal!=0,]
196
197 attach(d.ont)
198 summary(Frequency)
199 # Min. 1st Qu. Median Mean 3rd Qu. Max.
200 # 0.000 1.386 2.639 2.658 3.892 6.725
201 # I ASSUME THESE ARE LOG FREQUENCIES, NOT RAW FREQUENCIES
202
203 d.ont$z.log.Frequency <- as.numeric(scale(Frequency))
204 d.ont$z.DurationPrefixNasal <- as.numeric(scale(DurationPrefixNasal))
205
206 d.ont.1 <- d.ont[abs(d.ont$z.log.Frequency) < 2 & abs(d.ont$z.DurationPrefixNasal) < 2,]
207 plot(d.ont.1$PlosivePresent, d.ont.1$DurationPrefixNasal)
208
209 # test for normality
210 shapiro.test(d.ont.1$DurationPrefixNasal)
211
212 # Shapiro-Wilk normality test
213 # data: d.ont.1$DurationPrefixNasal
214 # W = 0.9857, p-value = 0.4303
215 #
216 # YES! The outcome variable is normal now.
217
218 #### model fitting ####
219 library(Design)
220 dd.ont.1 <- datadist(d.ont.1)
221 options(datadist='dd.ont.1')
222 ol.ont <- ols(DurationPrefixNasal ~ PlosivePresent + Frequency, d.ont.1)
223 ol.ont
224
225 ### analysis of covariance ###
226 # Coefficients:
227 # Value Std. Error t Pr(>|t|)
228 # Intercept 0.069993 0.0034353 20.375 0.000e+00
229 # PlosivePresent=yes -0.019660 0.0031140 -6.313 1.114e-08
230 # Frequency -0.002145 0.0008854 -2.422 1.750e-02
231 #
232 # Residual standard error: 0.01286 on 87 degrees of freedom
233 # Adjusted R-Squared: 0.3408
234
235 # Interpretation:
236 # 1) intercept: when the plosive is not present and the frequency of the word is 1 (log(F)=0), the duration of nasal is 0.07s
237 # 2) when the plosive is present, the duration of nasal decreases by 0.02s compared to when it isn't.
238 # 3) the frequency of the word being pronunced has a negative correlation to duration of nasal: for
239 # an increase of one frequency count decreases pronunciation time by 2.1/1000 fold.
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.