attachment:TingQian-hw1.R of HLPMiniCourseSession1

   1 ##############################
   2 # HLP Lab - Summer Stats Class
   3 # HW #1
   4 # Ting Qian
   5 ##############################
   6 
   7 #############################
   8 # G&H07 3.9 (pp. 50-51)     #
   9 # Exercises 3 and 5         #
  10 #############################
  11 
  12 ### 3(a) ###
  13 var1 <- rnorm(1000,0,1)
  14 var2 <- rnorm(1000,0,1)
  15 
  16 ### fit a linear model ###
  17 fit <- lm(var1 ~ var2)
  18 summary(fit)
  19 
  20 ### output: ###
  21 #Coefficients:
  22 #            Estimate Std. Error t value Pr(>|t|)
  23 #(Intercept)   0.0460     0.0309    1.49     0.14
  24 #var2          0.0520     0.0321    1.62     0.11
  25 
  26 ### answer: no. because p(var2) > 0.05 ###
  27 
  28 #############
  29 ### 3(b) ####
  30 
  31 ### load library "arm" ###
  32 library(arm)
  33 
  34 ### Run the loop as provided ###
  35 z.scores <- rep(NA,100)
  36 for (k in 1:100) {
  37  var1 <- rnorm(1000,0,1)
  38  var2 <- rnorm(1000,0,1)
  39  fit <- lm(var2 ~ var1)
  40  z.scores[k] <- coef(fit)[2]/se.coef(fit)[2]
  41 }
  42 
  43 ### display the number of z.scores > 2 ###
  44 length(z.scores[z.scores>2])
  45 
  46 ### output: [1] 2###
  47 ### Answer: Two of them are significant ###
  48 
  49 ############
  50 ### 5(a) ####
  51 
  52 beauty <- read.csv("ProfEvaltnsBeautyPublic.csv", sep=",", header=T)
  53 attach(beauty)
  54 plot(btystdave, courseevaluation)
  55 
  56 # fit a model, controling for gender
  57 fit.b <- lm(courseevaluation ~ btystdave + female)
  58 summary(fit.b)
  59 
  60 # output
  61 # Coefficients:
  62 #             Estimate Std. Error t value Pr(>|t|)    
  63 # (Intercept)   4.0947     0.0333  123.03  < 2e-16 ***
  64 # btystdave     0.1486     0.0320    4.65  4.3e-06 ***
  65 #   --an increase of 1 in beauty score leads to an increase of 0.15 in evaluation score
  66 # female       -0.1978     0.0510   -3.88  0.00012 ***
  67 #   --evaluation scores of male professors are 0.20 higher than female on average
  68 #
  69 # Residual standard error: 0.54 on 460 degrees of freedom
  70 #   --this pariticular model can predict evaluation scores to an accuracy of 0.54 points
  71 #   --which can be good in the sense gender and beauty do not determine students' opinion entirely
  72 
  73 # graphical display
  74 plot(fit.b)
  75 
  76 # graphical display showing differences in gender
  77 colors <- ifelse(female==1, "blue", "red")
  78 plot(btystdave, courseevaluation, xlab="beauty score", ylab="eval score", col=colors)
  79 
  80 abline(lm(courseevaluation ~ btystdave, data=beauty[female==0,]), col="red", lty=1)
  81 abline(lm(courseevaluation ~ btystdave, data=beauty[female==1,]), col="blue", lty=1)
  82 abline(lm(courseevaluation ~ btystdave, data=beauty), lty=1)
  83 legend("bottomright", c("female", "male", "all"), fill=c("blue", "red", "black"), title="gender groups",ncol=3)
  84 
  85 # plot residuals vs. fitted values
  86 plot(fitted.values(fit.b), residuals(fit.b), xlab="fitted values - lm(courseevaluation ~ btystdave + female)", ylab="residuals", main="residuals vs. fitted")
  87 
  88 ### 5(b) ###
  89 
  90 # #1
  91 # fit a model, controlling for age, gender, and nationality
  92 fit.b <- lm(courseevaluation ~ btystdave + female + nonenglish + age)
  93 summary(fit.b)
  94 
  95 # output
  96 # Coefficients:
  97 #             Estimate Std. Error t value Pr(>|t|)    
  98 # (Intercept)  4.24446    0.14153   29.99  < 2e-16 ***
  99 # btystdave    0.14103    0.03291    4.28  2.2e-05 ***
 100 #   --an increase of 1 in beauty score leads to an increase of 0.14 in course evaluation for both genders, different ages, and nationalities.
 101 # female      -0.21030    0.05230   -4.02  6.8e-05 ***
 102 #   --evaluation scores of male professors are 0.21 higher than female on average
 103 # nonenglish  -0.33223    0.10374   -3.20   0.0015 ** 
 104 #   --nonenglish(non-native speakers?) professors are less preferred by students by a decrease of 0.33 in evaluation score 
 105 # age         -0.00259    0.00274   -0.94   0.3459
 106 #   --this is an insignificant effect. the coefficient is also close to 0, meaning age does not affect evaluation
 107 
 108 # #2
 109 # It is possible that gender and beauty interacts: ;-)
 110 # fit a model, controlling for age and nationality
 111 fit.b1 <- lm(courseevaluation ~ (btystdave * female) + nonenglish + age)
 112 summary(fit.b1)
 113 
 114 # output
 115 # Coefficients:
 116 #                  Estimate Std. Error t value Pr(>|t|)    
 117 # (Intercept)       4.22998    0.14142   29.91  < 2e-16 ***
 118 # btystdave         0.19513    0.04454    4.38  1.5e-05 ***
 119 # female           -0.21527    0.05225   -4.12  4.5e-05 ***
 120 # nonenglish       -0.33813    0.10354   -3.27   0.0012 ** 
 121 # age              -0.00211    0.00275   -0.77   0.4429    
 122 # btystdave:female -0.11444    0.06367   -1.80   0.0729 .   
 123 
 124 # after allowing interaction between beauty and gender, we see some interesting results:
 125 # 1) non-native English speaking professors still have scores lower than native speakers by 0.33 on average
 126 # 2) age of professor is still an irrelavant factor in course evaluation
 127 # 3) for professors who get a beauty score of 0, female professors do get even worse evaluation scores than male professors by 0.21
 128 # 4) among male professors, being ranked as more beautiful (cute/handsome?) helps to boost evaluation score by 0.19 for every increase of 1 in beautifulness.
 129 # 5) the interaction between beauty and gender is marginally significant (p = 0.07)
 130 #    this means, perhaps, beautifulness is less of an influential factor for students to determine the quality of the course for female professors than male professors.
 131 #    (i.e. the slope of btystdave for female is smaller than that of male)
 132 
 133 
 134 #############################
 135 # Baa08 4.7 (p. 126)        #
 136 # Exercises 3 and 7         #
 137 #############################
 138 
 139 # Exercise 3 #
 140 
 141 library(languageR)
 142 
 143 # peek at the data
 144 summary(durationsGe)
 145 attach(durationsGe)
 146 plot(log(Frequency), DurationOfPrefix)
 147 
 148 # transform the frequency (predictor) values to log z scores
 149 durationsGe$z.log.Frequency <- as.numeric(scale(log(Frequency)))
 150 
 151 # look at the new distribution
 152 attach(durationsGe)
 153 histogram(z.log.Frequency)
 154 
 155 # the distribution of log(frequency) should be roughly normal #
 156 # raw frequency data can't be normal; instead, it should be a zipf distribution #
 157 shapiro.test(z.log.Frequency)
 158 
 159 # Do the same thing to the dependent variable: DurationOfPrefix #
 160 durationsGe$z.DurationOfPrefix <- as.numeric(scale(DurationOfPrefix))
 161 attach(durationsGe)
 162 histogram(z.DurationOfPrefix)
 163 
 164 #### remove outliers ###
 165 d <- subset(durationsGe, abs(z.log.Frequency) < 2 & abs(z.DurationOfPrefix) < 2)
 166 nrow(d)
 167 # output: 397. we filtered out 31 data points
 168 
 169 # scatterplot
 170 plot(log(d$Frequency), d$DurationOfPrefix)
 171 # a trend is almost perceivable
 172 abline(lm(DurationOfPrefix ~ log(Frequency), d))
 173 
 174 #### model fitting ###
 175 # here we use raw data - but filtered according to z scores
 176 fit.ge <- lm(DurationOfPrefix ~ log(Frequency), d)
 177 summary(fit.ge)
 178 
 179 # output: Coefficients:
 180 #               Estimate Std. Error t value Pr(>|t|)    
 181 # (Intercept)     0.13178    0.00417   31.61   <2e-16 ***
 182 # log(Frequency) -0.00365    0.00128   -2.85   0.0046 ** 
 183 #
 184 # Residual standard error: 0.039 on 395 degrees of freedom
 185 # Multiple R-squared: 0.0201,     Adjusted R-squared: 0.0176 
 186 # F-statistic:  8.1 on 1 and 395 DF,  p-value: 0.00465 
 187 
 188 # therefore, it takes less time to pronounce the "ge" prefix for high-frequency words.
 189 # specifically, an increase of one frequency count decreases pronunciation time by 3.6/1000 fold.
 190 
 191 
 192 # Exercise 7 #
 193 
 194 # as instruced, remove data where the nasal was absent
 195 d.ont <- durationsOnt[durationsOnt$DurationPrefixNasal!=0,]
 196 
 197 attach(d.ont)
 198 summary(Frequency)
 199 #   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 200 #  0.000   1.386   2.639   2.658   3.892   6.725 
 201 # I ASSUME THESE ARE LOG FREQUENCIES, NOT RAW FREQUENCIES
 202 
 203 d.ont$z.log.Frequency <- as.numeric(scale(Frequency))
 204 d.ont$z.DurationPrefixNasal <- as.numeric(scale(DurationPrefixNasal))
 205 
 206 d.ont.1 <- d.ont[abs(d.ont$z.log.Frequency) < 2 & abs(d.ont$z.DurationPrefixNasal) < 2,]
 207 plot(d.ont.1$PlosivePresent, d.ont.1$DurationPrefixNasal)
 208 
 209 # test for normality
 210 shapiro.test(d.ont.1$DurationPrefixNasal)
 211 
 212 #  Shapiro-Wilk normality test
 213 # data:  d.ont.1$DurationPrefixNasal 
 214 # W = 0.9857, p-value = 0.4303
 215 #
 216 # YES! The outcome variable is normal now.
 217 
 218 #### model fitting ####
 219 library(Design)
 220 dd.ont.1 <- datadist(d.ont.1)
 221 options(datadist='dd.ont.1')
 222 ol.ont <- ols(DurationPrefixNasal ~ PlosivePresent + Frequency, d.ont.1)
 223 ol.ont
 224 
 225 ### analysis of covariance ###
 226 # Coefficients:
 227 #                        Value Std. Error      t  Pr(>|t|)
 228 # Intercept           0.069993  0.0034353 20.375 0.000e+00
 229 # PlosivePresent=yes -0.019660  0.0031140 -6.313 1.114e-08
 230 # Frequency          -0.002145  0.0008854 -2.422 1.750e-02
 231 #
 232 # Residual standard error: 0.01286 on 87 degrees of freedom
 233 # Adjusted R-Squared: 0.3408 
 234 
 235 # Interpretation:
 236 # 1) intercept: when the plosive is not present and the frequency of the word is 1 (log(F)=0), the duration of nasal is 0.07s
 237 # 2) when the plosive is present, the duration of nasal decreases by 0.02s compared to when it isn't.
 238 # 3) the frequency of the word being pronunced has a negative correlation to duration of nasal: for 
 239 #    an increase of one frequency count decreases pronunciation time by 2.1/1000 fold.
Attachment 'TingQian-hw1.R'

Attached Files