Attachment 'LSA13-Lecture4-plyr-reshape.R'
Download 1 ## @knitr preamble, echo=FALSE, results='hide', cache=FALSE, message=F
2 library(plyr)
3 library(reshape2)
4
5 library(ggplot2)
6 library(stringr)
7
8 # set knitr chunk option defaults for this document
9 opts_chunk$set(results='markup', echo=TRUE, cache=T, autodep=T, warning=F, message=F, prompt=F,
10 fig.width=4, fig.height=4, out.width='0.5\\textwidth', fig.pos='!h', fig.lp='fig:',
11 tidy=F, size='scriptsize')
12
13 data(lexdec, package='languageR')
14
15 # set ggplot theme globally
16 theme_set(theme_bw())
17
18
19
20 ## @knitr
21 freqEffects <- ddply(lexdec, .(Subject), function(df) {coef(lm(RT~Frequency, data=df))})
22
23
24 ## @knitr echo=F
25 head(freqEffects)
26
27
28 ## @knitr lexdec-freq-effect-bysub, echo=F, out.width='0.9\\textwidth'
29 ggplot(lexdec, aes(x=Frequency, y=RT)) +
30 geom_point() +
31 facet_wrap(~Subject) +
32 geom_abline(data=freqEffects, aes(slope=Frequency, intercept=`(Intercept)`), color='red')
33
34
35 ## @knitr
36 mean(runif(100))
37
38
39 ## @knitr
40 f <- mean
41 f(runif(100))
42
43
44 ## @knitr
45 function(x) sum(x)/length(x)
46
47
48 ## @knitr
49 mymean <- function(x) sum(x)/length(x)
50 mymean(runif(100))
51 mymean(runif(100, 1, 2))
52
53
54 ## @knitr
55 (function(x) sum(x)/length(x)) (runif(100))
56
57
58 ## @knitr
59 make.power <- function(n) {
60 return(function(x) x^n)
61 }
62 my.square <- make.power(2)
63 my.square(3)
64 (make.power(4)) (2)
65
66
67 ## @knitr
68 sd(rnorm(sd=5, mean=1, 100))
69
70
71 ## @knitr
72 rnorm
73
74
75 ## @knitr
76 list.o.nums <- list(runif(100), rnorm(100), rpois(100, lambda=1))
77 lapply(list.o.nums, mean)
78
79
80 ## @knitr
81 sapply(1:10, function(n) rpois(5, lambda=n))
82
83
84 ## @knitr
85 sapply(1:10, function(n) mean(rnorm(n=5, mean=0, sd=1)))
86
87
88 ## @knitr
89 data(lexdec, package='languageR')
90 RT.bysub <- with(lexdec, split(RT, Subject))
91 RT.means.bysub <- sapply(RT.bysub, mean)
92 head(data.frame(RT.mean=RT.means.bysub))
93
94
95 ## @knitr
96 library(plyr)
97 head(ddply(lexdec, .(Subject), function(df) data.frame(RT.mean=mean(df$RT))))
98
99
100 ## @knitr eval=F
101 ddply(lexdec, .(Subject), function(df) data.frame(RT.mean=mean(df$RT)))
102
103
104 ## @knitr ddply-output-exercises, eval=F
105 library(plyr)
106 data(lexdec, package='languageR') # load the dataset if it isn't already
107 ddply(lexdec, .(PrevType, Class), function(df) with(df, data.frame(meanRT=mean(RT))))
108
109
110 ## @knitr
111 slowestTrials <- ddply(lexdec, .(Subject), subset, RT==max(RT))
112 head(slowestTrials[, c('Subject', 'RT', 'Trial', 'Word')])
113
114
115 ## @knitr eval=FALSE
116 ## ddply(lexdec, .(Subject), function(df, ...) subset(df, ...), RT==max(RT))
117 ## ddply(lexdec, .(Subject), function(df) subset(df, RT==max(RT)))
118
119
120 ## @knitr eval=F
121 ddply(lexdec, .(Subject), function(df) {
122 df$RT.s <- scale(df$RT)
123 return(df)
124 })
125
126
127 ## @knitr
128 lexdecScaledRT <- ddply(lexdec, .(Subject), transform, RT.s=scale(RT))
129
130
131 ## @knitr summarise-RT-dist-plots, out.width='0.45\\textwidth', echo=FALSE, results='hide'
132 ggplot(lexdecScaledRT, aes(x=RT, group=Subject)) + geom_density()
133 ggplot(lexdecScaledRT, aes(x=RT.s, group=Subject)) + geom_density()
134
135
136 ## @knitr eval=FALSE
137 ## summarise(.data, summVar1=expr1, summVar2=expr2, ...)
138
139
140 ## @knitr
141 lexdec.RTsumm <- ddply(lexdec, .(Subject), summarise, mean=mean(RT), var=var(RT))
142 head(lexdec.RTsumm)
143
144
145 ## @knitr eval=F
146 ddply(lexdec, .(Subject), function(df) with(df, data.frame(meanRT=mean(RT))))
147
148
149 ## @knitr
150 lexdec <- ddply(lexdec, .(Subject), transform, RTrank=rank(RT))
151
152
153 word.rt.ranks <- ddply(lexdec, .(Word, Frequency), summarise,
154 RTrank=mean(RTrank),
155 RTrank25=quantile(RTrank, 0.25), RTrank75=quantile(RTrank, 0.75))
156
157
158 ## @knitr out.width='0.9\\textwidth'
159 ggplot(word.rt.ranks, aes(x=Frequency, y=RTrank, ymin=RTrank25, ymax=RTrank75)) +
160 geom_pointrange()
161
162
163 ## @knitr
164 subset(word.rt.ranks,
165 RTrank %in% c(max(RTrank), min(RTrank)))
166
167
168 ## @knitr
169 ddply(lexdec, .(NativeLanguage), summarise, acc=mean(Correct=='correct'))
170
171
172 ## @knitr
173 daply(lexdec, .(NativeLanguage, Correct), nrow)
174
175
176 ## @knitr
177 correctCounts <- daply(lexdec, .(NativeLanguage, Correct), nrow)
178 chisq.test(correctCounts)
179
180
181 ## @knitr
182 subjectSlopes <- ddply(lexdec, .(Subject), function(df) {coef(lm(RT~Frequency, data=df))})
183
184
185 ## @knitr
186 summary(subjectSlopes$Frequency)
187
188
189 ## @knitr subject-slopes-rt
190 ggplot(lexdec, aes(x=Frequency, y=RT)) +
191 geom_point() +
192 facet_wrap(~Subject) +
193 geom_abline(data=subjectSlopes, aes(slope=Frequency, intercept=`(Intercept)`), color='red')
194
195
196 ## @knitr
197 lexdec <- transform(lexdec,
198 FreqHiLo=factor(ifelse(Frequency>median(Frequency),
199 'high', 'low'),
200 levels=c('low', 'high')))
201
202
203 ## @knitr results='hide', echo=FALSE
204 # this is necessary because of something strange in how knitr/plyr are
205 # using environments
206 se <- function(x) sd(x)/sqrt(length(x))
207
208
209 ## @knitr
210 se <- function(x) sd(x)/sqrt(length(x))
211 langfreq.summ <- ddply(lexdec,
212 .(NativeLanguage, FreqHiLo),
213 summarise, mean=mean(RT), se=se(RT))
214
215
216 ## @knitr fig.width=6, out.width='0.75\\textwidth'
217 ggplot(langfreq.summ, aes(x=FreqHiLo, color=NativeLanguage,
218 y=mean, ymin=mean-1.96*se, ymax=mean+1.96*se)) +
219 geom_point() +
220 geom_errorbar(width=0.1) +
221 geom_line(aes(group=NativeLanguage))
222
223
224 ## @knitr
225 langfreq.bysub <- ddply(lexdec, .(NativeLanguage, FreqHiLo, Subject),
226 summarise, RT=mean(RT))
227
228
229 ## @knitr
230 langfreq.bysub.summ <- ddply(langfreq.bysub,
231 .(NativeLanguage, FreqHiLo),
232 summarise, mean=mean(RT), se=se(RT))
233
234
235 ## @knitr by-subject-errorbars, fig.width=6, out.width='0.75\\textwidth'
236 ggplot(langfreq.bysub.summ, aes(x=FreqHiLo, color=NativeLanguage,
237 y=mean, ymin=mean-1.96*se, ymax=mean+1.96*se)) +
238 geom_point() +
239 geom_errorbar(width=0.1) +
240 geom_line(aes(group=NativeLanguage))
241
242
243 ## @knitr
244 langfreq.byitem <- ddply(lexdec, .(NativeLanguage, FreqHiLo, Word), summarise, RT=mean(RT))
245 langfreq.byitem.summ <- ddply(langfreq.byitem,
246 .(NativeLanguage, FreqHiLo),
247 summarise, mean=mean(RT), se=se(RT))
248
249 ## ggplot(langfreq.byitem.summ, aes(x=FreqHiLo, color=NativeLanguage,
250 ## y=mean, ymin=mean-2*se, ymax=mean+2*se)) +
251 ## geom_point() +
252 ## geom_errorbar(width=0.1) +
253 ## geom_line(aes(group=NativeLanguage))
254
255
256 ## @knitr echo=F, results='hide', fig.show='hide'
257 library(plyr)
258 library(mvtnorm)
259 library(lme4)
260
261 make.data.generator <- function(true.effects=c(0,0),
262 resid.var=1,
263 ranef.var=diag(c(1,1)),
264 n.subj=24,
265 n.obs=24
266 )
267 {
268 # create design matrix for our made up experiment
269 data.str <- data.frame(freq=factor(c(rep('high', n.obs/2), rep('low', n.obs/2))))
270 contrasts(data.str$freq) <- contr.sum(2)
271 model.mat <- model.matrix(~ 1 + freq, data.str)
272
273 generate.data <- function() {
274 # sample data set
275 simulated.data <- rdply(n.subj, {
276 beta <- t(rmvnorm(n=1, sigma=ranef.var)) + true.effects
277 expected.RT <- model.mat %*% beta
278 epsilon <- rnorm(n=length(expected.RT), mean=0, sd=sqrt(resid.var))
279 data.frame(data.str,
280 RT=expected.RT + epsilon)
281 })
282 names(simulated.data)[1] <- 'subject'
283 simulated.data
284 }
285 }
286
287 fit.models <- function(simulated.data) {
288 # fit models and extract coefs
289 lm.coefs <- coefficients(summary(lm(RT ~ 1+freq, simulated.data)))[, 1:3]
290 rand.int.coefs <- summary(lmer(RT ~ 1+freq + (1|subject), simulated.data))@coefs
291 rand.slope.coefs <- summary(lmer(RT ~ 1+freq + (1+freq|subject), simulated.data))@coefs
292 # format output all pretty
293 rbind(data.frame(model='lm', predictor=rownames(lm.coefs), lm.coefs),
294 data.frame(model='rand.int', predictor=rownames(rand.int.coefs), rand.int.coefs),
295 data.frame(model='rand.slope', predictor=rownames(rand.slope.coefs), rand.slope.coefs))
296 }
297
298
299 gen.dat <- make.data.generator()
300
301 simulations <- rdply(.n=100,
302 fit.models(gen.dat()),
303 .progress='text')
304
305 head(simulations, n=20)
306 sim.m <- melt(simulations, measure.var=c('Estimate', 'Std..Error', 't.value'))
307 head(sim.m)
308
309 ddply(simulations, .(model, predictor), summarise, type1err=mean(abs(t.value)>1.96))
310
311
312
313 ## @knitr data-simulation-model-fitting-preview, fig.width=5, fig.height=3.5, out.width='\\textwidth', echo=F, results='hide'
314
315 ggplot(simulations, aes(x=t.value, color=model)) +
316 geom_vline(xintercept=c(-1.96, 1.96), color='#888888', linetype=3) +
317 scale_x_continuous('t value') +
318 geom_density() +
319 facet_grid(predictor~.)
320
321
322
323 ## @knitr eval=F
324 library(plyr)
325 library(mvtnorm)
326 library(lme4)
327
328 make.data.generator <- function(true.effects=c(0,0),
329 resid.var=1,
330 ranef.var=diag(c(1,1)),
331 n.subj=24,
332 n.obs=24
333 )
334 {
335 # create design matrix for our made up experiment
336 data.str <- data.frame(freq=factor(c(rep('high', n.obs/2), rep('low', n.obs/2))))
337 contrasts(data.str$freq) <- contr.sum(2)
338 model.mat <- model.matrix(~ 1 + freq, data.str)
339
340 generate.data <- function() {
341 # sample data set under mixed effects model with random slope/intercepts
342 simulated.data <- rdply(n.subj, {
343 beta <- t(rmvnorm(n=1, sigma=ranef.var)) + true.effects
344 expected.RT <- model.mat %*% beta
345 epsilon <- rnorm(n=length(expected.RT), mean=0, sd=sqrt(resid.var))
346 data.frame(data.str,
347 RT=expected.RT + epsilon)
348 })
349 names(simulated.data)[1] <- 'subject'
350 simulated.data
351 }
352 }
353
354
355 ## @knitr eval=F
356 fit.models <- function(simulated.data) {
357 # fit models and extract coefs
358 lm.coefs <- coefficients(summary(lm(RT ~ 1+freq, simulated.data)))[, 1:3]
359 rand.int.coefs <- summary(lmer(RT ~ 1+freq + (1|subject), simulated.data))@coefs
360 rand.slope.coefs <- summary(lmer(RT ~ 1+freq + (1+freq|subject), simulated.data))@coefs
361 # format output all pretty
362 rbind(data.frame(model='lm', predictor=rownames(lm.coefs), lm.coefs),
363 data.frame(model='rand.int', predictor=rownames(rand.int.coefs), rand.int.coefs),
364 data.frame(model='rand.slope', predictor=rownames(rand.slope.coefs), rand.slope.coefs))
365 }
366
367
368 ## @knitr eval=F
369 gen.dat <- make.data.generator()
370 simulations <- rdply(.n=100,
371 fit.models(gen.dat()),
372 .progress='text')
373
374
375 ## @knitr
376 head(simulations)
377 daply(simulations, .(model, predictor), function(df) type1err=mean(abs(df$t.value)>1.96))
378
379
380 ## @knitr fig.width=5, fig.height=3.5, out.width='0.75\\textwidth'
381 # use reshape2::melt to get the data into a more convenient format (see next section)
382 ggplot(simulations, aes(x=t.value, color=model)) +
383 geom_vline(xintercept=c(-1.96, 1.96), color='#888888', linetype=3) +
384 scale_x_continuous('t value') +
385 geom_density() +
386 facet_grid(predictor~.)
387
388
389 ## @knitr
390 head(params <- expand.grid(n.obs=c(4, 16, 64), n.subj=c(4, 16, 64)))
391
392
393 ## @knitr eval=F
394 man.simulations <- mdply(params, function(...) {
395 make.data <- make.data.generator(...)
396 rdply(.n=100, fit.models(make.data()))
397 }, .progress='text')
398
399
400 ## @knitr eval=F
401 mdply(params, function(...) {
402 make.data <- make.data.generator(...)
403 rdply(.n=100, fit.models(make.data()))
404 }, .progress='text')
405
406
407 ## @knitr eval=FALSE
408 ## ddply(params, .(n.obs, n.subj), function(df) {
409 ## make.data <- make.data.generator(n.obs=df$n.obs, n.subj=df$n.subj)
410 ## rdply(.n=100, .fun=fit.models(make.data()))
411 ## }, .progress='text')
412
413
414 ## @knitr echo=F
415 ld.wide[1:7, 1:7]
416
417
418 ## @knitr echo=F
419 ld.long[1:5, ]
420
421
422 ## @knitr echo=F
423 ld.wide[1:5, 1:4]
424
425
426 ## @knitr
427 ld.wide[1:5, 1:7]
428
429
430 ## @knitr
431 head(ld.m <- melt(ld.wide, id.var='Subject', na.rm=T))
432
433
434 ## @knitr
435 require(stringr)
436 trials.and.vars <- ldply(str_split(ld.m$variable, '_'))
437 names(trials.and.vars) <- c('Trial', 'measure')
438
439
440 ## @knitr
441 head(ld.m <- cbind(ld.m, trials.and.vars))
442
443
444 ## @knitr
445 head(dcast(ld.m, Subject+Trial ~ measure))
446
447
448 ## @knitr
449 head(dcast(ld.m, ... ~ measure))
450
451
452 ## @knitr
453 ld.m$variable <- NULL
454 head(dcast(ld.m, ... ~ measure))
455
456
457 ## @knitr
458 head(dcast(ld.m, Subject ~ measure))
459
460
461 ## @knitr results='hide'
462 melt(ld.wide, id.var='Subject', na.rm=T)
463
464
465 ## @knitr results='hide'
466 ddply(ld.wide, .(Subject), function(df) {
467 vars <- names(df)
468 vals <- t(df)
469 dimnames(vals) <- NULL
470 return(subset(data.frame(variable=vars, value=vals),
471 variable != 'Subject' & !is.na(value)))
472 })
473
474
475 ## @knitr results='hide'
476 head(dcast(ld.m, Subject+Trial ~ measure))
477
478
479 ## @knitr
480 head(ddply(ld.m, .(Subject, Trial), function(df) {
481 res <- data.frame(t(df$value))
482 names(res) <- df$measure
483 return(res)
484 }))
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.