Attachment 'Lecture5.R'
Download 1 ## --------------------------------------
2 # This script loads a file called swbd.tab, which contains cases
3 # of a PP-ordering alternation. The cases are extracted from the
4 # Switchboard (swbd) Treebank portion, which is a spontaneous speech
5 # corpus.
6 #
7 # The goal is to test whether the results of Hawkins (1999) extend
8 # to spontaneous speech. Hawkins found that, in accordance with his
9 # theory of Domain Minimization (MiD) and Maximize Online Processing
10 # (MaOP) longer PPs follow shorter PPs in English VPs. To be precise,
11 # Hawkins found that the chance of the longer PP ordering after the
12 # shorter PP increases proportionally to the extent that the longer PP
13 # is longer than the shorter PP (in number of words).
14 #
15 # Hawkins considered only VPs with (only) two adjacent PPs. Here,
16 # disfluencies were allowed to intervene and we also included VPs that
17 # contained other non-PP arguments in addition to the two PPs. The
18 # two PPs had to be syntactic sisters and they cannot contain traces.
19 # The search pattern that was used to extract these cases from the corpus
20 # is available on the class webpage (cf. sample solution to Assignment 2).
21 ## --------------------------------------
22
23 # Lines starting with the # symbol are comments
24 # you can get help on any command below by typing
25 # help(command), e.g. help(setwd).
26 #
27 # I recommend Dalgaard (2003?) as an introduction
28 # to R. Not very exciting, but very readable. We also
29 # have R scripts for R intros online on our lab wiki
30 # (just search hlplab.wordpress.com for the link)
31 #
32 # let's set the working directory and then load the file
33 # <- means that we assign the value to the right to the
34 # variable on the left. You can also use -> to assign the value
35 # on the left to a variable on the right.
# Move to the data folder, then read the tab-separated corpus extract.
# The first row holds the column names; quote = "" switches off quote
# processing so quote characters are read as ordinary text.
setwd("C:\\Documents and Settings\\florian\\My Documents\\My LabMeetings, Tutorials, & Teaching\\LSA09\\data")
pp <- read.delim(file = "swbd.tab", header = TRUE, sep = "\t", quote = "")

# A quick first look at the data:
#   1) the number of rows,
#   2) the structure [str] of the data frame pp,
#   3) a [summary] of pp.
# [str] and [summary] work for most R objects.
nrow(pp)
str(pp)
summary(pp)
47
48 # Often we want to create new variables or change
49 # existing ones slightly
50 # Here, I extract the Part-Of-Speech [POS] information
51 # from a variable that contains values of the type "string%%POS"
52 # [gsub is a command that replaces parts of strings with other
53 # strings using regular expression syntax]
# Keep only the part after the last "%%" in POS (values look like
# "string%%POS"). [gsub] replaces regular-expression matches in strings.
pp$POS <- gsub(pattern = ".*%%", replacement = "", x = as.character(pp$POS))
table(pp$POS)
# [table] is a bit like summary for factors, but it lists ALL values.

# To run tests parallel to those in Hawkins (1999) we need an indicator
# of whether the longer PP is in 2nd position: 1 if PP2 is strictly
# longer than PP1, 0 otherwise.
# NOTE(review): equally long PPs are therefore coded 0 as well; the
# models below that do not filter on LenDiff != 0 include those rows.
pp$PPS_PPL <- as.numeric(pp$LenPP2 > pp$LenPP1)

# The type of each PP: the label after the last "%%" in POSPP1/POSPP2,
# stored as a factor.
pp$TypePP1 <- factor(gsub(pattern = ".*%%", replacement = "",
                          x = as.character(pp$POSPP1)))
pp$TypePP2 <- factor(gsub(pattern = ".*%%", replacement = "",
                          x = as.character(pp$POSPP2)))

# Flag PPs whose type label contains a digit. Fillers of traces are
# marked by a -NNN suffix, e.g. PP-1. Stripping every non-digit leaves
# a non-empty string exactly when the label contains a digit.
# NOTE(review): the original comment says these PPs are excluded later,
# but no exclusion on FillerPP1/FillerPP2 appears in this script.
pp$FillerPP1 <- gsub("[^0-9]", "", as.character(pp$TypePP1)) != ""
pp$FillerPP2 <- gsub("[^0-9]", "", as.character(pp$TypePP2)) != ""

# Information density of each PP: the ratio of its information content
# to its length (both taken from the 3-gram columns).
pp$InfoDensityPP1 <- with(pp, Information_PP1_3gram / Length_PP1_3gram)
pp$InfoDensityPP2 <- with(pp, Information_PP2_3gram / Length_PP2_3gram)

# Finally, the absolute differences between the two PPs in length
# (in words), information content, and information density. Hawkins
# used the absolute length difference to predict the chance that the
# longer PP is ordered 2nd.
pp$LenDiff <- with(pp, abs(LenPP2 - LenPP1))
pp$InfoDiff <- with(pp, abs(Information_PP2_3gram - Information_PP1_3gram))
pp$InfoDensityDiff <- with(pp, abs(InfoDensityPP2 - InfoDensityPP1))
86
87
88 ## -------------------------------------
89 # Exclusion
90 ## -------------------------------------
91 # You can exclude cases using the subset function. First, we
92 # exclude some extreme cases of PP length. A more careful examination
93 # reveals that these cases are problematic (wrongly parsed) anyway.
94 # The command [scale] can be used to standardize variables (to create
95 # z-scores).
# Keep only rows where both PP lengths lie within 2.5 standard
# deviations of their mean ([scale] yields z-scores). Rows for which
# the test evaluates to NA are dropped, as [subset] would do.
pp <- local({
  z_len1 <- abs(scale(pp$LenPP1))
  z_len2 <- abs(scale(pp$LenPP2))
  keep <- as.vector(z_len1 < 2.5 & z_len2 < 2.5)
  pp[keep & !is.na(keep), , drop = FALSE]
})
nrow(pp)

summary(pp$TypePP1)
100 ## Probably should be excluded since they are arguments:
101 #-PUT - marks the locative complement of put.
102 #-DTV (dative) - marks the dative object in the unshifted form of the double object construction. If the preposition introducing the "dative" object is for, it is considered benefactive (-BNF). -DTV (and -BNF) is only used after verbs that can undergo dative shift.
103 #-BNF (benefactive) - marks the beneficiary of an action (attaches to NP or PP).
104 #This tag is used only when (1) the verb can undergo dative shift and (2) the prepositional variant (with the same meaning) uses for. The prepositional objects of dative-shifting verbs with other prepositions than for (such as to or of) are annotated -DTV.
105 #-CLR (closely related) - marks constituents that occupy some middle ground between arguments and adjunct of the verb phrase. These roughly correspond to "predication adjuncts", prepositional ditransitives, and some "phrasel verbs". Although constituents marked with -CLR are not strictly speaking complements, they are treated as complements whenever it makes a bracketing difference. The precise meaning of -CLR depends somewhat on the category of the phrase.
106 # * on PP, ADVP, SBAR-PRP, etc - On categories that are ordinarily interpreted as (adjunct) adverbials, -CLR indicates a somewhat closer relationship to the verb. For example:
107 # o Prepositional Ditransitives
108 # In order to ensure consistency, the Treebank recognizes only a limited class of verbs that take more than one complement (-DTV and -PUT and Small Clauses) Verbs that fall outside these classes (including most of the prepositional ditransitive verbs in class [D2]) are often associated with -CLR.
109 # o Phrasal verbs
110 # Phrasal verbs are also annotated with -CLR or a combination of -PRT and PP-CLR. Words that are considered borderline between particle and adverb are often bracketed with ADVP-CLR.
111 # o Predication Adjuncts
112 # Many of Quirk's predication adjuncts are annotated with -CLR.
113 #-TTL (title) - is attached to the top node of a title when this title appears inside running text. -TTL implies -NOM. The internal structure of the title is bracketed as usual.
114 #-UNF - unfinished
115
116 ## Unclear whether it should be excluded:
117 #-PRD (predicate) - marks any predicate that is not VP. In the do so construction, the so is annotated as a predicate.
118
119 ## Probably ok to keep
120 #-any PP without a functional tag
121 #-DIR (direction) - marks adverbials that answer the questions "from where?" and "to where?" It implies motion, which can be metaphorical as in "...rose 5 pts. to 57-1/2" or "increased 70% to 5.8 billion yen" -DIR is most often used with verbs of motion/transit and financial verbs.
122 #-EXT (extent) - marks adverbial phrases that describe the spatial extent of an activity. -EXT was incorporated primarily for cases of movement in financial space, but is also used in analogous situations elsewhere. Obligatory complements do not receive -EXT. Words such as fully and completely are absolutes and do not receive -EXT.
123 #-LOC (locative) - marks adverbials that indicate place/setting of the event. -LOC may also indicate metaphorical location. There is likely to be some variation in the use of -LOC due to differing annotator interpretations. In cases where the annotator is faced with a choice between -LOC or -TMP, the default is -LOC. In cases involving SBAR, SBAR should not receive -LOC. -LOC has some uses that are not adverbial, such as with place names that are adjoined to other NPs and NAC-LOC premodifiers of NPs. The special tag -PUT is used for the locative argument of put.
124 #-MNR (manner) - marks adverbials that indicate manner, including instrument phrases.
125 #-PRP (purpose or reason) - marks purpose or reason clauses and PPs.
126 #-TMP (temporal) - marks temporal or aspectual adverbials that answer the questions when, how often, or how long. It has some uses that are not strictly adverbial, such as with dates that modify other NPs at S- or VP-level. In cases of apposition involving SBAR, the SBAR should not be labeled -TMP. Only in "financialspeak," and only when the dominating PP is a PP-DIR, may temporal modifiers be put at PP object level. Note that -TMP is not used in possessive phrases.
127
# Exclude rows whose PP type carries one of the argument-like
# functional tags listed above. [grepl] returns one TRUE/FALSE per row,
# so the rows to drop can be negated directly. A logical mask is used
# instead of negative [grep] indices because pp[-integer(0), ] (the
# result when nothing matches) would return zero rows rather than all.
# NOTE(review): -PRD is excluded for PP1 but not for PP2, exactly as in
# the original patterns; confirm whether this asymmetry is intended.
pp <- pp[!(grepl("-(UNF|TTL|BNF|DTV|CLR|PUT|PRD)", pp$TypePP1) |
           grepl("-(UNF|TTL|BNF|DTV|CLR|PUT)", pp$TypePP2)), ]
nrow(pp)
133
# A second data set that keeps only rows where neither PP has a
# functional annotation (aka secondary edge label), i.e. neither type
# label contains a hyphen. This would seem to resemble what
# Hawkins (1999) did most closely.
# The same literal "-" is now tested on both columns; the PP1 pattern
# previously carried a stray ")" that was not part of the intended
# match. A logical mask also avoids pp[-integer(0), ] returning zero
# rows when no label contains a hyphen.
pp.hawkins <- pp[!(grepl("-", pp$TypePP1, fixed = TRUE) |
                   grepl("-", pp$TypePP2, fixed = TRUE)), ]
nrow(pp.hawkins)
139
# Remove unused factor levels from every column of a data frame.
# Author: Kevin Wright. Idea by Brian Ripley.
#
# dat:     a data frame.
# returns: dat with each column re-subset using drop = TRUE, which
#          discards factor levels that no longer occur.
drop.levels <- function(dat) {
  prune_column <- function(column) {
    column[, drop = TRUE]
  }
  # dat[] <- ... replaces the columns but keeps the data frame itself
  dat[] <- lapply(dat, prune_column)
  dat
}
# After the exclusions above, discard factor levels that no longer
# occur in either data set (the two calls are independent).
pp.hawkins <- drop.levels(dat = pp.hawkins)
pp <- drop.levels(dat = pp)
149
150 ## -------------------------------------
151 # Data exploration
152 ## -------------------------------------
153 # Now let's have a look at the data so far. A histogram is always a good
154 # start since it provides a lot of distributional information. We'll plot
155 # the histogram of length differences between PP1 and PP2 for both the
156 # pp dataset and the pp.hawkins data set.
# Side by side (one row, two panels): the length difference PP2 - PP1
# in the full data set and in the pp.hawkins subset.
par(mfrow = c(1, 2))
with(pp, hist(LenPP2 - LenPP1))
with(pp.hawkins, hist(LenPP2 - LenPP1))
# The distributions look very similar, suggesting that both the
# PP-types investigated by Hawkins and the adverbial PPs of manner,
# place, time, and purpose (see above) exhibit similar
# length-dependent patterns.

# Two histograms in one plot: LenPP1 is drawn first (colour 2), then
# LenPP2 is overlaid on the same axes via add = TRUE.
with(pp, hist(LenPP1, breaks = seq(from = 1, to = 10, by = 1), col = 2))
with(pp, hist(LenPP2, breaks = seq(from = 1, to = 10, by = 1), add = TRUE))
167
168 # A couple of things jump out:
169 # 1) to the extent that the data supports Hawkins's claim, it seems to do
170 # so rather subtly.
171 # 2) there also are a bunch of cases, where the 1st PP is longer than the
172 # 2nd PP, but mostly only by 1 or 2 words, and Hawkins' theory predicts
173 # that such small differences should not matter.
174 # 3) The way Hawkins (2004, 2007) presents the data, the possible problems
175 # for his theory are not visible. They only become apparent in the
176 # histogram.
177 # 4) a HUGE caveat is in order here: many of the cases included in our data
178 # would probably be EXcluded by a more careful manual annotation. After
179 # all we only want to compare cases for which both PP orders are possible.
180 # ---> we are working with a noisy data set, but --hey-- it only took us a
181 # couple of hours to get there and it definitely took Hawkins longer in
182 # 1999!
183
184 ## ---------------------------------------------
185 # Some analyses
186 ## ---------------------------------------------
187 # Let's load the [Design] library (install it first ;). This library
188 # contains tons of useful functions to run linear [ols] and logistic
189 # [lrm] models without random effects, for model evaluation, and for
190 # model visualization.
191 library(Design)
192
193 # The most straightforward and simple test of Hawkins' prediction is a
194 # logistic regression model using the absolute length difference between
195 # the two PPs to predict whether the longer phrase will be ordered 2nd.
196 # We use [lrm] for that on both our data sets.
# Logistic regression of "longer PP is 2nd" on the absolute length
# difference, fitted once per data set; each fit is printed.
lrm(formula = PPS_PPL ~ LenDiff, data = pp)
lrm(formula = PPS_PPL ~ LenDiff, data = pp.hawkins)
199 # as predicted by Hawkins (1999, 2004) we observe a positive coefficient
200 # for LenDiff and this coefficient is significantly different from zero.
201 # The model also has a decent R2, though --unsurprisingly-- it's far from
202 # perfect.
203
204 # we can add an additional variable to check whether information density
205 # matters, but the problem is that we can only predict whether the 2nd
206 # phrase will be longer. This is a problem particular to the PP-ordering
207 # alternation since there is nothing that a priori distinguishes PP1 from
208 # PP2 (unlike in e.g. the ditransitive alternation, where we have a
209 # recipient and a theme independent of the order, and then use properties
210 # of the recipient and theme to predict the order).
# Same model with the absolute information-density difference added
# as a second predictor (pp.hawkins only).
lrm(formula = PPS_PPL ~ LenDiff + InfoDensityDiff, data = pp.hawkins)
212 # so the null effect of information density has to be taken with a grain
213 # of salt here. no kidding ;).
214
215 # we can test whether there are signs of non-linearity in the length
216 # effect:
# Two ways of letting the length effect be non-linear:
# [rcs] with 5 knots and a 2nd-order polynomial [pol].
l.rcs <- lrm(formula = PPS_PPL ~ rcs(LenDiff, 5), data = pp.hawkins)
l.pol <- lrm(formula = PPS_PPL ~ pol(LenDiff, 2), data = pp.hawkins)

# [anova] lets us test whether removing a predictor, or the non-linear
# components of a predictor, significantly worsens the model (given
# the decrease in model complexity):
anova(l.rcs)
anova(l.pol)
# This suggests a strong, potentially quadratic effect.
# Let's plot it. [datadist] stores the distribution summaries of
# pp.hawkins; the option 'datadist' names the object holding them.
dd <- datadist(pp.hawkins)
options(datadist = "dd")
plot(l.pol, LenDiff = NA)
230 # This actually looks more like an asymptotic (log) ceiling effect
231 # (notice that the highest values do not differ from one another --
232 # the dashes around the dot are the 95% confidence intervals)
233 # So, let's model this as log-difference. To avoid taking log(0) (which is
234 # -Inf), we'll exclude cases where the two PPs are equally long (Hawkins does the same):
# Model the length effect on the log scale. Rows with LenDiff == 0 are
# left out through the subset argument, so log() is never applied to
# zero. The fitted model is then printed.
l.log <- lrm(
  formula = PPS_PPL ~ log(LenDiff),
  data = pp.hawkins,
  subset = LenDiff != 0
)
l.log
237
238 # In the last class, I'll give an example from the ditransitive alternation
239 # where we can actually meaningfully include several predictors.
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.