## --------------------------------------
# This script loads a file called swbd.tab, which contains cases
# of a PP-ordering alternation. The cases are extracted from the
# Switchboard (swbd) Treebank portion, which is a spontaneous speech
# corpus. 
#
# The goal is to test whether the results of Hawkins (1999) extend
# to spontaneous speech. Hawkins found that, in accordance with his
# theory of Domain Minimization (MiD) and Maximize Online Processing 
# (MaOP), longer PPs follow shorter PPs in English VPs. To be precise,
# Hawkins found that the chance of the longer PP ordering after the 
# shorter PP increases proportionally to the extent that the longer PP
# is longer than the shorter PP (in number of words).
#
# Hawkins considered only VPs with (only) two adjacent PPs. Here,
# disfluencies were allowed to intervene and we also included VPs that
# contained other non-PP arguments in addition to the two PPs. The 
# two PPs had to be syntactic sisters and they cannot contain traces.
# The search pattern that was used to extract these cases from the corpus
# is available on the class webpage (cf. sample solution to Assignment 2).
## --------------------------------------

# Lines starting with the # symbol are comments
# you can get help on any command below by typing
# help(command), e.g. help(setwd). 
#
# I recommend Dalgaard (2003?) as an introduction 
# to R. Not very exciting, but very readable. We also
# have R scripts for R intros online on our lab wiki 
# (just search hlplab.wordpress.com for the link)
#
# let's set the working directory and then load the file
# <- means that we assign the value to the right to the 
# variable on the left. You can also use -> to assign the value
# on the left to a variable on the right.
# NOTE: this path is specific to the original author's machine; point it
# at the directory that contains swbd.tab before running the script.
setwd("C:\\Documents and Settings\\florian\\My Documents\\My LabMeetings, Tutorials, & Teaching\\LSA09\\data")
# swbd.tab is read as a tab-separated file with a header row; quote=""
# turns off quote handling so quote characters in the data are read
# literally. [read.csv] already returns a data frame, so no conversion is
# needed. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
pp <- read.csv(file = "swbd.tab", sep = "\t", header = TRUE, quote = "")

# let's have a quick look at the data
# 1) the number of rows
# 2) the structure [str] of the dataframe pp
# and 3) a [summary] of pp.
# [str] and [summary] work for most R objects.
# number of rows (cases) in pp
nrow(pp)
# compact overview of the structure of pp
str(pp)
# summary of pp
summary(pp)

# New variables are often derived from existing ones, or existing ones
# are changed slightly. The POS column holds values of the type
# "string%%POS"; only the Part-Of-Speech [POS] part is kept.
# [gsub] replaces the parts of a string that match a regular expression
# with another string -- here everything up to and including "%%" is
# replaced by the empty string.
pp$POS <- gsub(
   pattern = ".*%%",
   replacement = "",
   x = as.character(pp$POS)
)
# [table] is kinda like summary for factors, but it lists ALL values
table(pp$POS)

# To run tests parallel to those presented in Hawkins (1999), we need a
# new variable that stores whether the longer PP is in the 2nd position:
# 1 if PP2 is longer than PP1, 0 otherwise. Note that cases where both
# PPs are equally long are therefore coded as 0.
pp$PPS_PPL <- ifelse(test = pp$LenPP2 > pp$LenPP1, yes = 1, no = 0)

# Let's create some more variables that store what type of PPs we
# are dealing with. Everything up to and including "%%" is stripped from
# POSPP1/POSPP2 and the remaining label is stored as a factor.
pp$TypePP1 <- as.factor(gsub(".*%%", "", as.character(pp$POSPP1)))
pp$TypePP2 <- as.factor(gsub(".*%%", "", as.character(pp$POSPP2)))
# The following new variables store whether a PP is a filler to a
# trace. These fillers are marked by a -NNN, e.g. PP-1: all non-digits
# are removed from the type label, and the PP counts as a filler if
# anything is left. The comparison itself already yields a logical
# vector (NA stays NA), so no ifelse(..., T, F) wrapper is needed; this
# also avoids relying on T and F, which can be reassigned.
# NOTE(review): no later step in the visible script uses FillerPP1 or
# FillerPP2 -- confirm whether these PPs are still meant to be excluded.
pp$FillerPP1 <- gsub("[^0-9]", "", as.character(pp$TypePP1)) != ""
pp$FillerPP2 <- gsub("[^0-9]", "", as.character(pp$TypePP2)) != ""

# Information density of each PP: the ratio of the information content
# of its string (3-gram estimate) to the corresponding length column.
pp$InfoDensityPP1 <- pp$Information_PP1_3gram / pp$Length_PP1_3gram
pp$InfoDensityPP2 <- pp$Information_PP2_3gram / pp$Length_PP2_3gram

# Finally, the absolute differences between the two PPs in
#   - length (in words),
#   - information content, and
#   - information density.
# Hawkins used the absolute difference in length to predict the chance
# that the longer PP will be ordered 2nd.
pp$LenDiff         <- abs(pp$LenPP2 - pp$LenPP1)
pp$InfoDiff        <- abs(pp$Information_PP2_3gram - pp$Information_PP1_3gram)
pp$InfoDensityDiff <- abs(pp$InfoDensityPP2 - pp$InfoDensityPP1)


## -------------------------------------
# Exclusion
## -------------------------------------
# Cases can be excluded with the [subset] function. First, some extreme
# cases of PP length are removed; a more careful examination reveals
# that these cases are problematic (wrongly parsed) anyway.
# [scale] standardizes a variable (creates z-scores); only rows where
# both PP lengths have an absolute z-score below 2.5 are kept.
pp <- subset(
   pp,
   subset = abs(scale(LenPP1)) < 2.5 & abs(scale(LenPP2)) < 2.5
)
nrow(pp)

# summary of the types of the first PP in the remaining cases
summary(pp$TypePP1)
## Probably should be excluded since they are arguments:
#-PUT - marks the locative complement of put.
#-DTV (dative) - marks the dative object in the unshifted form of the double object construction. If the preposition introducing the "dative" object is for, it is considered benefactive (-BNF). -DTV (and -BNF) is only used after verbs that can undergo dative shift.
#-BNF (benefactive) - marks the beneficiary of an action (attaches to NP or PP).
#This tag is used only when (1) the verb can undergo dative shift and (2) the prepositional variant (with the same meaning) uses for. The prepositional objects of dative-shifting verbs with other prepositions than for (such as to or of) are annotated -DTV.
#-CLR (closely related) - marks constituents that occupy some middle ground between arguments and adjunct of the verb phrase. These roughly correspond to "predication adjuncts", prepositional ditransitives, and some "phrasal verbs". Although constituents marked with -CLR are not strictly speaking complements, they are treated as complements whenever it makes a bracketing difference. The precise meaning of -CLR depends somewhat on the category of the phrase.
#   * on PP, ADVP, SBAR-PRP, etc - On categories that are ordinarily interpreted as (adjunct) adverbials, -CLR indicates a somewhat closer relationship to the verb. For example:
#          o Prepositional Ditransitives
#            In order to ensure consistency, the Treebank recognizes only a limited class of verbs that take more than one complement (-DTV and -PUT and Small Clauses) Verbs that fall outside these classes (including most of the prepositional ditransitive verbs in class [D2]) are often associated with -CLR.
#          o Phrasal verbs
#            Phrasal verbs are also annotated with -CLR or a combination of -PRT and PP-CLR. Words that are considered borderline between particle and adverb are often bracketed with ADVP-CLR.
#          o Predication Adjuncts
#            Many of Quirk's predication adjuncts are annotated with -CLR. 
#-TTL (title) - is attached to the top node of a title when this title appears inside running text. -TTL implies -NOM. The internal structure of the title is bracketed as usual.
#-UNF - unfinished

## Unclear whether it should be excluded:
#-PRD (predicate) - marks any predicate that is not VP. In the do so construction, the so is annotated as a predicate.

## Probably ok to keep
#-any PP without a functional tag
#-DIR (direction) - marks adverbials that answer the questions "from where?" and "to where?" It implies motion, which can be metaphorical as in "...rose 5 pts. to 57-1/2" or "increased 70% to 5.8 billion yen" -DIR is most often used with verbs of motion/transit and financial verbs.
#-EXT (extent) - marks adverbial phrases that describe the spatial extent of an activity. -EXT was incorporated primarily for cases of movement in financial space, but is also used in analogous situations elsewhere. Obligatory complements do not receive -EXT. Words such as fully and completely are absolutes and do not receive -EXT.
#-LOC (locative) - marks adverbials that indicate place/setting of the event. -LOC may also indicate metaphorical location. There is likely to be some variation in the use of -LOC due to differing annotator interpretations. In cases where the annotator is faced with a choice between -LOC or -TMP, the default is -LOC. In cases involving SBAR, SBAR should not receive -LOC. -LOC has some uses that are not adverbial, such as with place names that are adjoined to other NPs and NAC-LOC premodifiers of NPs. The special tag -PUT is used for the locative argument of put.
#-MNR (manner) - marks adverbials that indicate manner, including instrument phrases.
#-PRP (purpose or reason) - marks purpose or reason clauses and PPs.
#-TMP (temporal) - marks temporal or aspectual adverbials that answer the questions when, how often, or how long. It has some uses that are not strictly adverbial, such as with dates that modify other NPs at S- or VP-level. In cases of apposition involving SBAR, the SBAR should not be labeled -TMP. Only in "financialspeak," and only when the dominating PP is a PP-DIR, may temporal modifiers be put at PP object level. Note that -TMP is not used in possessive phrases.

# You can also exclude cases based on a pattern match. [grepl] returns
# one TRUE/FALSE per row saying whether the PP type matches a regular
# expression; we keep only the rows where neither PP type matches.
# A logical mask is used rather than negative indices from [grep]
# because pp[-integer(0), ] selects zero rows: if no PP type matched,
# negative indexing would silently empty the whole data set.
# NOTE(review): -PRD is excluded for PP1 but not for PP2 -- confirm
# whether this asymmetry is intended.
pp <- pp[!(grepl("-(UNF|TTL|BNF|DTV|CLR|PUT|PRD)", pp$TypePP1) |
           grepl("-(UNF|TTL|BNF|DTV|CLR|PUT)", pp$TypePP2)), ]
nrow(pp)

# I also create a subset that contains only PPs without functional
# annotation (aka secondary edge labels). This would seem to resemble
# what Hawkins (1999) did most closely.
# A case is dropped if EITHER PP type contains a "-". The hyphen is
# matched literally (fixed = TRUE) for both PPs; a pattern with a stray
# ")" would fail to match tags such as PP-LOC and leave tagged PP1s in.
# The logical mask also stays correct when nothing matches, whereas
# negative indexing with an empty index vector would select zero rows.
pp.hawkins <- pp[!(grepl("-", pp$TypePP1, fixed = TRUE) |
                   grepl("-", pp$TypePP2, fixed = TRUE)), ]
nrow(pp.hawkins)

# Helper that removes unused factor levels from the columns of a
# data frame and returns the cleaned data frame.
drop.levels <- function(dat){
   # Original author: Kevin Wright; idea by Brian Ripley.
   # Every column is re-subset with an empty index and drop=TRUE: all
   # elements are kept, and for factors the levels that no longer occur
   # are discarded.
   prune <- function(column) {
      column[, drop=TRUE]
   }
   pruned <- lapply(dat, prune)
   # Assigning into dat[] replaces the columns but keeps the data frame.
   dat[] <- pruned
   dat
}
# Apply drop.levels to both data sets so that factor levels that no
# longer occur in the remaining rows are removed.
pp <- drop.levels(pp)
pp.hawkins <- drop.levels(pp.hawkins)

## -------------------------------------
# Data exploration
## -------------------------------------
# Now let's have a look at the data so far. A histogram is always a good
# start since it provides a lot of distributional information. We'll plot
# the histogram of length differences between PP1 and PP2 for both the
# pp dataset and the pp.hawkins data set, side by side (1 row, 2 columns).
par(mfrow = c(1, 2))
with(pp, hist(LenPP2 - LenPP1))
with(pp.hawkins, hist(LenPP2 - LenPP1))
# the distributions look very similar, suggesting that both the PP-types
# investigated by Hawkins and the adverbial PPs of manner, place, time,
# and purpose (see above) exhibit similar length-dependent patterns.

# Two histograms in one plot: the second call draws on top of the first
# (add = TRUE). TRUE is spelled out because T can be reassigned.
# The 1x2 layout set by par(mfrow) above is still in effect here.
# NOTE(review): the fixed breaks assume all PP lengths lie between 1 and
# 10; [hist] stops with an error if a value falls outside the breaks.
with(pp, hist(LenPP1, breaks = seq(1, 10, 1), col = 2))
with(pp, hist(LenPP2, breaks = seq(1, 10, 1), add = TRUE))

# A couple of things jump out:
# 1) to the extent that the data supports Hawkins's claim, it seems to do
#    so rather subtly.
# 2) there also are a bunch of cases, where the 1st PP is longer than the
#    2nd PP, but mostly only by 1 or 2 words, and Hawkins' theory predicts
#    that such small differences should not matter.
# 3) The way Hawkins (2004, 2007) presents the data, the possible problems
#    for his theory are not visible. They only become apparent in the 
#    histogram.
# 4) a HUGE caveat is in order here: many of the cases included in our data
#    would probably be EXcluded by a more careful manual annotation. After
#    all we only want to compare cases for which both PP orders are possible.
#    ---> we are working with a noisy data set, but --hey-- it only took us a 
#    couple of hours to get there and it definitely took Hawkins longer in 
#    1999!

## ---------------------------------------------
# Some analyses
## ---------------------------------------------
# Let's load the [Design] library (install it first ;). This library 
# contains tons of useful functions to run linear [ols] and logistic
# [lrm] models without random effects, for model evaluation, and for 
# model visualization.
library(Design)

# The most straightforward and simple test of Hawkins' prediction is a
# logistic regression model that uses the absolute length difference
# between the two PPs to predict whether the longer phrase is ordered
# 2nd. We fit it with [lrm] for both of our data sets.
lrm(formula = PPS_PPL ~ LenDiff, data = pp)
lrm(formula = PPS_PPL ~ LenDiff, data = pp.hawkins)
# as predicted by Hawkins (1999, 2004) we observe a positive coefficient
# for LenDiff and this coefficient is significantly different from zero.
# The model also has a decent R2, though --unsurprisingly-- it's far from
# perfect.

# We can add a further variable to check whether information density
# matters. The problem is that we can only predict whether the 2nd
# phrase will be longer. This is particular to the PP-ordering
# alternation, since nothing a priori distinguishes PP1 from PP2
# (unlike in e.g. the ditransitive alternation, where we have a
# recipient and a theme independent of the order, and can then use
# properties of the recipient and theme to predict the order).
lrm(formula = PPS_PPL ~ LenDiff + InfoDensityDiff, data = pp.hawkins)
# so the null effect of information density has to be taken with a grain
# of salt here. no kidding ;).

# Are there signs of non-linearity in the length effect? We fit two
# models on pp.hawkins: one with [rcs](LenDiff, 5) and one with
# [pol](LenDiff, 2).
l.rcs <- lrm(formula = PPS_PPL ~ rcs(LenDiff, 5), data = pp.hawkins)
l.pol <- lrm(formula = PPS_PPL ~ pol(LenDiff, 2), data = pp.hawkins)

# [anova] allows us to test whether removing a predictor, or the
# non-linear components of a predictor, significantly worsens the model
# (given the decrease in model complexity):
anova(l.rcs)
anova(l.pol)
# This suggests a strong, potentially quadratic effect, so let's plot
# it. The data distribution of pp.hawkins is stored in dd and registered
# through the datadist option before plotting.
dd <- datadist(pp.hawkins)
options(datadist = "dd")
plot(l.pol, LenDiff = NA)
# This actually looks more like an asymptotic (log) ceiling effect
# (notice that the highest values do not differ from one another --
# the dashes around the dot are the 95% confidence intervals).
# So, let's model this as a log-difference. Since log(0) is -Inf, we
# exclude cases where the two PPs are equally long (Hawkins does the
# same):
l.log <- lrm(
   formula = PPS_PPL ~ log(LenDiff),
   data = pp.hawkins,
   subset = LenDiff != 0
)
l.log

# In the last class, I'll give an example from the ditransitive alternation
# where we can actually meaningfully include several predictors.