analysis.R

rm(list=ls())
d = read.csv('Lassiter-must-NALS-2016-data-anonymized.csv')
length(levels(factor(d$workerid))) # 570
head(d)
levels(d$language)

d = subset(d, !(language %in% c('Indonesian', 'Japanese', 'Chinese', 'mandarin', 'vietnamese', 'Korean', 'French', 'Russian', 'Spanish', 'VIETNAMESE','tagalog','Tamil','Arabic','AVWUCILTTODO2','spanish','Tagalog','urdu','Mandarin','Cantonese','Chinese')))
levels(factor(d$language))
length(d$workerid) 

plot(density(log(d$rt))) # check for approximately log-normal RT distribution

table = xtabs(~ response + modal, data=d)
Ns = xtabs(~ modal, data=d)
props = table[1,]/Ns
props = sort(props)

#names(props)[2] = 'know not'

# sanity check
approx.upper = props + 1.96 * sqrt(props * (1-props) / Ns)
approx.lower = props - 1.96 * sqrt(props * (1-props) / Ns)

bootstrap = function(mod) {
  num = 50000
  dat = subset(d, modal==mod)
  modal.N = length(dat$response)
  sample.props = sapply(1:num, function(i) {
    s = sample(dat$response, modal.N, replace=T)
    return(length(which(s == 'Agree'))/modal.N)
  })
  return(quantile(sample.props, c(.025, .975)))
} 

CI = sapply(names(props), FUN=bootstrap)

names(props)[2] = 'know not' # fix condition name mis-annotated in JavaScript 

par(mfrow=c(1,1))
#png("res.png") ; par(mar=c(6, 4, 4, 2) + 0.1)

barplot(props, 
          main='',
          ylim=c(0,1), 
          ylab='Proportion agreement', las=2)
  arrows(x0=c(.7, 1.9, 3.1, 4.3, 5.5, 6.7, 7.9, 9.1, 10.3), 
       y0=CI[1,], 
       y1=CI[2,], 
       angle=90, 
            code=3, length=.15)
  
#  dev.off()
  
# sanity check - compare computed bootstrap CIs to binomial CIs generated by normal approximation
  barplot(props, 
          main='',
          ylim=c(0,1), 
          ylab='Proportion agreement', las=2)
  arrows(x0=c(.7, 1.9, 3.1, 4.3, 5.5, 6.7, 7.9, 9.1, 10.3), 
         y0=approx.lower, 
         y1=approx.upper,
         angle=90, 
         code=3, length=.15)
  
#
# basic descriptive stats, using 2-tailed bootstrap t-tests
#

prop.agree = function(v) abs(length(which(v))/length(v))
bootstrap.test = function(m1,m2) {
  d1 = subset(d, modal==m1)$response == 'Agree'
  d2 = subset(d, modal==m2)$response == 'Agree'
  diff = prop.agree(d1) - prop.agree(d2)
  comb = c(d1,d2)
  s = sapply(1:50000, function(i) {
    sim.d1 = prop.agree(sample(comb, length(d1), replace=T))
    sim.d2 = prop.agree(sample(comb, length(d2), replace=T))
    return(abs(sim.d1 - sim.d2) >= abs(diff))
  })
  return(mean(s)) # p-value computed by resampling
}

bootstrap.test('might', 'possible')

# key questions
bootstrap.test('knows', 'certain not')
bootstrap.test('knows', 'must not')
bootstrap.test('must not', 'certain not')
bootstrap.test('might', 'possible')
bootstrap.test('possibly', 'possible')
bootstrap.test('certain not', 'certainly not')
bootstrap.test('must not', 'did not')
bootstrap.test('did not', 'did')
bootstrap.test('certain not', 'must not')
bootstrap.test('certain not', 'did not')
bootstrap.test('did not', 'must not')

m0 = with(d, glm(response ~ modal + rt, family=binomial))
m1 = with(d, glm(response ~ modal * rt, family=binomial))
anova(m0, m1) 
  # nested movel comparison yields no evidence of interaction between modal and rt