#!/usr/bin/env R

## Functions for working with the IQAP data described at
## http://compprag.christopherpotts.net/iqap.html
##
## author: Christopher Potts
## copyright: Copyright 2011, Christopher Potts
## credits: []
## license: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/
## version: 1.0
## maintainer: Christopher Potts
## email: See the author's website

######################################################################
## Calculate response entropy (in bits) over the four response
## categories of a single item.
##
## pf: the row for an item from iqap-data.csv; its definite.yes,
##     probable.yes, definite.no and probable.no entries are read
## Value: the entropy of the response distribution for pf

ResponseEntropy <- function(pf) {
  tallies <- c(pf$definite.yes, pf$probable.yes, pf$definite.no, pf$probable.no)
  ## Normalize the raw counts into a probability distribution.
  probs <- tallies / sum(tallies)
  ## Keep only the categories that were actually chosen: log2(0) is
  ## -Inf, and 0 * -Inf would turn the whole sum into NaN.
  chosen <- probs[probs > 0]
  ## The entropy calculation: -sum_x (p(x) * log2(p(x)))
  -sum(log2(chosen) * chosen)
}

######################################################################
## The following are responses to some of the
## questions/hunches/hypotheses that the in-class groups formulated on
## July 12:

## Read in the data and limit to the development set:
iqap <- read.csv('iqap-data.csv')
iqap <- subset(iqap, DevEval == 'DEVELOPMENT')

##--------------------------------------------------------------------
## Question: Which examples have prefixes?

## Add a column with TRUE where the example has a prefix, else FALSE:
iqap$prefixed <- iqap$Prefix != ''

## Cross-tabulate the Classification with the prefixed column:
xtabs(~ Classification + prefixed, data = iqap)

##--------------------------------------------------------------------
## Question: Are the 'probable' categories chosen more often where the
## answers contain attitude predicates, modals, hedges, additive
## particles, exclusives, etc., in the answer?
## Add a column grouping the 'probable' categories:
iqap$prob <- iqap$probable.yes + iqap$probable.no

## Tweak this regex as needed for particular versions of the question:
regex <- '\\b(thinks?|think|thought|can|could|shall|should|will|would|may|might|must)\\b'

## Add the column for this regex (TRUE where the Answer matches):
iqap$regex <- grepl(regex, iqap$Answer)

## Create a box plot relating the two:
boxplot(prob ~ regex, data = iqap,
        xlab = 'Regex match',
        ylab = 'Probable annotations (by item)')

##--------------------------------------------------------------------
## Question: To what extent does the yes/no grouping differ from the
## probable/definite grouping?

## General function for calculating the entropy of a counts vector.
##
## counts: numeric vector of response counts
## Value: the entropy (in bits) of the distribution counts/sum(counts)
GeneralizedResponseEntropy <- function(counts) {
  dist <- counts / sum(counts)
  ## Drop zero-probability categories: log2(0) is -Inf, and 0 * -Inf
  ## would turn the whole sum into NaN.
  dist <- dist[dist > 0]
  ## The entropy calculation: -sum_x (p(x) * log2(p(x)))
  -sum(log2(dist) * dist)
}

## Group the responses by yes/no and return the entropy of that
## distribution.
##
## pf: an item row with definite.yes, probable.yes, definite.no and
##     probable.no entries
## Value: entropy of the two-way (yes vs. no) distribution
PolarityResponseEntropy <- function(pf) {
  counts <- c(pf$definite.yes + pf$probable.yes,
              pf$definite.no + pf$probable.no)
  GeneralizedResponseEntropy(counts)
}

## Group the responses into definite.yes, probable, definite.no and
## return the entropy of that distribution.
##
## pf: an item row with definite.yes, probable.yes, definite.no and
##     probable.no entries
## Value: entropy of the three-way distribution
TriResponseEntropy <- function(pf) {
  counts <- c(pf$definite.yes,
              pf$probable.yes + pf$probable.no,
              pf$definite.no)
  GeneralizedResponseEntropy(counts)
}

## Group the responses by definite/probable and return the entropy of
## that distribution.
##
## pf: an item row with definite.yes, probable.yes, definite.no and
##     probable.no entries
## Value: entropy of the two-way (definite vs. probable) distribution
DegreeResponseEntropy <- function(pf) {
  counts <- c(pf$definite.yes + pf$definite.no,
              pf$probable.yes + pf$probable.no)
  GeneralizedResponseEntropy(counts)
}

## Calculate all entropy values.
##
## pf: an item row with definite.yes, probable.yes, definite.no and
##     probable.no entries
## Value: numeric vector of length 3, in the order polarity, degree,
##        tri; the elements are named so that callers do not have to
##        rely on their positions
Entropys <- function(pf) {
  c(PolarityEntropy = PolarityResponseEntropy(pf),
    DegreeEntropy = DegreeResponseEntropy(pf),
    TriEntropy = TriResponseEntropy(pf))
}

library(plyr)

## Add the new entropy values to iqap.
##
## iqap: data frame with an Item column plus the four response-count
##       columns read by the entropy functions above
## Value: iqap sorted by Item, with the three columns PolarityEntropy,
##        DegreeEntropy and TriEntropy added
EntropyComparisons <- function(iqap) {
  entropyCols <- c('PolarityEntropy', 'DegreeEntropy', 'TriEntropy')
  ## ddply calls Entropys once for each distinct Item; the result has
  ## the Item column followed by the three entropy columns.
  iqapEntropy <- ddply(iqap, .variables = c('Item'), .fun = Entropys)
  ## Name the entropy columns explicitly so the code below can refer
  ## to them by name:
  colnames(iqapEntropy)[2:4] <- entropyCols
  ## Sort the original frame by Item:
  iqap <- iqap[order(iqap$Item), ]
  ## Look up each row's entropy values by Item rather than copying
  ## them over by position, so a frame with repeated Items cannot be
  ## silently misaligned through vector recycling:
  rows <- match(iqap$Item, iqapEntropy$Item)
  for (col in entropyCols) {
    iqap[[col]] <- iqapEntropy[[col]][rows]
  }
  ## Return the augmented frame:
  iqap
}