#!/usr/bin/env R

## Functions for working with the IQAP data described at
## http://compprag.christopherpotts.net/iqap.html
##
## author: Christopher Potts
## copyright: Copyright 2011, Christopher Potts
## credits: []
## license: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/
## version: 1.0
## maintainer: Christopher Potts
## email: See the author's website

######################################################################

## Calculate response entropy.
##
## pf: the row for an item from iqap-data.csv; the columns read are
##     definite.yes, probable.yes, definite.no and probable.no.
##
## Value: the entropy (base 2) of the response distribution for pf

ResponseEntropy = function(pf){
  counts = c(pf$definite.yes, pf$probable.yes, pf$definite.no, pf$probable.no)
  dist = counts/sum(counts)
  ## Drop zero-probability categories: log2(0) is -Inf and 0 * -Inf is
  ## NaN, which would turn the whole sum into NaN.
  dist = dist[dist > 0]
  ## The entropy calculation: -sum_x (p(x) * log2(p(x)))
  e = -sum(log2(dist) * dist)
  return(e)
}

######################################################################
## The following are responses to some of the
## questions/hunches/hypotheses that the in-class groups formulated on
## July 12:

## Read in the data and limit to the development set:
iqap = read.csv('iqap-data.csv')
iqap = subset(iqap, DevEval=='DEVELOPMENT')

##--------------------------------------------------------------------
## Question: Which examples have prefixes?

## Add a column with TRUE where the example has a prefix, else FALSE:
iqap$prefixed = iqap$Prefix != ''

## Cross-tabulate the Classification with the prefixed column:
xtabs(~ Classification + prefixed, data=iqap)

##--------------------------------------------------------------------
## Question: Are the 'probable' categories chosen more often where the
## answers contain attitude predicates, modals, hedges, additive
## particles, exclusives, etc., in the answer?

## Add a column grouping the 'probable' categories:
iqap$prob = iqap$probable.yes + iqap$probable.no

## Tweak this regex as needed for particular versions of the question:
regex = '\\b(thinks?|think|thought|can|could|shall|should|will|would|may|might|must)\\b'

## Add the column for this regex (TRUE where the Answer text matches):
iqap$regex = grepl(regex, iqap$Answer)

## Create a box plot relating the two:
boxplot(prob ~ regex, data=iqap, xlab='Regex match', ylab='Probable annotations (by item)')

##--------------------------------------------------------------------
## Question: To what extent does the yes/no grouping differ from the
## probable/definite grouping?

## General function for calculating the entropy of a counts vector.
##
## counts: numeric vector of category counts
##
## Value: the entropy (base 2) of the distribution counts/sum(counts)
GeneralizedResponseEntropy = function(counts){
  dist = counts/sum(counts)
  ## Drop zero-probability categories: log2(0) is -Inf and 0 * -Inf is
  ## NaN, which would turn the whole sum into NaN.
  dist = dist[dist > 0]
  ## The entropy calculation: -sum_x (p(x) * log2(p(x)))
  e = -sum(log2(dist) * dist)
  return(e)
}

## Group the responses by yes/no and return the entropy of that distribution.
PolarityResponseEntropy = function(pf){
  counts = c(
    pf$definite.yes + pf$probable.yes,
    pf$definite.no + pf$probable.no)
  return(GeneralizedResponseEntropy(counts))
}

## Group the responses into definite.yes, probable, definite.no and
## return the entropy of that distribution.
TriResponseEntropy = function(pf){
  counts = c(
    pf$definite.yes,
    pf$probable.yes + pf$probable.no,
    pf$definite.no)
  return(GeneralizedResponseEntropy(counts))
}

## Group the responses by definite/probable and return the entropy of that distribution.
DegreeResponseEntropy = function(pf){
  counts = c(
    pf$definite.yes + pf$definite.no,
    pf$probable.yes + pf$probable.no)
  return(GeneralizedResponseEntropy(counts))
}

## Calculate all entropy values.
##
## Value: a vector of three entropies, in the order polarity, degree, tri.
Entropys = function(pf){
  return(c(PolarityResponseEntropy(pf), DegreeResponseEntropy(pf), TriResponseEntropy(pf)))
}

library(plyr)

## Add the new entropy values to iqap.
##
## iqap: data frame with an Item column plus the four response-count
##       columns used by the entropy functions above.
##
## Value: iqap sorted by Item, with three added columns:
##        PolarityEntropy, DegreeEntropy and TriEntropy.
EntropyComparisons = function(iqap){
  ## ddply calls Entropys once per Item group. The three values it
  ## returns are taken from the columns after Item (positions 2 to 4),
  ## in the order Entropys produces them.
  iqapEntropy = ddply(iqap, .variables=c('Item'), .fun=Entropys)
  colnames(iqapEntropy)[2:4] = c('PolarityEntropy', 'DegreeEntropy', 'TriEntropy')
  ## Sort the original by Item so the returned frame keeps the row order
  ## this function has always produced:
  iqap = iqap[order(iqap$Item), ]
  ## Align the entropy rows to the iqap rows by Item value rather than by
  ## position. Positional copying is only correct when every Item occurs
  ## exactly once; with repeated Items the two frames have different row
  ## counts and the assignment would fail or silently recycle values.
  rowIndex = match(iqap$Item, iqapEntropy$Item)
  ## Add columns for the entropy values to the main iqap frame:
  iqap$PolarityEntropy = iqapEntropy$PolarityEntropy[rowIndex]
  iqap$DegreeEntropy = iqapEntropy$DegreeEntropy[rowIndex]
  iqap$TriEntropy = iqapEntropy$TriEntropy[rowIndex]
  ## Return the augmented frame:
  return(iqap)
}