#!/usr/bin/env python

"""
A maximum entropy classifier for items in the IQAP corpus.

The central class is IqapClassifier, which takes as its main arguments
an IqapReader and a function mapping iqap.Item instances to
dictionaries mapping feature names to values.


The __main__ block builds a simple unigrams-based classifier --- the
feature function maps words to their counts.
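
Illustrative usage sketch (my_features below is just a stand-in for a
real feature function; any function mapping iqap.Item instances to
feature dictionaries will do):

    from iqap import IqapReader

    def my_features(item):
        # Hypothetical feature: size of the answer's CONTRAST material.
        return {'answer_length': len(item.answer_contrast_pred_pos())}

    classifier = IqapClassifier(IqapReader('iqap-data.csv'), my_features)
    print classifier.assess(run_count=5)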
"""

__author__ = "Christopher Potts"
__copyright__ = "Copyright 2011, Christopher Potts"
__credits__ = []
__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
__version__ = "1.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"

######################################################################

import re
from random import shuffle
from collections import defaultdict
import scipy
import numpy
import nltk.classify.maxent
from nltk.probability import DictionaryProbDist
from iqap import IqapReader
import review_functions

# Newer versions of NumPy/SciPy seem not to work with the SciPy-based
# optimizers for NLTK's maxent classifier, so we need to insist on
# these older versions.  If they aren't present, then we fall back to
# NLTK's included IIS algorithm in IqapClassifier.train().
COMPATIBLE_NUMPY_VERSION = False
if numpy.__version__[: 3] in ('1.3', '1.4', '1.5'):
    COMPATIBLE_NUMPY_VERSION = True
COMPATIBLE_SCIPY_VERSION = False
if scipy.__version__[: 3] in ('0.6', '0.7', '0.8'):
    COMPATIBLE_SCIPY_VERSION = True

######################################################################

class IqapClassifier:
    """
    The basic idea is that the user supplies a function mapping Item
    objects to feature dictionaries (with flexible types), and this
    class takes care of training the classifier and setting up
    assessments.

    The main interface is assess(), which, when the classifier is
    built with evaluation=False, does random splits of the development
    data and creates a report of each of those runs and of the mean
    performance across them.
    """
    
    def __init__(self, corpus, iqap_pair_feature_function, train_percentage=0.8, evaluation=False, make_binary=True, algorithm='CG'):
        """
        Arguments:
        corpus -- an IqapReader instance
        iqap_pair_feature_function -- a function mapping items into feature dictionaries
        train_percentage (float) -- amount to devote to training
        evaluation (boolean) -- split along the data's DEVELOPMENT/EVALUATION division (default: False)
        make_binary (boolean) -- collapse definite-yes and probable-yes into yes, and similarly for no (default: True)
        algorithm (str) -- optimizer to use; must be drawn from
                           ('gis', 'iis', 'cg', 'bfgs', 'powell', 'lbfgsb', 'nelder-mead', 'megam');
                           the SciPy algorithms ('cg', 'bfgs', 'powell', 'lbfgsb', 'nelder-mead')
                           require particular versions of SciPy and NumPy; 'megam' requires that
                           external library (see the NLTK homepage); the default is 'cg', but IIS is
                           used if SciPy/NumPy won't cooperate.
        """        
        self.corpus = corpus
        self.iqap_pair_feature_function = iqap_pair_feature_function
        self.train_percentage = train_percentage
        self.evaluation = evaluation
        self.make_binary = make_binary
        self.algorithm = algorithm.lower()
        # Try to ensure that we have a workable algorithm:
        if self.algorithm not in ('gis', 'iis', 'cg', 'bfgs', 'powell', 'lbfgsb', 'nelder-mead', 'megam'):
            self.algorithm = 'cg'
        if self.algorithm in ('cg', 'bfgs', 'powell', 'lbfgsb', 'nelder-mead') and \
               (not COMPATIBLE_NUMPY_VERSION or not COMPATIBLE_SCIPY_VERSION):
            self.algorithm = 'iis'
        # Additional attributes set during training:
        self.train_set = None
        self.test_set = None
        self.train_features = None
        self.test_features = None
        self.classifier = None

    ######################################################################
    # TRAINING

    def train(self):
        """
        Performs the train/test split, builds the training set's features,
        and then builds the classifier.  For the classifier,
        TypedMaxentFeatureEncoding is used to allow flexibility in
        terms of feature types/values. If SciPy is present and has the
        right version, then we use its Conjugate Gradient
        (algorithm='CG') optimizer, else we use NLTK's Improved
        Iterative Scaling (algorithm='IIS').
        """
        self.train_test_split()      
        self.train_features = self.get_features(self.train_set)
        # Most flexible kind of feature encoding:
        feature_encoding = nltk.classify.maxent.TypedMaxentFeatureEncoding.train(self.train_features)
        # Build the classifier:        
        self.classifier = nltk.MaxentClassifier.train(self.train_features, algorithm=self.algorithm, encoding=feature_encoding)
        
    def train_test_split(self):
        """
        If we're in evaluation mode, use that split, else randomly
        split the development set using self.train_percentage.
        """        
        if self.evaluation:
            self.train_set = self.corpus.dev_set()
            self.test_set = self.corpus.eval_set()            
        else:
            items = self.corpus.dev_set()
            shuffle(items)
            train_count = int(round(len(items) * self.train_percentage, 0))
            self.train_set = items[: train_count]
            self.test_set = items[train_count: ]

    def get_features(self, items):
        """
        Use the user's self.iqap_pair_feature_function to create a
        list of (doc_features, response) tuples, where doc_features is
        a dictionary giving the features of a particular item, as
        delivered by self.iqap_pair_feature_function.
        """        
        feats = []
        for item in items:
            doc_features = self.iqap_pair_feature_function(item)
            response_counts = item.response_counts(make_binary=self.make_binary)
            # For each response and its count:
            for resp, count in response_counts.items():
                # Create count (feature, class) pairs to capture the
                # information in the distribution of responses.
                for i in xrange(count):
                    feats.append((doc_features, resp))
        return feats

    ######################################################################
    # ASSESSMENT

    def assess(self, run_count=1, csv_format=False, show_most_informative_features=0):
        """
        Does run_count random train/test splits followed by training
        and assessment, and returns a summary of the performance,
        either for standard output or in CSV format for later
        analysis. In either case, the string format is basically:

        Run   Log-lik.   Max-label acc.   Mean KL div.   ...
        1     ...
        2     ...
        ...
        Means ...

        Arguments:
        run_count (int) -- number of runs to do
        csv_format (boolean) -- if True, values are comma-delimited, else they are padded for neat formatting
        show_most_informative_features (int) -- if positive, prints this many features' info to standard output

        Value:
        s (str) -- the string giving the summary        
        """
        s = ""
        # Column width formatting:        
        def column_div(val):
            if csv_format:
                return str(val)
            else:
                return str(val).rjust(18)
        # Separator.
        sep = ""
        precision = 2
        if csv_format:
            sep = ","
            precision = 8
        # Assessment category labels.
        max_desc = "Max-label acc."
        kl_desc = "Mean KL div."
        loglik_desc = "Log-lik."
        micro_precision = "Micro-avg prec."
        micro_recall = "Micro-avg recall"
        train_accuracy = "Train acc."
        # Accumulator for averaging:
        runs = {max_desc:[], kl_desc:[], loglik_desc:[], micro_precision:[], micro_recall:[], train_accuracy:[]}
        # Header for the output:
        s += sep.join(map(column_div, ['Run'] + sorted(runs.keys()))) + "\n"
        # Repeated runs:
        for i in xrange(run_count):
            # New random split and retraining.
            self.train()
            # Assessment of the new model:
            summary = self.cumulative_effectiveness()            
            this_assess = {max_desc: self.max_label_accuracy(),             
                           kl_desc: self.mean_kl_divergence(),
                           loglik_desc: self.log_likelihood(),
                           micro_precision: summary['micro_precision'],
                           micro_recall: summary['micro_recall'],
                           train_accuracy: self.train_accuracy()
                           }
            # Add the new values to the string:
            row = [(i+1)] + map((lambda x : round(x, precision)), [val for key, val in sorted(this_assess.items())])
            s += sep.join(map(column_div, row)) + "\n"
            # Add the values to the accumulator:
            for key, val in this_assess.items():
                runs[key].append(val)
        # Averaging for multiple runs:
        if run_count > 1:
            row = ['Means'] + map((lambda x : round(x, precision)), [numpy.mean(val) for key, val in sorted(runs.items())])
            s += sep.join(map(column_div, row)) + "\n"
        # Feature info:
        if show_most_informative_features:
            self.show_most_informative_features(n=show_most_informative_features)
        return s

    def show_most_informative_features(self, n=10, show='pos'):
        """
        Interface to the corresponding NLTK method; prints a model
        summary to standard output.

        n (int) -- number of features to show (default: 10)
        show (str) -- can be 'pos', 'neg', or 'all' (default: 'pos', which shows only features with positive weights)
        """
        self.classifier.show_most_informative_features(n=n, show=show)

    def log_likelihood(self):
        """Interface to the corresponding NLTK function: model log-likelihood."""
        self.test_features = self.get_features(self.test_set)
        return nltk.classify.util.log_likelihood(self.classifier, self.test_features)

    def cumulative_effectiveness(self):
        """
        At present, calculates the micro-averaged precision and recall
        for a run (computed here as the mean of the per-class precision
        and recall values).  The value is a dictionary mapping strings
        to floats.
        """
        cm = self.max_label_confusion_matrix()
        d = defaultdict(list)
        for i in xrange(len(cm)):
            d['precision'].append(cm[i,i] / numpy.sum(cm[: , i]))
            d['recall'].append(cm[i,i] / numpy.sum(cm[i, : ]))
        summary = {}
        summary['micro_precision'] = numpy.mean(d['precision'])
        summary['micro_recall'] = numpy.mean(d['recall'])
        return summary

    def max_label_accuracy(self):
        """
        For items that have a unique max label, determine whether we
        also assign max probability to that label.  Note: this is
        different from what one would get from
        nltk.classify.util.accuracy().  Each of our items is repeated
        30 times in the training and testing feature production, so
        that we include their annotation distributions. For
        evaluation, we want to consider each only once, not 30 times.

        Value: percentage correct (float)
        """
        cm = self.max_label_confusion_matrix()
        return numpy.sum(cm.diagonal()) / float(numpy.sum(cm))

    def train_accuracy(self):
        """
        Returns the percentage of training items correctly classified by the model.
        Useful for checking overfitting.
        """
        d = defaultdict(int)
        for item in self.train_set:
            doc_features = self.iqap_pair_feature_function(item)
            predicted = self.classifier.classify(doc_features)
            max_label = item.max_label(make_binary=self.make_binary)
            d[predicted == max_label] += 1
        return d[True] / float(sum(d.values()))

    def max_label_confusion_matrix(self):
        """
        Confusion matrix for items with a unique max label.  The value
        is a 2d numpy array (rows: actual max labels, columns: predicted
        labels), a useful format for calculating effectiveness.
        """
        # Build a dictionary:
        d = defaultdict(lambda: defaultdict(int))
        for item in self.test_set:
            doc_features = self.iqap_pair_feature_function(item)
            predicted = self.classifier.classify(doc_features)
            max_label = item.max_label(make_binary=self.make_binary)
            if max_label:
                d[max_label][predicted] += 1
        # Convert to matrix format for easier calculating of
        # effectiveness:
        keys = sorted(d.keys())
        cm = numpy.zeros((len(keys), len(keys)))
        for i in xrange(len(keys)):
            for j in xrange(len(keys)):
                cm[i,j] = d[keys[i]][keys[j]]                
        return cm

    def mean_kl_divergence(self):
        """Mean of the KL-divergence of the predictions from the actual distribution."""
        return numpy.mean(self.kl_divergence().values())

    def kl_divergence(self):
        """
        For each item, compute the KL divergence of the predictions from
        that item's annotation distribution.

        Value: kl_dict -- mapping from item IDs to KL divergences
        """
        kl_dict = {}
        for item in self.test_set:
            doc_features = self.iqap_pair_feature_function(item)
            predicted_dist = self.classifier.prob_classify(doc_features)
            actual_dist = item.response_dist(make_binary=self.make_binary)      
            kl_dict[item.Item] = self.kl(actual_dist, predicted_dist)
        return kl_dict

    def kl(self, actual, predicted):
        """KL-divergence, where log2(0/x) = 0."""
        score = 0.0
        for key, val in actual.iteritems():
            predicted_val = predicted.prob(key)            
            if val > 0 and predicted_val > 0:
                score += val * numpy.log2(val/predicted_val)
        return score
 
######################################################################
    
def unigrams_classifier():
    """
    An illustration of how to work with the classifier.  unigrams is
    the feature function.  It simply uses the count of each unigram
    inside the CONTRAST trees as a feature, and adds a feature counting
    the word types that appear in the answer but not in the question.
    """
    def unigrams(item):
        """Feature function: unigram counts across question and answer CONTRAST trees."""
        # Output dictionary:
        feats = defaultdict(int)
        # Tokenization:
        q_words = item.question_contrast_pred_pos()
        a_words = item.answer_contrast_pred_pos()
        # Unigram features:
        for word in q_words + a_words:
            feats[word] += 1
        # Vocab difference:
        feats['vocab_difference'] = len(set(a_words) - set(q_words))
        return feats
    # Build the classifier and display the assessment:
    classifier = IqapClassifier(IqapReader('iqap-data.csv'), unigrams)
    print classifier.assess(run_count=10, show_most_informative_features=100)
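
def overlap_classifier():
    """
    A second illustrative sketch: the feature function here uses only
    coarse overlap counts between the question and answer CONTRAST
    material, via the same Item methods used by unigrams above.
    """
    def overlap(item):
        """Feature function: question/answer vocabulary overlap counts."""
        q_words = set(item.question_contrast_pred_pos())
        a_words = set(item.answer_contrast_pred_pos())
        return {'shared': len(q_words & a_words),
                'answer_only': len(a_words - q_words),
                'question_only': len(q_words - a_words)}
    classifier = IqapClassifier(IqapReader('iqap-data.csv'), overlap)
    print classifier.assess(run_count=5)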

if __name__ == '__main__':
    unigrams_classifier()
    
