#!/usr/bin/env python

import re
import csv
from collections import defaultdict
from operator import itemgetter
import review_functions

def sign(x):
    """
    Return the sign of x as a string: '-' when x is below zero,
    '+' otherwise (so 0 is reported as positive).
    """
    return '-' if x < 0 else '+'

def score_cmp(score1, score2):
    """
    Relate two ER or Coef scores by magnitude.

    Returns 'incmp' when the scores fall on opposite sides of zero
    (as judged by sign(), which counts 0 as positive). Otherwise the
    absolute values are compared and '<', '>' or '=' is returned,
    describing score1 relative to score2.
    """
    # Scores with different signs are never ranked against each other.
    if sign(score1) != sign(score2):
        return 'incmp'
    magnitude1, magnitude2 = abs(score1), abs(score2)
    if magnitude1 < magnitude2:
        return '<'
    if magnitude1 > magnitude2:
        return '>'
    return '='

def ifnot_cmps():
    """
    Read in the ifnot-ngrams.txt data and compare them against the
    review score data from imdb-words-assess.csv. The function treats
    all words as adjectives and compares them only if their adjectival
    versions are in the imdb-words-assess.csv. The result is a CSV
    file called ifnot-cmps.csv that includes the word forms as well as
    their scores and the P-value associated with the coefficient
    scoring, for imposing thresholds.

    Each non-blank input line must contain exactly five
    whitespace-separated fields (four words and a count); only the
    first and fourth words are compared. Blank lines are skipped. A
    line with any other number of fields raises ValueError.

    Output columns: Word1, Word2, ERCmp, CoefCmp, P1, P2, where the
    two comparison columns hold the values returned by score_cmp().
    """
    # Scoring dictionary mapping (word, tag) pairs to dictionaries
    # {'Word':word, 'Tag':tag, 'ER':float, 'Coef':float, 'P':float}
    scorer = review_functions.get_all_imdb_scores('imdb-words-assess.csv')
    # The list of all the lines from the source file. The file is read
    # in full and closed before the output file is created:
    with open('ifnot-ngrams.txt') as infile:
        lines = infile.read().splitlines()
    # Output CSV file. The with-block guarantees the rows are flushed
    # and the handle is closed, even if a malformed line raises.
    # NOTE(review): on Python 3 the csv docs recommend newline='' here;
    # omitted because the builtin open() on Python 2 does not accept it.
    with open('ifnot-cmps.csv', 'w') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerow(['Word1', 'Word2', 'ERCmp', 'CoefCmp', 'P1', 'P2'])
        # Iterate through the lines:
        for line in lines:
            # str.split() with no argument ignores leading/trailing
            # whitespace, so padded lines do not produce empty fields.
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line: nothing to compare.
                continue
            # Only the outer words are used; the middle two words and
            # the count are discarded.
            w1, _, _, w4, _ = fields
            # We don't have POS data, so we approximate by limiting
            # attention to adjectival senses, where there are any:
            key1 = (w1, 'a')
            key4 = (w4, 'a')
            if key1 in scorer and key4 in scorer:
                d1 = scorer[key1]
                d4 = scorer[key4]
                # Subtract 5.5 from each ER to center the scale at 0
                # (presumably the midpoint of a 1-10 rating scale):
                er_cmp = score_cmp(d1['ER'] - 5.5, d4['ER'] - 5.5)
                # Coefficient comparison can be direct:
                coef_cmp = score_cmp(d1['Coef'], d4['Coef'])
                csvwriter.writerow(
                    [w1, w4, er_cmp, coef_cmp, d1['P'], d4['P']])
    
# Build ifnot-cmps.csv. This call sits at module top level, so it runs
# whenever the file is executed or imported.
# NOTE(review): consider an `if __name__ == '__main__':` guard if this
# module is ever imported for its helper functions.
ifnot_cmps()

        

        
        
    
