#!/usr/bin/env python

import csv
import numpy
from numpy import array
import math
from operator import itemgetter
from collections import defaultdict
from random import shuffle

######################################################################

def centroid(vectors):
    return numpy.sum(vectors, axis=0) / float(len(vectors))

def euclidean_distance(u, v):
    diff = u - v
    return math.sqrt(numpy.dot(diff, diff))

def distance(vec, c):
    return euclidean_distance(vec, c)

def rss(vectors, c):
    return sum(map((lambda v : distance(v, c)), vectors))

def score(clusters):
    return sum(map((lambda vecs : rss(vecs, centroid(vecs))), clusters))

def random_initial_means(vectors, k):
    indices = list(range(len(vectors)))
    shuffle(indices)
    return [vectors[i] for i in indices[:k]]

def classify_vector(vec, means):
    d = {}
    for i, m in enumerate(means):
        d[i] = distance(vec, m)
    return sorted(d.items(), key=itemgetter(1), reverse=False)[0][0]

def has_converged(means, new_means):
    threshold = 1e-6
    diff = sum(distance(means[i], new_means[i]) for i in xrange(len(means)))
    if diff <= threshold:
        return True
    else:
        return False

def means_to_csv(means, iterations):
    rows = []
    for i, mean in enumerate(means):        
        rows.append([iterations, mean[0], mean[1], i, 'TRUE'])
    return rows

def clusters_to_csv(clusters, iterations):
    rows = []
    for i, clust in enumerate(clusters):
        for vec in clust:
            rows.append([iterations, vec[0], vec[1], i, 'FALSE'])
    return rows
               
def cluster_with_csv_production(vectors, k, means=None):
    csvwriter = csv.writer(file('kmeans-wikipedia-illustration.csv', 'w'))
    header = ['iter', 'x', 'y', 'cluster', 'is_mean']
    csvwriter.writerow(header)
    # Initial means:
    iterations = 1
    if not means:
        means = random_initial_means(vectors, k)
    print "======================================================================"
    print "Initial cluster means:", means
    csvwriter.writerows(means_to_csv(means, iterations))
    converged = False    
    while not converged:
        # Cluster:
        clusters = [[] for i in xrange(len(means))]
        for vec in vectors:
            index = classify_vector(vec, means)
            clusters[index].append(vec)
        csvwriter.writerows(clusters_to_csv(clusters, iterations))
        # Adjust means:
        new_means = [array([]) for i in xrange(len(means))]
        for i, clust in enumerate(clusters):
            new_means[i] = centroid(clust)
        # Test for convergence:        
        if has_converged(means, new_means):
            converged = True
        else:
            print "======================================================================"
            print "Iteration %s" % iterations            
            print "Means", new_means
            print "Clusters", clusters
            iterations += 1
            csvwriter.writerows(means_to_csv(new_means, iterations))
            means = new_means            
    return clusters

######################################################################

# Original values:
# vectors = [
#     array([43.892857, 46.392857]),
#     array([43.892857, 64.25]),
#     array([43.892857, 83.535713]),
#     array([158.07144, 43.785713]),
#     array([158.07144, 62.35714]),
#     array([158.07143, 78.785713]),
#     array([73.785713, 118.07142]),
#     array([73.071434, 136.64287]),
#     array([90.214287, 150.21429]),
#     array([110.21429, 150.92857]),
#     array([128.07143, 116.64285]),
#     array([128.78572, 135.92857])]

vectors = array([
    array([44.0, 46.0]),
    array([44.0, 64.0]),
    array([44.0, 84.0]),
    array([158.0, 44.0]),
    array([158.0, 62.0]),
    array([158.0, 79.0]),
    array([74.0, 118.0]),
    array([74.0, 137.0]),
    array([90.0, 150.0]),
    array([110.0, 151.0]),
    array([128.0, 117.0]),
    array([128.0, 136.0]),
    # Means:
    array([73.0,  100.0]),
    array([73.0,  30.0]),
    array([140.0, 100.0])])

# Approximations of Wikipedia initial means (not used):
means = [
    [73.0,  100.0],
    [73.0,  30.0],
    [140.0, 100.0]]
    
cluster_with_csv_production(vectors, 3, means=None)

"""
R code:

iterPlot = function(df, i, assignClusters=TRUE, main=''){
  f = subset(df, iter==i)
  m = subset(f, is_mean==TRUE)
  clustColors = c('red', 'blue', 'green')
  if (assignClusters == FALSE) {
    clustColors = c('black', 'black', 'black')
  }
  if (main==''){
    main = paste('Iteration', i)
  }
  plot(f$x, f$y, type='n', xlab='x', ylab='y', main=main)
  points(m$x, m$y, col=c('red', 'blue', 'green'), pch=15, cex=3)
  for (i in 1:3){  
    clust = subset(f, cluster==i-1)
    points(clust$x, clust$y, pch=19, col=clustColors[i])
  }
  nonmean =  subset(f, is_mean==FALSE)  
  points(nonmean$x, nonmean$y, cex=1.5)
}

d = read.csv('kmeans-wikipedia-illustration.csv')
runs = length(unique(d$iter))
par(mfrow=c(2,3))
iterPlot(d, 1, assignClusters=FALSE, main='Random means drawn from the data')
for (i in 1:runs) {
  iterPlot(d, i)
}

"""
