#!/usr/bin/env python import csv import numpy from numpy import array import math from operator import itemgetter from collections import defaultdict from random import shuffle ###################################################################### def centroid(vectors): return numpy.sum(vectors, axis=0) / float(len(vectors)) def euclidean_distance(u, v): diff = u - v return math.sqrt(numpy.dot(diff, diff)) def distance(vec, c): return euclidean_distance(vec, c) def rss(vectors, c): return sum(map((lambda v : distance(v, c)), vectors)) def score(clusters): return sum(map((lambda vecs : rss(vecs, centroid(vecs))), clusters)) def random_initial_means(vectors, k): indices = list(range(len(vectors))) shuffle(indices) return [vectors[i] for i in indices[:k]] def classify_vector(vec, means): d = {} for i, m in enumerate(means): d[i] = distance(vec, m) return sorted(d.items(), key=itemgetter(1), reverse=False)[0][0] def has_converged(means, new_means): threshold = 1e-6 diff = sum(distance(means[i], new_means[i]) for i in xrange(len(means))) if diff <= threshold: return True else: return False def means_to_csv(means, iterations): rows = [] for i, mean in enumerate(means): rows.append([iterations, mean[0], mean[1], i, 'TRUE']) return rows def clusters_to_csv(clusters, iterations): rows = [] for i, clust in enumerate(clusters): for vec in clust: rows.append([iterations, vec[0], vec[1], i, 'FALSE']) return rows def cluster_with_csv_production(vectors, k, means=None): csvwriter = csv.writer(file('kmeans-wikipedia-illustration.csv', 'w')) header = ['iter', 'x', 'y', 'cluster', 'is_mean'] csvwriter.writerow(header) # Initial means: iterations = 1 if not means: means = random_initial_means(vectors, k) print "======================================================================" print "Initial cluster means:", means csvwriter.writerows(means_to_csv(means, iterations)) converged = False while not converged: # Cluster: clusters = [[] for i in xrange(len(means))] for vec in vectors: index = classify_vector(vec, means) clusters[index].append(vec) csvwriter.writerows(clusters_to_csv(clusters, iterations)) # Adjust means: new_means = [array([]) for i in xrange(len(means))] for i, clust in enumerate(clusters): new_means[i] = centroid(clust) # Test for convergence: if has_converged(means, new_means): converged = True else: print "======================================================================" print "Iteration %s" % iterations print "Means", new_means print "Clusters", clusters iterations += 1 csvwriter.writerows(means_to_csv(new_means, iterations)) means = new_means return clusters ###################################################################### # Original values: # vectors = [ # array([43.892857, 46.392857]), # array([43.892857, 64.25]), # array([43.892857, 83.535713]), # array([158.07144, 43.785713]), # array([158.07144, 62.35714]), # array([158.07143, 78.785713]), # array([73.785713, 118.07142]), # array([73.071434, 136.64287]), # array([90.214287, 150.21429]), # array([110.21429, 150.92857]), # array([128.07143, 116.64285]), # array([128.78572, 135.92857])] vectors = array([ array([44.0, 46.0]), array([44.0, 64.0]), array([44.0, 84.0]), array([158.0, 44.0]), array([158.0, 62.0]), array([158.0, 79.0]), array([74.0, 118.0]), array([74.0, 137.0]), array([90.0, 150.0]), array([110.0, 151.0]), array([128.0, 117.0]), array([128.0, 136.0]), # Means: array([73.0, 100.0]), array([73.0, 30.0]), array([140.0, 100.0])]) # Approximations of Wikipedia initial means (not used): means = [ [73.0, 100.0], [73.0, 30.0], [140.0, 100.0]] cluster_with_csv_production(vectors, 3, means=None) """ R code: iterPlot = function(df, i, assignClusters=TRUE, main=''){ f = subset(df, iter==i) m = subset(f, is_mean==TRUE) clustColors = c('red', 'blue', 'green') if (assignClusters == FALSE) { clustColors = c('black', 'black', 'black') } if (main==''){ main = paste('Iteration', i) } plot(f\$x, f\$y, type='n', xlab='x', ylab='y', main=main) points(m\$x, m\$y, col=c('red', 'blue', 'green'), pch=15, cex=3) for (i in 1:3){ clust = subset(f, cluster==i-1) points(clust\$x, clust\$y, pch=19, col=clustColors[i]) } nonmean = subset(f, is_mean==FALSE) points(nonmean\$x, nonmean\$y, cex=1.5) } d = read.csv('kmeans-wikipedia-illustration.csv') runs = length(unique(d\$iter)) par(mfrow=c(2,3)) iterPlot(d, 1, assignClusters=FALSE, main='Random means drawn from the data') for (i in 1:runs) { iterPlot(d, i) } """