1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
| from numpy import *
""" Code for hierarchical clustering, modified from Programming Collective Intelligence by Toby Segaran (O'Reilly Media 2007, page 33). """
class cluster_node: def __init__(self,vec,left=None,right=None,distance=0.0,id=None,count=1): self.left=left self.right=right self.vec=vec self.id=id self.distance=distance self.count=count
def L2dist(v1,v2): return sqrt(sum((v1-v2)**2)) def L1dist(v1,v2): return sum(abs(v1-v2))
def hcluster(features,distance=L2dist): distances={} currentclustid=-1
clust=[cluster_node(array(features[i]),id=i) for i in range(len(features))]
while len(clust)>1: lowestpair=(0,1) closest=distance(clust[0].vec,clust[1].vec) for i in range(len(clust)): for j in range(i+1,len(clust)): if (clust[i].id,clust[j].id) not in distances: distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) d=distances[(clust[i].id,clust[j].id)] if d<closest: closest=d lowestpair=(i,j) mergevec=[(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 \ for i in range(len(clust[0].vec))] newcluster=cluster_node(array(mergevec),left=clust[lowestpair[0]], right=clust[lowestpair[1]], distance=closest,id=currentclustid) currentclustid-=1 del clust[lowestpair[1]] del clust[lowestpair[0]] clust.append(newcluster)
return clust[0]
def extract_clusters(clust,dist): clusters = {} if clust.distance<dist: return [clust] else: cl = [] cr = [] if clust.left!=None: cl = extract_clusters(clust.left,dist=dist) if clust.right!=None: cr = extract_clusters(clust.right,dist=dist) return cl+cr def get_cluster_elements(clust): if clust.id>=0: return [clust.id] else: cl = [] cr = [] if clust.left!=None: cl = get_cluster_elements(clust.left) if clust.right!=None: cr = get_cluster_elements(clust.right) return cl+cr
def printclust(clust,labels=None,n=0): for i in range(n): print ' ', if clust.id<0: print '-' else: if labels==None: print clust.id else: print labels[clust.id] if clust.left!=None: printclust(clust.left,labels=labels,n=n+1) if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)
def getheight(clust): if clust.left==None and clust.right==None: return 1 return getheight(clust.left)+getheight(clust.right)
def getdepth(clust): if clust.left==None and clust.right==None: return 0 return max(getdepth(clust.left),getdepth(clust.right))+clust.distance
|