Package rdkit :: Package ML :: Package Cluster :: Module Murtagh
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.Murtagh

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2008  greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ Interface to the C++ Murtagh hierarchic clustering code 
 12   
 13  """ 
 14  import numpy 
 15   
 16  from rdkit.ML.Cluster import Clusters 
 17  from rdkit.ML.Cluster.Clustering import MurtaghCluster, MurtaghDistCluster 
 18   
 19  # constants to select the clustering algorithm 
 20  WARDS = 1 
 21  SLINK = 2 
 22  CLINK = 3 
 23  UPGMA = 4 
 24  MCQUITTY = 5 
 25  GOWER = 6 
 26  CENTROID = 7 
 27   
 28  # descriptions of the methods: 
 29  methods = [ 
 30    ("Ward's Minimum Variance", WARDS, "Ward's Minimum Variance"), 
 31    ('Average Linkage', UPGMA, 'Group Average Linkage (UPGMA)'), 
 32    ('Single Linkage', SLINK, 'Single Linkage (SLINK)'), 
 33    ('Complete Linkage', CLINK, 'Complete Linkage (CLINK)'), 
 34    #  ("McQuitty",MCQUITTY,"McQuitty's method"), 
 35    #  ("Gower",GOWER,"Gower's median method"), 
 36    ("Centroid", CENTROID, "Centroid method"), 
 37  ] 
 38   
 39   
40 -def _LookupDist(dists, i, j, n):
41 """ *Internal Use Only* 42 43 returns the distance between points i and j in the symmetric 44 distance matrix _dists_ 45 46 """ 47 if i == j: 48 return 0.0 49 if i > j: 50 i, j = j, i 51 return dists[j * (j - 1) / 2 + i]
52 53
54 -def _ToClusters(data, nPts, ia, ib, crit, isDistData=0):
55 """ *Internal Use Only* 56 57 Converts the results of the Murtagh clustering code into 58 a cluster tree, which is returned in a single-entry list 59 60 """ 61 cs = [None] * nPts 62 for i in range(nPts): 63 cs[i] = Clusters.Cluster(metric=0.0, data=i, index=(i + 1)) 64 65 nClus = len(ia) - 1 66 for i in range(nClus): 67 idx1 = ia[i] - 1 68 idx2 = ib[i] - 1 69 c1 = cs[idx1] 70 c2 = cs[idx2] 71 newClust = Clusters.Cluster(metric=crit[i], children=[c1, c2], index=nPts + i + 1) 72 cs[idx1] = newClust 73 74 return [newClust]
75 76
77 -def ClusterData(data, nPts, method, isDistData=0):
78 """ clusters the data points passed in and returns the cluster tree 79 80 **Arguments** 81 82 - data: a list of lists (or array, or whatever) with the input 83 data (see discussion of _isDistData_ argument for the exception) 84 85 - nPts: the number of points to be used 86 87 - method: determines which clustering algorithm should be used. 88 The defined constants for these are: 89 'WARDS, SLINK, CLINK, UPGMA' 90 91 - isDistData: set this toggle when the data passed in is a 92 distance matrix. The distance matrix should be stored 93 symmetrically so that _LookupDist (above) can retrieve 94 the results: 95 for i<j: d_ij = dists[j*(j-1)/2 + i] 96 97 98 **Returns** 99 100 - a single entry list with the cluster tree 101 """ 102 data = numpy.array(data) 103 if not isDistData: 104 sz = data.shape[1] 105 ia, ib, crit = MurtaghCluster(data, nPts, sz, method) 106 else: 107 ia, ib, crit = MurtaghDistCluster(data, nPts, method) 108 c = _ToClusters(data, nPts, ia, ib, crit, isDistData=isDistData) 109 110 return c
111