1
2
3
4
5
6
7
8
9
10 """ utility functionality for clustering molecules using fingerprints
11 includes a command line app for clustering
12
13
14 Sample Usage:
15 python ClusterMols.py -d data.gdb -t daylight_sig \
16 --idName="CAS_TF" -o clust1.pkl \
17 --actTable="dop_test" --actName="moa_quant"
18
19 """
20 from __future__ import print_function
21
22 import numpy
23
24 from rdkit import DataStructs
25 from rdkit.Chem.Fingerprints import FingerprintMols, MolSimilarity
26 from rdkit.ML.Cluster import Murtagh
27 from rdkit.six.moves import cPickle
28
# Module-level aliases for the message/error reporting helpers defined in
# FingerprintMols; the functions in this module call them for progress and
# error output.
message = FingerprintMols.message
error = FingerprintMols.error
31
32
def GetDistanceMatrix(data, metric, isSimilarity=1):
    """ data should be a list of tuples with fingerprints in position 1
    (the rest of the elements of the tuple are not important)

    metric is called as metric(fp1, fp2) for every pair of fingerprints.
    If isSimilarity is true, each value is stored as 1 - metric(fp1, fp2);
    otherwise the value returned by the metric is stored unchanged.

    When two fingerprints report different GetNumBits() values, the longer
    one is folded by the integer ratio of the two sizes before comparison.

    Returns the symmetric distance matrix as a flat array holding
    nPts * (nPts - 1) / 2 floats, filled column by column, i.e. for the
    (row, col) pairs (0,1), (0,2), (1,2), (0,3), ...
    (see ML.Cluster.Resemblance for layout documentation)

    """
    nPts = len(data)
    # Floor division keeps the size an int: true division would produce a
    # float, which numpy rejects as an array shape.  The builtin float is
    # used as dtype because the numpy.float alias is gone from newer NumPy.
    res = numpy.zeros(nPts * (nPts - 1) // 2, dtype=float)
    nSoFar = 0
    for col in range(1, nPts):
        for row in range(col):
            fp1 = data[col][1]
            fp2 = data[row][1]
            nBits1 = fp1.GetNumBits()
            nBits2 = fp2.GetNumBits()
            # Bring both fingerprints to the same length; the fold factor
            # has to be an integer, hence the floor division.
            if nBits1 > nBits2:
                fp1 = DataStructs.FoldFingerprint(fp1, nBits1 // nBits2)
            elif nBits2 > nBits1:
                fp2 = DataStructs.FoldFingerprint(fp2, nBits2 // nBits1)
            sim = metric(fp1, fp2)
            if isSimilarity:
                # convert the similarity into a distance
                sim = 1. - sim
            res[nSoFar] = sim
            nSoFar += 1
    return res
58
59
def ClusterPoints(data, metric, algorithmId, haveLabels=False, haveActs=True,
                  returnDistances=False):
    """ Clusters the points in data and returns the annotated cluster tree

    data is a sequence of tuples: element 0 is used to label the point,
    element 1 is the fingerprint (consumed by GetDistanceMatrix) and, if
    present, element 2 is an activity value that is converted to int.

    metric is handed to GetDistanceMatrix; algorithmId is handed to
    Murtagh.ClusterData.

    If haveLabels is false, each label is built as 'Mol: <element 0>';
    otherwise element 0 is used as the label directly.

    Activities are only collected when haveActs is true and the first
    tuple in data has more than two elements.

    Returns the cluster tree, or the tuple (cluster tree, distance matrix)
    when returnDistances is true.

    """
    message('Generating distance matrix.\n')
    distances = GetDistanceMatrix(data, metric)
    message('Clustering\n')
    tree = Murtagh.ClusterData(distances, len(data), algorithmId, isDistData=1)[0]

    # activities are optional: only the first tuple is inspected to decide
    activities = []
    if haveActs and len(data[0]) > 2:
        activities = [int(entry[2]) for entry in data]

    if haveLabels:
        names = [entry[0] for entry in data]
    else:
        names = ['Mol: %s' % str(entry[0]) for entry in data]

    tree._ptLabels = names
    if activities:
        tree._ptValues = activities

    for node in tree.GetPoints():
        # the point's index is shifted down by one to address names/activities
        pos = node.GetIndex() - 1
        node.SetName(names[pos])
        if activities:
            # best effort: a point whose activity cannot be attached is
            # left without data
            try:
                node.SetData(int(activities[pos]))
            except Exception:
                pass

    if returnDistances:
        return tree, distances
    return tree
90
91
def ClusterFromDetails(details):
    """ Returns the cluster tree

    Fingerprints are obtained with MolSimilarity.GetFingerprints(details);
    if details.maxMols is positive only the first maxMols of them are used.
    Clustering uses details.metric and details.clusterAlgo.

    If details.outFileName is set, the file is opened for binary writing
    before clustering and the resulting tree is pickled into it.

    Returns None (instead of a tree) if the output file cannot be opened
    or if there are no fingerprints to cluster.

    """
    data = MolSimilarity.GetFingerprints(details)
    if details.maxMols > 0:
        data = data[:details.maxMols]

    outF = None
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'wb+')
        except IOError:
            error("Error: could not open output file %s for writing\n" % (details.outFileName))
            return None

    # The try/finally guarantees the output file is closed on every exit
    # path (no data, clustering failure, or success); it used to be left
    # open.
    try:
        if not data:
            return None

        clustTree = ClusterPoints(data, details.metric, details.clusterAlgo, haveLabels=0,
                                  haveActs=1)
        if outF:
            cPickle.dump(clustTree, outF)
        return clustTree
    finally:
        if outF is not None:
            outF.close()
115
116
# Usage text for the command line app.  The __main__ block assigns it to
# FingerprintMols._usageDoc before calling FingerprintMols.ParseArgs().
# NOTE(review): --nBitsPerHash is listed without "=val" although it documents
# a numeric default, unlike --minPath=val / --maxPath=val -- confirm the
# option spelling against FingerprintMols.ParseArgs.
_usageDoc = """
Usage: ClusterMols.py [args] <fName>

If <fName> is provided and no tableName is specified (see below),
data will be read from the text file <fName>. Text files delimited
with either commas (extension .csv) or tabs (extension .txt) are
supported.

Command line arguments are:

- -d _dbName_: set the name of the database from which
to pull input fingerprint information.

- -t _tableName_: set the name of the database table
from which to pull input fingerprint information

- --idName=val: sets the name of the id column in the input
database. Default is *ID*.

- -o _outFileName_: name of the output file (output will
be a pickle (.pkl) file with the cluster tree)

- --actTable=val: name of table containing activity values
(used to color points in the cluster tree).

- --actName=val: name of column with activities in the activity
table. The values in this column should either be integers or
convertible into integers.

- --SLINK: use the single-linkage clustering algorithm
(default is Ward's minimum variance)

- --CLINK: use the complete-linkage clustering algorithm
(default is Ward's minimum variance)

- --UPGMA: use the group-average clustering algorithm
(default is Ward's minimum variance)

- --dice: use the DICE similarity metric instead of Tanimoto

- --cosine: use the cosine similarity metric instead of Tanimoto

- --fpColName=val: name to use for the column which stores
fingerprints (in pickled format) in the input db table.
Default is *AutoFragmentFP*

- --minPath=val: minimum path length to be included in
fragment-based fingerprints. Default is *2*.

- --maxPath=val: maximum path length to be included in
fragment-based fingerprints. Default is *7*.

- --nBitsPerHash: number of bits to be set in the output
fingerprint for each fragment. Default is *4*.

- --discrim: use of path-based discriminators to hash bits.
Default is *false*.

- -V: include valence information in the fingerprints
Default is *false*.

- -H: include Hs in the fingerprint
Default is *false*.

- --useMACCS: use the public MACCS keys to do the fingerprinting
(instead of a daylight-type fingerprint)


"""
# Command line entry point: only runs when the module is executed as a script.
if __name__ == '__main__':
    message("This is ClusterMols\n\n")
    # Install this module's usage text on FingerprintMols before the
    # arguments are parsed.
    # NOTE(review): presumably ParseArgs reports FingerprintMols._usageDoc
    # on bad input -- confirm in FingerprintMols.
    FingerprintMols._usageDoc = _usageDoc
    details = FingerprintMols.ParseArgs()
    # The returned tree is not used here; output happens via details.outFileName
    # inside ClusterFromDetails.
    ClusterFromDetails(details)
191