1
2
3
4
5
6
7
8
9
10
""" SMARTS definitions for the publicly available MACCS keys
12 and a MACCS fingerprinter
13
14 I compared the MACCS fingerprints generated here with those from two
15 other packages (not MDL, unfortunately). Of course there are
16 disagreements between the various fingerprints still, but I think
17 these definitions work pretty well. Some notes:
18
19 1) most of the differences have to do with aromaticity
20 2) there's a discrepancy sometimes because the current RDKit
21 definitions do not require multiple matches to be distinct. e.g. the
22 SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my
23 definition. It's not clear to me what the correct behavior is.
24 3) Some keys are not fully defined in the MDL documentation
25 4) Two keys, 125 and 166, have to be done outside of SMARTS.
26 5) Key 1 (ISOTOPE) isn't defined
27
28 Rev history:
29 2006 (gl): Original open-source release
30 May 2011 (gl): Update some definitions based on feedback from Andrew Dalke
31
32 """
33 from __future__ import print_function
34 from rdkit import Chem
35 from rdkit.Chem import rdMolDescriptors
36 from rdkit import DataStructs
37
# MACCS key definitions, indexed by 1-based key number.
#
# Each value is a (SMARTS, count) tuple:
#   - a SMARTS of '?' marks a key that has no SMARTS definition here; such
#     entries are skipped when the patterns are compiled.  Keys 125 and 166
#     are computed directly in code by the Python fingerprinter in this
#     module; key 1 is never set by it.
#   - count == 0: the bit is set if the pattern matches at all.
#   - count > 0: the bit is set only if the pattern has more than `count`
#     matches.
smartsPatts = {
    1: ('?', 0),  # ISOTOPE: not defined (see module docstring, note 5)

    2: ('[#104]', 0),
    3: ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
    4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    5: ('[Sc,Ti,Y,Zr,Hf]', 0),
    6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    8: ('[!#6;!#1]1~*~*~*~1', 0),
    9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    11: ('*1~*~*~*~1', 0),
    12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    13: ('[#8]~[#7](~[#6])~[#6]', 0),
    14: ('[#16]-[#16]', 0),
    15: ('[#8]~[#6](~[#8])~[#8]', 0),
    16: ('[!#6;!#1]1~*~*~1', 0),
    17: ('[#6]#[#6]', 0),
    18: ('[#5,#13,#31,#49,#81]', 0),
    19: ('*1~*~*~*~*~*~*~1', 0),
    20: ('[#14]', 0),
    21: ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
    22: ('*1~*~*~1', 0),
    23: ('[#7]~[#6](~[#8])~[#8]', 0),
    24: ('[#7]-[#8]', 0),
    25: ('[#7]~[#6](~[#7])~[#7]', 0),
    26: ('[#6]=;@[#6](@*)@*', 0),
    27: ('[I]', 0),
    28: ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
    29: ('[#15]', 0),
    30: ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
    31: ('[!#6;!#1]~[F,Cl,Br,I]', 0),
    32: ('[#6]~[#16]~[#7]', 0),
    33: ('[#7]~[#16]', 0),
    34: ('[CH2]=*', 0),
    35: ('[Li,Na,K,Rb,Cs,Fr]', 0),
    36: ('[#16R]', 0),
    37: ('[#7]~[#6](~[#8])~[#7]', 0),
    38: ('[#7]~[#6](~[#6])~[#7]', 0),
    39: ('[#8]~[#16](~[#8])~[#8]', 0),
    40: ('[#16]-[#8]', 0),
    41: ('[#6]#[#7]', 0),
    42: ('F', 0),
    43: ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
    44: ('[!#1;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#35;!#53]', 0),
    45: ('[#6]=[#6]~[#7]', 0),
    46: ('Br', 0),
    47: ('[#16]~*~[#7]', 0),
    48: ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
    49: ('[!+0]', 0),
    50: ('[#6]=[#6](~[#6])~[#6]', 0),
    51: ('[#6]~[#16]~[#8]', 0),
    52: ('[#7]~[#7]', 0),
    53: ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
    54: ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
    55: ('[#8]~[#16]~[#8]', 0),
    56: ('[#8]~[#7](~[#8])~[#6]', 0),
    57: ('[#8R]', 0),
    58: ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
    59: ('[#16]!:*:*', 0),
    60: ('[#16]=[#8]', 0),
    61: ('*~[#16](~*)~*', 0),
    62: ('*@*!@*@*', 0),
    63: ('[#7]=[#8]', 0),
    64: ('*@*!@[#16]', 0),
    65: ('c:n', 0),
    66: ('[#6]~[#6](~[#6])(~[#6])~*', 0),
    67: ('[!#6;!#1]~[#16]', 0),
    68: ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
    69: ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
    70: ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
    71: ('[#7]~[#8]', 0),
    72: ('[#8]~*~*~[#8]', 0),
    73: ('[#16]=*', 0),
    74: ('[CH3]~*~[CH3]', 0),
    75: ('*!@[#7]@*', 0),
    76: ('[#6]=[#6](~*)~*', 0),
    77: ('[#7]~*~[#7]', 0),
    78: ('[#6]=[#7]', 0),
    79: ('[#7]~*~*~[#7]', 0),
    80: ('[#7]~*~*~*~[#7]', 0),
    81: ('[#16]~*(~*)~*', 0),
    82: ('*~[CH2]~[!#6;!#1;!H0]', 0),
    83: ('[!#6;!#1]1~*~*~*~*~1', 0),
    84: ('[NH2]', 0),
    85: ('[#6]~[#7](~[#6])~[#6]', 0),
    86: ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
    87: ('[F,Cl,Br,I]!@*@*', 0),
    88: ('[#16]', 0),
    89: ('[#8]~*~*~*~[#8]', 0),
    90:
    ('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',
     0),
    91:
    ('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',
     0),
    92: ('[#8]~[#6](~[#7])~[#6]', 0),
    93: ('[!#6;!#1]~[CH3]', 0),
    94: ('[!#6;!#1]~[#7]', 0),
    95: ('[#7]~*~*~[#8]', 0),
    96: ('*1~*~*~*~*~1', 0),
    97: ('[#7]~*~*~*~[#8]', 0),
    98: ('[!#6;!#1]1~*~*~*~*~*~1', 0),
    99: ('[#6]=[#6]', 0),
    100: ('*~[CH2]~[#7]', 0),
    101:
    ('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',
     0),
    102: ('[!#6;!#1]~[#8]', 0),
    103: ('Cl', 0),
    104: ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
    105: ('*@*(@*)@*', 0),
    106: ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
    107: ('[F,Cl,Br,I]~*(~*)~*', 0),
    108: ('[CH3]~*~*~*~[CH2]~*', 0),
    109: ('*~[CH2]~[#8]', 0),
    110: ('[#7]~[#6]~[#8]', 0),
    111: ('[#7]~*~[CH2]~*', 0),
    112: ('*~*(~*)(~*)~*', 0),
    113: ('[#8]!:*:*', 0),
    114: ('[CH3]~[CH2]~*', 0),
    115: ('[CH3]~*~[CH2]~*', 0),
    116: ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
    117: ('[#7]~*~[#8]', 0),
    118: ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
    119: ('[#7]=*', 0),
    120: ('[!#6;R]', 1),
    121: ('[#7;R]', 0),
    122: ('*~[#7](~*)~*', 0),
    123: ('[#8]~[#6]~[#8]', 0),
    124: ('[!#6;!#1]~[!#6;!#1]', 0),
    125: ('?', 0),  # more than one aromatic ring; computed in code, not via SMARTS
    126: ('*!@[#8]!@*', 0),
    127: ('*@*!@[#8]', 1),
    128:
    ('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',
     0),
    129: ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',
          0),
    130: ('[!#6;!#1]~[!#6;!#1]', 1),
    131: ('[!#6;!#1;!H0]', 1),
    132: ('[#8]~*~[CH2]~*', 0),
    133: ('*@*!@[#7]', 0),
    134: ('[F,Cl,Br,I]', 0),
    135: ('[#7]!:*:*', 0),
    136: ('[#8]=*', 1),
    137: ('[!C;!c;R]', 0),
    138: ('[!#6;!#1]~[CH2]~*', 1),
    139: ('[O;!H0]', 0),
    140: ('[#8]', 3),
    141: ('[CH3]', 2),
    142: ('[#7]', 1),
    143: ('*@*!@[#8]', 0),
    144: ('*!:*:*!:*', 0),
    145: ('*1~*~*~*~*~*~1', 1),
    146: ('[#8]', 2),
    147: ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
    148: ('*~[!#6;!#1](~*)~*', 0),
    149: ('[C;H3,H4]', 1),
    150: ('*!@*@*!@*', 0),
    151: ('[#7;!H0]', 0),
    152: ('[#8]~[#6](~[#6])~[#6]', 0),
    153: ('[!#6;!#1]~[CH2]~*', 0),
    154: ('[#6]=[#8]', 0),
    155: ('*!@[CH2]!@*', 0),
    156: ('[#7]~*(~*)~*', 0),
    157: ('[#6]-[#8]', 0),
    158: ('[#6]-[#7]', 0),
    159: ('[#8]', 1),
    160: ('[C;H3,H4]', 0),
    161: ('[#7]', 0),
    162: ('a', 0),
    163: ('*1~*~*~*~*~*~1', 0),
    164: ('[#8]', 0),
    165: ('[R]', 0),
    166: ('?', 0),  # more than one fragment; computed in code, not via SMARTS
}

# Cache of compiled (pattern, count) tuples, one slot per key; it stays None
# until the Python fingerprinter in this module populates it on first use.
maccsKeys = None
218
219
def _InitKeys(keyList, keyDict):
    """ *Internal Use Only*

    generates SMARTS patterns for the keys, run once

    **Arguments**

      - keyList: list that is filled in place; slot (key - 1) receives a
        (compiled pattern, count) tuple.  It must have the same length as
        keyDict.

      - keyDict: maps the 1-based key number to a (SMARTS string, count)
        tuple.

    **Notes**

      - entries whose SMARTS is '?' are skipped, so their slot in keyList
        keeps whatever value it had on entry.

      - a SMARTS that fails to parse is reported on stdout and its slot is
        likewise left untouched.

    """
    assert len(keyList) == len(keyDict), 'length mismatch'
    for key, (patt, count) in keyDict.items():
        if patt == '?':
            # no SMARTS definition for this key; it is handled elsewhere
            continue
        sma = Chem.MolFromSmarts(patt)
        if not sma:
            print('SMARTS parser error for key #%d: %s' % (key, patt))
        else:
            keyList[key - 1] = sma, count
235
236
def _hasMultipleAromaticRings(mol):
    """ *Internal Use Only*

    returns True if more than one ring of mol consists solely of aromatic
    bonds (the test used for key 125)

    """
    nArom = 0
    for ring in mol.GetRingInfo().BondRings():
        if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic() for bondIdx in ring):
            nArom += 1
            if nArom > 1:
                # two are enough; no need to look at the remaining rings
                return True
    return False


# NOTE(review): the `def` line of this function was missing from the source
# under review.  The signature follows from the body (`mol`, `kwargs`); the
# name is reconstructed -- confirm it against the original file.
def _pyGenMACCSKeys(mol, **kwargs):
    """ generates the MACCS fingerprint for a molecule

    **Arguments**

      - mol: the molecule to be fingerprinted

      - ctor: (optional, keyword only) callable used to create the result;
        it is called with the number of bits and defaults to
        DataStructs.SparseBitVect

      - any other keyword arguments are ignored

    **Returns**

      the object created by ctor (by default a _DataStructs.SparseBitVect_)
      containing the fingerprint.  Bit N corresponds to key N; bit 0 is
      never assigned.

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = GenMACCSKeys(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)

    """
    global maccsKeys
    if maccsKeys is None:
        # Compile into a local list and publish it only once it is complete,
        # so an error during initialisation cannot leave a cache of empty
        # slots behind for later calls.
        keys = [(None, 0)] * len(smartsPatts)
        _InitKeys(keys, smartsPatts)
        maccsKeys = keys
    ctor = kwargs.get('ctor', DataStructs.SparseBitVect)

    res = ctor(len(maccsKeys) + 1)
    for keyIdx, (patt, count) in enumerate(maccsKeys, 1):
        if patt is not None:
            if count == 0:
                res[keyIdx] = mol.HasSubstructMatch(patt)
            elif len(mol.GetSubstructMatches(patt)) > count:
                # counted keys need strictly more than `count` matches
                res[keyIdx] = 1
        elif keyIdx == 125:
            # key 125 has no SMARTS: more than one aromatic ring
            res[125] = 1 if _hasMultipleAromaticRings(mol) else 0
        elif keyIdx == 166:
            # key 166 has no SMARTS: more than one fragment
            res[166] = 1 if len(Chem.GetMolFrags(mol)) > 1 else 0

    return res
297
298
# Public entry points: both names are bound to the implementation provided
# by rdMolDescriptors, not to the Python fingerprinter defined in this module.
GenMACCSKeys = rdMolDescriptors.GetMACCSKeysFingerprint
FingerprintMol = rdMolDescriptors.GetMACCSKeysFingerprint
301
302
303
304
305
306
def _test():
    """ runs the doctests of the module currently executing as __main__

    **Returns**

      the doctest.testmod() result, a (failed, attempted) pair

    """
    import doctest
    import sys
    return doctest.testmod(sys.modules["__main__"])
310
311
if __name__ == '__main__':
    import sys

    # Run the doctests and report the number of failures as the exit status.
    failed, tried = _test()
    sys.exit(failed)
316