1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 try:
35 from rdkit.Avalon import pyAvalonTools
36 except ImportError:
37 raise ImportError("This code requires the RDKit to be built with AvalonTools support")
38
39 import base64
40 from collections import namedtuple
41 import hashlib
42 import logging
43 import os
44 import re
45 import tempfile
46 import uuid
47
48 from rdkit import Chem
49 from rdkit import RDConfig
50 from rdkit.Chem.MolKey import InchiInfo
51
52
55
56
59
60
61 MOL_KEY_VERSION = '1'
62
63 ERROR_DICT = dict(BAD_MOLECULE=1, ALIAS_CONVERSION_FAILED=2, TRANSFORMED=4, FRAGMENTS_FOUND=8,
64 EITHER_WARNING=16, STEREO_ERROR=32, DUBIOUS_STEREO_REMOVED=64, ATOM_CLASH=128,
65 ATOM_CHECK_FAILED=256, SIZE_CHECK_FAILED=512, RECHARGED=1024,
66 STEREO_FORCED_BAD=2048, STEREO_TRANSFORMED=4096, TEMPLATE_TRANSFORMED=8192,
67 INCHI_COMPUTATION_ERROR=65536, RDKIT_CONVERSION_ERROR=131072,
68 INCHI_READWRITE_ERROR=262144, NULL_MOL=524288)
69
70 INCHI_COMPUTATION_ERROR = ERROR_DICT['INCHI_COMPUTATION_ERROR']
71 RDKIT_CONVERSION_ERROR = ERROR_DICT['RDKIT_CONVERSION_ERROR']
72 INCHI_READWRITE_ERROR = ERROR_DICT['INCHI_READWRITE_ERROR']
73 NULL_MOL = ERROR_DICT['NULL_MOL']
74
75 BAD_SET = (pyAvalonTools.StruChkResult.bad_set | INCHI_COMPUTATION_ERROR | RDKIT_CONVERSION_ERROR |
76 INCHI_READWRITE_ERROR | NULL_MOL)
77
78 GET_STEREO_RE = re.compile(r'^InChI=1S(.*?)/(t.*?)/m\d/s1(.*$)')
79 NULL_SMILES_RE = re.compile(r'^\s*$|^\s*NO_STRUCTURE\s*$', re.IGNORECASE)
80 PATTERN_NULL_MOL = r'^([\s0]+[1-9]+[\s]+V[\w]*)'
81
82 CHIRAL_POS = 12
83
84 T_NULL_MOL = (NULL_MOL, '')
85
86 stereo_code_dict = {}
87 stereo_code_dict['DEFAULT'] = 0
88 stereo_code_dict['S_ACHIR'] = 1
89 stereo_code_dict['S_ABS'] = 2
90 stereo_code_dict['S_REL'] = 3
91 stereo_code_dict['S_PART'] = 4
92 stereo_code_dict['S_UNKN'] = 5
93 stereo_code_dict['S_ABS_ACHIR'] = 6
94 stereo_code_dict['R_ONE'] = 11
95 stereo_code_dict['R_REL'] = 12
96 stereo_code_dict['R_OTHER'] = 13
97 stereo_code_dict['MX_ENANT'] = 21
98 stereo_code_dict['MX_DIAST'] = 22
99 stereo_code_dict['MX_SP2'] = 31
100 stereo_code_dict['MX_DIAST_ABS'] = 32
101 stereo_code_dict['MX_DIAST_REL'] = 33
102 stereo_code_dict['OTHER'] = 100
103 stereo_code_dict['UNDEFINED'] = 200
104
105
107 try:
108 new_string = re.sub(pat, sbt, my_string)
109 return new_string
110 except:
111 return None
112
113
115 pat = '\r\n{0,1}'
116 sbt = '\n'
117 return _fix_all(pat, sbt, my_string)
118
119
121 pat = '0V2000'
122 sbt = 'V2000'
123 return _fix_all(pat, sbt, my_string)
124
125
127 ''' look at atom count position (line 4, characters 0:3)
128 Return True if the count is >0, False if 0.
129 Throw BadMoleculeException if there are no characters
130 at the required position or if they cannot be converted
131 to a positive integer
132 '''
133 try:
134 str_a_count = ctab_lines[3][0:3]
135 a_count = int(str_a_count)
136 if a_count < 0:
137 raise BadMoleculeException('Atom count negative')
138 if a_count > 0:
139 rval = True
140 else:
141 rval = False
142 except IndexError:
143 raise BadMoleculeException('Invalid molfile format')
144 except ValueError:
145 raise BadMoleculeException('Expected integer')
146
147 return rval
148
149
151 ''' read the chiral flag (line 4, characters 12:15)
152 and set it to 0. Return True if it was 1, False if 0.
153 Throw BadMoleculeException if there are no characters
154 at the required position or if they where not 0 or 1
155 '''
156 try:
157 str_a_count = ctab_lines[3][12:15]
158 a_count = int(str_a_count)
159 if a_count == 0:
160 rval = False
161 elif a_count == 1:
162 rval = True
163 orig_line = ctab_lines[3]
164 ctab_lines[3] = orig_line[:CHIRAL_POS] + ' 0' + orig_line[CHIRAL_POS + 3:]
165 else:
166 raise BadMoleculeException('Expected chiral flag 0 or 1')
167 except IndexError:
168 raise BadMoleculeException('Invalid molfile format')
169 except ValueError:
170 raise BadMoleculeException('Expected integer, got {0}'.format(str_a_count))
171
172 return rval
173
174
175 __initCalled = False
176
177
179 global __initCalled
180 if configDir is None:
181 configDir = os.path.join(RDConfig.RDDataDir, 'struchk')
182 if configDir[-1] != os.path.sep:
183 configDir += os.path.sep
184 if logFile is None:
185 fd = tempfile.NamedTemporaryFile(suffix='.log', delete=False)
186 fd.close()
187 logFile = fd.name
188 struchk_init = '''-tm
189 -ta {0}checkfgs.trn
190 -tm
191 -or
192 -ca {0}checkfgs.chk
193 -cc
194 -cl 3
195 -cs
196 -cn 999
197 -l {1}\n'''.format(configDir, logFile)
198 initRes = pyAvalonTools.InitializeCheckMol(struchk_init)
199 if initRes:
200 raise ValueError('bad result from InitializeCheckMol: ' + str(initRes))
201 __initCalled = True
202
203
232
233
234 InchiResult = namedtuple('InchiResult', ['error', 'inchi', 'fixed_ctab'])
235
236
238 """
239 >>> from rdkit.Chem.MolKey import MolKey
240 >>> from rdkit.Avalon import pyAvalonTools
241 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1cn[nH]c1C(Cl)Br',True))
242 >>> res.inchi
243 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h8H'
244 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1c[nH]nc1C(Cl)Br',True))
245 >>> res.inchi
246 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h7H'
247 >>>
248 """
249 inchi = None
250 ctab_str = ctab
251 (strucheck_err, fixed_mol) = CheckCTAB(ctab_str, False)
252 if strucheck_err & BAD_SET:
253 return InchiResult(strucheck_err, None, fixed_mol)
254
255 conversion_err = 0
256 try:
257 r_mol = Chem.MolFromMolBlock(fixed_mol, sanitize=False)
258 if r_mol:
259 inchi = Chem.MolToInchi(r_mol, '/FixedH /SUU')
260 if not inchi:
261 conversion_err = INCHI_COMPUTATION_ERROR
262 else:
263 conversion_err = RDKIT_CONVERSION_ERROR
264 except Chem.InchiReadWriteError:
265 conversion_err = INCHI_READWRITE_ERROR
266
267 return InchiResult(strucheck_err | conversion_err, inchi, fixed_mol)
268
269
271 """ Normalize the stereo information (t-layer) to one selected isomer. """
272
273
274 new_stereo = '/m0/s3/'
275 stereo_match = GET_STEREO_RE.match(inchi)
276 if stereo_match:
277 inchi = stereo_match.group(1) + new_stereo + stereo_match.group(2)
278 return inchi
279
280
299
300
302 key_string = str(uuid.uuid1())
303 return key_string
304
305
307 pieces = []
308 ctab_str = ctab
309 if ctab_str:
310 ctab_str = _fix_line_ends(ctab_str)
311 ctab_str = _fix_chemdraw_header(ctab_str)
312 ctab_str = '\n'.join(ctab_str.split('\n')[3:])
313 pieces.append(ctab_str.replace('\n', r'\n'))
314 else:
315 pass
316 if stereo_category:
317 key_string = 'ST={0}'.format(stereo_category)
318 pieces.append(key_string)
319 if extra_stereo:
320 key_string = 'XTR={0}'.format(extra_stereo)
321 pieces.append(key_string)
322 key_string = '/'.join(pieces)
323 return key_string
324
325
326 -def _identify(err, ctab, inchi, stereo_category, extra_structure_desc=None):
327 """ Compute the molecule key based on the inchi string,
328 stereo category as well as extra structure
329 information """
330 key_string = _get_identification_string(err, ctab, inchi, stereo_category, extra_structure_desc)
331 if key_string:
332 return "{0}|{1}".format(
333 MOL_KEY_VERSION, base64.b64encode(hashlib.md5(key_string.encode('UTF-8')).digest()).decode())
334 else:
335 return None
336
337
339 assert n_def >= 0
340 assert n_udf >= 0
341 id_str = 'OTHER'
342
343 if n_def == 0:
344 if n_udf == 0:
345 id_str = 'S_ACHIR'
346 elif n_udf == 1:
347 id_str = 'R_ONE'
348 else:
349 id_str = 'S_UNKN'
350 else:
351 if n_udf == 0:
352 id_str = 'S_ABS'
353 else:
354 id_str = 'S_PART'
355 return id_str
356
357
359 " returns a list of error bit descriptions for the error code provided "
360 return [k for k, v in ERROR_DICT.items() if (err & v) > 0]
361
362
363 MolKeyResult = namedtuple(
364 'MolKeyResult', ['mol_key', 'error', 'inchi', 'fixed_ctab', 'stereo_code', 'stereo_comment'])
365
366
367 -def GetKeyForCTAB(ctab, stereo_info=None, stereo_comment=None, logger=None):
368 """
369 >>> from rdkit.Chem.MolKey import MolKey
370 >>> from rdkit.Avalon import pyAvalonTools
371 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1C(F)Cl',True))
372 >>> res.mol_key
373 '1|L7676nfGsSIU33wkx//NCg=='
374 >>> res.stereo_code
375 'R_ONE'
376 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@H](F)Cl',True))
377 >>> res.mol_key
378 '1|Aj38EIxf13RuPDQG2A0UMw=='
379 >>> res.stereo_code
380 'S_ABS'
381 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@@H](F)Cl',True))
382 >>> res.mol_key
383 '1|9ypfMrhxn1w0ncRooN5HXw=='
384 >>> res.stereo_code
385 'S_ABS'
386 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1[C@@H](F)Cl',True))
387 >>> res.mol_key
388 '1|c96jMSlbn7O9GW5d5uB9Mw=='
389 >>> res.stereo_code
390 'S_PART'
391 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc([C@H](Br)Cl)c1[C@@H](F)Cl',True))
392 >>> res.mol_key
393 '1|+B+GCEardrJteE8xzYdGLA=='
394 >>> res.stereo_code
395 'S_ABS'
396 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True))
397 >>> res.mol_key
398 '1|5H9R3LvclagMXHp3Clrc/g=='
399 >>> res.stereo_code
400 'S_UNKN'
401 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True),stereo_info='S_REL')
402 >>> res.mol_key
403 '1|cqKWVsUEY6QNpGCbDaDTYA=='
404 >>> res.stereo_code
405 'S_REL'
406 >>> res.inchi
407 'InChI=1/C8H6BrCl2F/c9-7(10)5-3-1-2-4-6(5)8(11)12/h1-4,7-8H/t7?,8?'
408
409 """
410 if logger is None:
411 logger = logging
412 try:
413 err, inchi, fixed_mol = GetInchiForCTAB(ctab)
414 except BadMoleculeException:
415 msg = u'Corrupt molecule substituting no-struct: --->\n{0}\n<----'.format(ctab)
416 logger.warn(msg)
417 err = NULL_MOL
418 key = _identify(err, '', '', None, None)
419 return MolKeyResult(key, err, '', '', None, None)
420
421
422 stereo_category = None
423 extra_structure_desc = stereo_comment
424 if stereo_info:
425 info_flds = stereo_info.split(' ', 1)
426 code_fld = info_flds[0]
427 if code_fld in stereo_code_dict:
428 stereo_category = code_fld
429 if (not stereo_comment) and len(info_flds) > 1:
430 extra_structure_desc = info_flds[1].strip()
431 else:
432 logger.warn('stereo code {0} not recognized. Using default value for ctab.'.format(code_fld))
433
434 if not (err & BAD_SET):
435 (n_stereo, n_undef_stereo, is_meso,
436 dummy) = InchiInfo.InchiInfo(inchi).get_sp3_stereo()['main']['non-isotopic']
437 if stereo_category is None or stereo_category == 'DEFAULT':
438 stereo_category = _get_chiral_identification_string(n_stereo - n_undef_stereo, n_undef_stereo)
439 else:
440 raise NotImplementedError(
441 "currently cannot generate correct keys for molecules with struchk errors")
442 key = _identify(err, fixed_mol, inchi, stereo_category, extra_structure_desc)
443 return MolKeyResult(key, err, inchi, fixed_mol, stereo_category, extra_structure_desc)
444
445
446
447
448
449
451 import sys
452 import doctest
453 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)
454 sys.exit(failed)
455
456
457 if __name__ == '__main__':
458 _runDoctests()
459