1
2
3
4
5 """ The "parser" for compound descriptors.
6
7 I almost hesitate to document this, because it's not the prettiest
8 thing the world has ever seen... but it does work (for at least some
9 definitions of the word).
10
11 Rather than getting into the whole mess of writing a parser for the
12 compound descriptor expressions, I'm just using string substitutions
13 and python's wonderful ability to *eval* code.
14
15 It would probably be a good idea at some point to replace this with a
16 real parser, if only for the flexibility and intelligent error
17 messages that would become possible.
18
19 The general idea is that we're going to deal with expressions where
20 atomic descriptors have some kind of method applied to them which
21 reduces them to a single number for the entire composition. Compound
22 descriptors (those applicable to the compound as a whole) are not
23 operated on by anything in particular (except for standard math stuff).
24
25 Here's the general flow of things:
26
27 1) Composition descriptor references ($a, $b, etc.) are replaced with the
28 corresponding descriptor names using string substitution.
29 (*_SubForCompoundDescriptors*)
30
31 2) Atomic descriptor references ($1, $2, etc) are replaced with lookups
32 into the atomic dict with "DEADBEEF" in place of the atom name.
33 (*_SubForAtomicVars*)
34
35 3) Calls to Calculator Functions are augmented with a reference to
36 the composition and atomic dictionary
37 (*_SubMethodArgs*)
38
39 **NOTE:**
40
41 anytime we don't know the answer for a descriptor, rather than
42 throwing a (completely incomprehensible) exception, we just return
43 -666. So bad descriptor values should stand out like sore thumbs.
44
45 """
46
47 from __future__ import print_function
48
49
50 from math import *
51
52 from rdkit import RDConfig
53
# Debug switch: when true, CalcSingleCompoundDescriptor prints diagnostics
# and raises RuntimeError on failure instead of returning -666.
__DEBUG = False
55
56
57
58
59
60
61
# Names of the calculator methods that may appear in an expression; calls to
# these get the composition and atomic dictionary appended to their arguments
# before the expression is evaluated.  Each name matches a function (or alias)
# defined in this module.
knownMethods = ['SUM', 'MIN', 'MAX', 'MEAN', 'AVG', 'DEV', 'HAS']
63
64
def HAS(strArg, composList, atomDict):
    """ *Calculator Method*

    checks whether any atom of the composition has a descriptor value
    containing a given item

    **Arguments**

      - strArg: the arguments in string form, separated by commas.  The
        first piece is an expression containing the 'DEADBEEF' placeholder,
        the second is an expression giving the item to look for.

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary; not used directly, but visible to
        the evaluated expressions under this name

    **Returns**

      1 if the item is found for at least one atom, 0 if it is not, and
      -666 if fewer than two comma-separated pieces were supplied

    **Notes**

      - both pieces are handed to *eval*, so they must come from a
        trusted source

    """
    pieces = strArg.split(',')
    if len(pieces) <= 1:
        return -666
    # a plain loop (rather than a comprehension) keeps atomDict visible to
    # eval as a local name
    for atom, _ in composList:
        where = eval(pieces[0].replace('DEADBEEF', atom))
        what = eval(pieces[1])
        if what in where:
            return 1
    return 0
94
95
def SUM(strArg, composList, atomDict):
    """ *Calculator Method*

    adds up a descriptor over the composition, weighting each atom's value
    by its amount

    **Arguments**

      - strArg: the expression to evaluate, in string form, with 'DEADBEEF'
        standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary; not used directly, but visible to
        the evaluated expression under this name

    **Returns**

      a float (0.0 for an empty composition)

    **Notes**

      - the expression is handed to *eval*, so it must come from a
        trusted source

    """
    total = 0.0
    # a plain loop (rather than a comprehension) keeps atomDict visible to
    # eval as a local name
    for atom, num in composList:
        value = eval(strArg.replace('DEADBEEF', atom))
        total = total + value * num
    return total
119
120
def MEAN(strArg, composList, atomDict):
    """ *Calculator Method*

    computes the amount-weighted average of a descriptor over the
    composition

    **Arguments**

      - strArg: the expression to evaluate, in string form, with 'DEADBEEF'
        standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary; not used directly, but visible to
        the evaluated expression under this name

    **Returns**

      a float

    **Notes**

      - an empty composition (or amounts summing to zero) raises
        ZeroDivisionError

      - the expression is handed to *eval*, so it must come from a
        trusted source

    """
    weighted = 0.0
    count = 0
    # a plain loop (rather than a comprehension) keeps atomDict visible to
    # eval as a local name
    for atom, num in composList:
        value = eval(strArg.replace('DEADBEEF', atom))
        weighted = weighted + value * num
        count = count + num
    return weighted / count
146
147
# AVG is an alias for MEAN, so either name may be used in an expression.
AVG = MEAN
149
150
def DEV(strArg, composList, atomDict):
    """ *Calculator Method*

    computes the amount-weighted average absolute deviation of a
    descriptor from its mean over the composition

    **Arguments**

      - strArg: the expression to evaluate, in string form, with 'DEADBEEF'
        standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary; not used directly, but visible to
        the evaluated expression under this name

    **Returns**

      a float

    **Notes**

      - the mean comes from MEAN(), so the expression is evaluated twice
        per atom

      - the expression is handed to *eval*, so it must come from a
        trusted source

    """
    center = MEAN(strArg, composList, atomDict)
    spread = 0.0
    weight = 0.0
    # a plain loop (rather than a comprehension) keeps atomDict visible to
    # eval as a local name
    for atom, num in composList:
        value = eval(strArg.replace('DEADBEEF', atom))
        spread = spread + abs(value - center) * num
        weight = weight + num
    return spread / weight
177
178
def MIN(strArg, composList, atomDict):
    """ *Calculator Method*

    finds the smallest value of a descriptor over the composition
    (amounts are ignored)

    **Arguments**

      - strArg: the expression to evaluate, in string form, with 'DEADBEEF'
        standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary; not used directly, but visible to
        the evaluated expression under this name

    **Returns**

      the smallest evaluated value

    **Notes**

      - an empty composition raises ValueError (from *min*)

      - the expression is handed to *eval*, so it must come from a
        trusted source

    """
    values = []
    # a plain loop (rather than a comprehension) keeps atomDict visible to
    # eval as a local name
    for atom, _ in composList:
        values.append(eval(strArg.replace('DEADBEEF', atom)))
    return min(values)
202
203
def MAX(strArg, composList, atomDict):
    """ *Calculator Method*

    finds the largest value of a descriptor over the composition
    (amounts are ignored)

    **Arguments**

      - strArg: the expression to evaluate, in string form, with 'DEADBEEF'
        standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary; not used directly, but visible to
        the evaluated expression under this name

    **Returns**

      the largest evaluated value

    **Notes**

      - an empty composition raises ValueError (from *max*)

      - the expression is handed to *eval*, so it must come from a
        trusted source

    """
    values = []
    # a plain loop (rather than a comprehension) keeps atomDict visible to
    # eval as a local name
    for atom, _ in composList:
        values.append(eval(strArg.replace('DEADBEEF', atom)))
    return max(values)
227
228
229
230
231
232
233
235 """ replace atomic variables with the appropriate dictionary lookup
236
237 *Not intended for client use*
238
239 """
240 for i in range(len(varList)):
241 cExpr = cExpr.replace('$%d' % (i + 1), '%s["DEADBEEF"]["%s"]' % (dictName, varList[i]))
242 return cExpr
243
244
246 """ replace compound variables with the appropriate list index
247
248 *Not intended for client use*
249
250 """
251 for i in range(len(varList)):
252 cExpr = cExpr.replace('$%s' % chr(ord('a') + i), '%s["%s"]' % (dictName, varList[i]))
253 return cExpr
254
255
257 """ alters the arguments of calls to calculator methods
258
259 *Not intended for client use*
260
261 This is kind of putrid (and the code ain't so pretty either)
262 The general idea is that the various special methods for atomic
263 descriptors need two extra arguments (the composition and the atomic
264 dict). Rather than make the user type those in, we just find
265 invocations of these methods and fill out the function calls using
266 string replacements.
267 """
268 res = cExpr
269 for method in knownMethods:
270 p = 0
271 while p != -1 and p < len(res):
272 p = res.find(method, p)
273 if p != -1:
274 p = p + len(method) + 1
275 start = p
276 parenCount = 1
277 while parenCount and p < len(res):
278 if res[p] == ')':
279 parenCount = parenCount - 1
280 elif res[p] == '(':
281 parenCount = parenCount + 1
282 p = p + 1
283 if p <= len(res):
284 res = res[0:start] + "'%s',compos,atomDict" % (res[start:p - 1]) + res[p - 1:]
285 return res
286
287
def CalcSingleCompoundDescriptor(compos, argVect, atomDict, propDict):
    """ calculates the value of the descriptor for a single compound

    **ARGUMENTS:**

      - compos: a vector/tuple containing the composition
         information... in the form:
         '[("Fe",1.),("Pt",2.),("Rh",0.02)]'

      - argVect: a vector/tuple with three elements:

           1) AtomicDescriptorNames:  a list/tuple of the names of the
             atomic descriptors being used. These determine the
             meaning of $1, $2, etc. in the expression

           2) CompoundDescriptorNames:  a list/tuple of the names of the
             compound descriptors being used. These determine the
             meaning of $a, $b, etc. in the expression

           3) Expr: a string containing the expression to be used to
             evaluate the final result.

      - atomDict:
           a dictionary of atomic descriptors.  Each atomic entry is
           another dictionary containing the individual descriptors
           and their values

      - propDict:
           a dictionary of descriptors for the composition, keyed by
           descriptor name.

    **RETURNS:**

      the value of the descriptor, -666 if a problem was encountered

    **NOTE:**

      - because it takes rather a lot of work to get everything set
          up to calculate a descriptor, if you are calculating the
          same descriptor for multiple compounds, you probably want to
          be calling _CalcMultipleCompoundsDescriptor()_.

      - when the module-level debug flag is set, failures print
          diagnostics and raise RuntimeError instead of returning -666.

      - the expression is run through *eval*, so it must come from a
          trusted source.

    """
    # bound up front so the debug output below can always print it
    formula = None
    try:
        atomVarNames = argVect[0]
        compositionVarNames = argVect[1]
        formula = argVect[2]
        formula = _SubForCompoundDescriptors(formula, compositionVarNames, 'propDict')
        formula = _SubForAtomicVars(formula, atomVarNames, 'atomDict')
        evalTarget = _SubMethodArgs(formula, knownMethods)
    except Exception:
        if __DEBUG:
            import traceback
            print('Sub Failure!')
            traceback.print_exc()
            # evalTarget is never bound when we get here, so show how far
            # the substitutions got instead
            print(formula)
            print(propDict)
            raise RuntimeError('Failure 1')
        else:
            return -666

    try:
        # compos, atomDict and propDict are referenced by name from inside
        # the evaluated expression, so they must stay locals of this function
        v = eval(evalTarget)
    except Exception:
        if __DEBUG:
            import traceback
            # the with-block closes the log even if one of the writes fails
            with open(RDConfig.RDCodeDir + '/ml/descriptors/log.txt', 'a+') as outF:
                outF.write('#------------------------------\n')
                outF.write('formula: %s\n' % repr(formula))
                outF.write('target: %s\n' % repr(evalTarget))
                outF.write('propDict: %s\n' % (repr(propDict)))
                outF.write('keys: %s\n' % (repr(sorted(atomDict))))
            print('ick!')
            print('formula:', formula)
            print('target:', evalTarget)
            print('propDict:', propDict)
            print('keys:', atomDict.keys())
            traceback.print_exc()
            raise RuntimeError('Failure 2')
        else:
            v = -666
    return v
371
372
def CalcMultipleCompoundsDescriptor(composVect, argVect, atomDict, propDictList):
    """ calculates the value of the descriptor for a list of compounds

    **ARGUMENTS:**

      - composVect: a vector of vector/tuple containing the composition
         information.
         See _CalcSingleCompoundDescriptor()_ for an explanation of the elements.

      - argVect: a vector/tuple with three elements:

           1) AtomicDescriptorNames:  a list/tuple of the names of the
             atomic descriptors being used. These determine the
             meaning of $1, $2, etc. in the expression

           2) CompoundDescriptorNames:  a list/tuple of the names of the
             compound descriptors being used. These determine the
             meaning of $a, $b, etc. in the expression

           3) Expr: a string containing the expression to be used to
             evaluate the final result.

      - atomDict:
           a dictionary of atomic descriptors.  Each atomic entry is
           another dictionary containing the individual descriptors
           and their values

      - propDictList:
           a vector of dictionaries of descriptors for the compositions,
           one per entry of composVect.

    **RETURNS:**

      a vector containing the values of the descriptor for each
      compound.  Any given entry will be -666 if problems were
      encountered

    **NOTE:**

      - the expression is run through *eval*, so it must come from a
          trusted source.

    """
    res = [-666] * len(composVect)
    try:
        atomVarNames = argVect[0]
        compositionVarNames = argVect[1]
        formula = argVect[2]
        formula = _SubForCompoundDescriptors(formula, compositionVarNames, 'propDict')
        formula = _SubForAtomicVars(formula, atomVarNames, 'atomDict')
        evalTarget = _SubMethodArgs(formula, knownMethods)
    except Exception:
        return res
    # compos, propDict and atomDict are referenced by name from inside the
    # evaluated expression, so they must stay locals of this function
    for i, compos in enumerate(composVect):
        try:
            # looked up inside the try so that a missing entry yields -666
            # for this compound instead of aborting the whole calculation
            propDict = propDictList[i]
            res[i] = eval(evalTarget)
        except Exception:
            res[i] = -666
    return res
429
430
431
432
433
def _exampleCode():
    """ runs a set of sample expressions through both calculators

    For each expression this prints the expression, the result of
    CalcSingleCompoundDescriptor() and the result of
    CalcMultipleCompoundsDescriptor() for the same compound given twice.
    The last expression ("foo") is not a valid one.

    """
    descriptorNames = [['d1', 'd2', 's1'], ['d1', 'd2', 's1']]
    atomicValues = {
      'Fe': {'d1': 1., 'd2': 2., 's1': 'abc'},
      'Pt': {'d1': 10., 'd2': 20., 's1': 'def'},
    }
    compoundValues = {'d1': 100., 'd2': 200.}
    composition = [('Fe', 1), ('Pt', 1)]

    expressions = [
      "SUM($1)",
      "SUM($1)+SUM($2)",
      "SUM($1)+SUM($1)",
      "MEAN($1)",
      "DEV($2)",
      "MAX($1)",
      "MIN($1)/MAX($1)",
      "MIN($2)",
      "SUM($1)/$a",
      "sqrt($a+$b)",
      "SUM((3.*$1)/($2))",
      'HAS($3,"def")',
      'HAS($3,"xyz")',
      "foo",
    ]

    for expression in expressions:
        args = descriptorNames + [expression]
        print(expression)
        print(CalcSingleCompoundDescriptor(composition, args, atomicValues, compoundValues))
        print(
          CalcMultipleCompoundsDescriptor([composition, composition], args, atomicValues,
                                          [compoundValues, compoundValues]))
449
450
# Run the demonstration only when the module is executed as a script.
if __name__ == '__main__':
    _exampleCode()
453