1
2
3
4 """ descriptor calculator for compounds defined by a composition alone
5 (only the composition is required)
6
7 """
8 from __future__ import print_function
9
10 from rdkit import RDConfig
11 from rdkit.ML.Descriptors import Parser, Descriptors
12 from rdkit.utils import chemutils
13
14
# Electron-count pseudo-descriptors, as (name, description) pairs.  They are
# appended to the descriptor names read from the database and are stripped
# from the list of columns requested from the atomic-data table.
countOptions = [
    ('NVAL', 'total number of valence electrons'),
    ('NVAL_NO_FULL_F', 'number of valence electrons neglecting filled f shells'),
    ('NVAL_NO_FULL_D', 'number of valence electrons neglecting filled d shells'),
    ('NVAL_NO_FULL', 'number of valence electrons neglecting filled f and d shells'),
]
19
20
22 """ gets possible descriptor names from a database
23
24 **Arguments**
25
26 - db: the name of the database to use
27
28 - tbl1: the name of the table to be used for reading descriptor values
29
30 - tbl2: the name of the table to be used for reading notes about the
31 descriptors (*descriptions of the descriptors if you like*)
32
33 - user: the user name for DB access
34
35 - password: the password for DB access
36
37 **Returns**
38
39 a 2-tuple containing:
40
41 1) a list of column names
42
43 2) a list of column descriptors
44
45 **Notes**
46
47 - this uses _Dbase.DbConnection.DbConnect_ for querying the database
48
49 - it is assumed that tbl2 includes 'property' and 'notes' columns
50
51 """
52 from rdkit.Dbase.DbConnection import DbConnect
53 conn = DbConnect(db, user=user, password=password)
54
55 colNames = conn.GetColumnNames(table=tbl1)
56 colDesc = map(lambda x: (x[0].upper(), x[1]), conn.GetColumns('property,notes', table=tbl2))
57 for name, desc in countOptions:
58 colNames.append(name)
59 colDesc.append((name, desc))
60 return colNames, colDesc
61
62
64 """ used for calculating descriptors
65
66 This is the central point for descriptor calculation
67
68 **Notes**
69
70 - There are two kinds of descriptors this cares about:
71
72 1) *Simple Descriptors* can be calculated solely using atomic descriptor
73 values and the composition of the compound. The full list of possible
74 simple descriptors is determined by the types of *Calculator Methods*
75 (see below) and the contents of an atomic database.
76
77 Simple Descriptors can be marked as *nonZeroDescriptors*. These are used
78 to winnow out atom types where particular atomic descriptors are zero
79 (usually indicating that the value is unknown)
80
81 Simple Descriptors are maintained locally in the _simpleList_
82
83 2) *Compound Descriptors* may rely upon more complicated computation schemes
84 and descriptors for the compound as a whole (e.g. structural variables, etc.).
85 The full list of compound descriptors is limitless. They are calculated using
86 the _ML.Descriptors.Parser_ module.
87
88 Compound Descriptors are maintained locally in the _compoundList_
89
90 - This class has some special methods which are labelled as *Calculator Method*.
91 These are used internally to take atomic descriptors and reduce them to a single
92 simple descriptor value for a composition. They are primarily intended for internal use.
93
94 - a *composition vector* is a list of 2-tuples: '[(atom1name,atom1Num),...]'
95 where atom1Num is the contribution of the atom to the stoichiometry of the
96 compound. No assumption is made about the stoichiometries (i.e. they don't
97 have to be either integral or all sum to one).
98
99 """
100
101
102
103
104
def SUM(self, desc, compos):
    """ *Calculator Method*

    stoichiometry-weighted sum of an atomic descriptor

    **Arguments**

      - desc: the name of the atomic descriptor, looked up in _self.atomDict_

      - compos: the composition vector, '[(atomName, amount), ...]'

    **Returns**

      a float (0.0 for an empty composition)

    **Notes**

      - a _KeyError_ propagates if an atom or descriptor is not present
        in _self.atomDict_

    """
    total = 0.0
    for symbol, amount in compos:
        total += self.atomDict[symbol][desc] * amount
    return total
125
def MEAN(self, desc, compos):
    """ *Calculator Method*

    stoichiometry-weighted average of an atomic descriptor

    **Arguments**

      - desc: the name of the atomic descriptor, looked up in _self.atomDict_

      - compos: the composition vector, '[(atomName, amount), ...]'

    **Returns**

      a float: sum(value * amount) / sum(amount)

    **Notes**

      - a _ZeroDivisionError_ is raised if the amounts sum to zero
        (this includes an empty composition)

      - a _KeyError_ propagates if an atom or descriptor is not present
        in _self.atomDict_

    """
    weighted = 0.0
    norm = 0.0
    for symbol, amount in compos:
        weighted += self.atomDict[symbol][desc] * amount
        norm += amount
    return weighted / norm
148
def DEV(self, desc, compos):
    """ *Calculator Method*

    stoichiometry-weighted mean absolute deviation of an atomic descriptor
    about its weighted mean (as computed by _self.MEAN_)

    **Arguments**

      - desc: the name of the atomic descriptor, looked up in _self.atomDict_

      - compos: the composition vector, '[(atomName, amount), ...]'

    **Returns**

      a float: sum(abs(value - mean) * amount) / sum(amount)

    **Notes**

      - a _ZeroDivisionError_ is raised if the amounts sum to zero

    """
    centre = self.MEAN(desc, compos)
    spread = 0.0
    norm = 0.0
    for symbol, amount in compos:
        spread += abs(self.atomDict[symbol][desc] - centre) * amount
        norm += amount
    return spread / norm
172
def MIN(self, desc, compos):
    """ *Calculator Method*

    minimum of the descriptor values across the composition

    **Arguments**

      - desc: the name of the atomic descriptor, looked up in _self.atomDict_

      - compos: the composition vector; only the atom name (first element
        of each entry) is used, the stoichiometry is ignored

    **Returns**

      the smallest descriptor value among the atoms in the composition

    **Notes**

      - a _ValueError_ is raised for an empty composition

      - a _KeyError_ propagates if an atom or descriptor is not present
        in _self.atomDict_

    """
    # A generator expression closes over desc/self directly, so the old
    # lambda default-argument binding trick is not needed.
    return min(self.atomDict[entry[0]][desc] for entry in compos)
190
def MAX(self, desc, compos):
    """ *Calculator Method*

    maximum of the descriptor values across the composition

    **Arguments**

      - desc: the name of the atomic descriptor, looked up in _self.atomDict_

      - compos: the composition vector; only the atom name (first element
        of each entry) is used, the stoichiometry is ignored

    **Returns**

      the largest descriptor value among the atoms in the composition

    **Notes**

      - a _ValueError_ is raised for an empty composition

      - a _KeyError_ propagates if an atom or descriptor is not present
        in _self.atomDict_

    """
    # A generator expression closes over desc/self directly, so the old
    # lambda default-argument binding trick is not needed.
    return max(self.atomDict[entry[0]][desc] for entry in compos)
208
209
210
211
212
def ProcessSimpleList(self):
    """ Handles the list of simple descriptors

    This constructs the list of _nonZeroDescriptors_ and _requiredDescriptors_
    and strips the 'NONZERO' pseudo-operation out of _simpleList_:

      - every descriptor flagged 'NONZERO' contributes a '<name> != 0'
        clause to _nonZeroDescriptors_, unless it is one of the electron
        count names in _countOptions_

      - an entry whose only operation is 'NONZERO' is removed from
        _simpleList_; otherwise just the 'NONZERO' marker is removed from
        its operation list (in place)

      - _requiredDescriptors_ ends up holding the descriptor names left in
        _simpleList_, minus the _countOptions_ names

    **Notes**

      - real lists are built here rather than using map(): under Python 3
        map() returns a one-shot iterator, which broke the repeated
        membership tests and the later .remove()/.append() calls on
        _requiredDescriptors_

    """
    countNames = [option[0] for option in countOptions]

    self.nonZeroDescriptors = []
    # iterate over a shallow copy because entries may be removed from simpleList
    for entry in self.simpleList[:]:
        if 'NONZERO' not in entry[1]:
            continue
        if entry[0] not in countNames:
            self.nonZeroDescriptors.append('%s != 0' % entry[0])
        if len(entry[1]) == 1:
            self.simpleList.remove(entry)
        else:
            self.simpleList[self.simpleList.index(entry)][1].remove('NONZERO')

    self.requiredDescriptors = [entry[0] for entry in self.simpleList]
    for name in countNames:
        if name in self.requiredDescriptors:
            self.requiredDescriptors.remove(name)
238
def ProcessCompoundList(self):
    """ Adds entries from the _compoundList_ to the list of _requiredDescriptors_

    Every compound descriptor is inspected; each atomic descriptor name it
    lists (second element of the entry) is appended to _requiredDescriptors_
    unless it is the empty string or already present.

    """
    needed = self.requiredDescriptors
    for compound in self.compoundList:
        for name in compound[1]:
            if name == '' or name in needed:
                continue
            needed.append(name)
251
def BuildAtomDict(self):
    """ builds the local atomic dict

    Only the descriptors we are interested in are pulled from the database,
    so the simple and compound lists are processed first to work out
    _requiredDescriptors_ and _nonZeroDescriptors_.

    **Notes**

      - the _nonZeroDescriptors_ clauses are and-ed together into a
        'where ...' string (empty when there are no clauses)

      - this uses _chemutils.GetAtomicData_ to actually pull the data
        into _self.atomDict_

    """
    self.ProcessSimpleList()
    self.ProcessCompoundList()

    self.atomDict = {}
    clauses = ' and '.join(self.nonZeroDescriptors)
    whereString = 'where ' + clauses if clauses else ''
    chemutils.GetAtomicData(self.atomDict, self.requiredDescriptors, self.dbName,
                            self.dbTable, whereString, self.dbUser, self.dbPassword,
                            includeElCounts=1)
273
275 """ calculates all simple descriptors for a given composition
276
277 **Arguments**
278
279 - compos: a string representation of the composition
280
281 - composList: a *composVect*
282
283 The client must provide either _compos_ or _composList_. If both are
284 provided, _composList_ takes priority.
285
286 **Returns**
287 the list of descriptor values
288
289 **Notes**
290
291 - when _compos_ is provided, this uses _chemutils.SplitComposition_
292 to split the composition into its individual pieces
293
294 - if problems are encountered because of either an unknown descriptor or
295 atom type, a _KeyError_ will be raised.
296
297 """
298 if composList is None:
299 composList = chemutils.SplitComposition(compos)
300 try:
301 res = []
302 for descName, targets in self.simpleList:
303 for target in targets:
304 try:
305 method = getattr(self, target)
306 except AttributeError:
307 print('Method %s does not exist' % (target))
308 else:
309 res.append(method(descName, composList))
310 except KeyError as msg:
311 print('composition %s caused problems' % composList)
312 raise KeyError(msg)
313 return res
314
316 """ calculates all compound descriptors for a given composition
317
318 **Arguments**
319
320 - compos: a string representation of the composition
321
322 - composList: a *composVect*
323
324 - propDict: a dictionary containing the properties of the composition
325 as a whole (e.g. structural variables, etc.)
326
327 The client must provide either _compos_ or _composList_. If both are
328 provided, _composList_ takes priority.
329
330 **Returns**
331 the list of descriptor values
332
333 **Notes**
334
335 - when _compos_ is provided, this uses _chemutils.SplitComposition_
336 to split the composition into its individual pieces
337
338 """
339 if composList is None:
340 composList = chemutils.SplitComposition(compos)
341 res = []
342 for cl in self.compoundList:
343 val = Parser.CalcSingleCompoundDescriptor(composList, cl[1:], self.atomDict, propDict)
344 res.append(val)
345 return res
346
348 """ calculates all descriptors for a given composition
349
350 **Arguments**
351
352 - compos: a string representation of the composition
353
354 - propDict: a dictionary containing the properties of the composition
355 as a whole (e.g. structural variables, etc.). These are used to
356 generate Compound Descriptors
357
358 **Returns**
359 the list of all descriptor values
360
361 **Notes**
362
363 - this uses _chemutils.SplitComposition_
364 to split the composition into its individual pieces
365
366 """
367 composList = chemutils.SplitComposition(composVect[0])
368 try:
369 r1 = self.CalcSimpleDescriptorsForComposition(composList=composList)
370 except KeyError:
371 res = []
372 else:
373 r2 = self.CalcCompoundDescriptorsForComposition(composList=composList, propDict=propDict)
374 res = r1 + r2
375
376 return tuple(res)
377
378 CalcDescriptors = CalcDescriptorsForComposition
379
def GetDescriptorNames(self):
    """ returns the names of the descriptors this calculator generates

    **Returns**

      a tuple of names: one '<METHOD>_<DESCRIPTOR>' entry for each simple
      descriptor operation that exists as a method on this object, followed
      by the name of each compound descriptor

    **Notes**

      - the names are built once and cached in _self.descriptorNames_

      - a tuple is returned on every call; previously the first call gave
        a tuple but later calls handed back the cached list itself

      - operations with no matching method are reported with a printed
        message and skipped

    """
    if self.descriptorNames is None:
        names = []
        for descName, targets in self.simpleList:
            for target in targets:
                if hasattr(self, target):
                    names.append('%s_%s' % (target, descName))
                else:
                    print('Method %s does not exist' % (target))
        names.extend(entry[0] for entry in self.compoundList)
        self.descriptorNames = names
    # return a copy as a tuple so callers cannot mutate the cache
    return tuple(self.descriptorNames)
398
def __init__(self, simpleList, compoundList=None, dbName=None, dbTable='atomic_data',
             dbUser='sysdba', dbPassword='masterkey'):
    """ Constructor

    **Arguments**

      - simpleList: list of simple descriptors to be calculated
        (see below for format)

      - compoundList: list of compound descriptors to be calculated
        (see below for format); None is treated as an empty list

      - dbName: name of the atomic database to be used; when None,
        _RDConfig.RDDataDatabase_ is used

      - dbTable: name of the table in _dbName_ which has atomic data

      - dbUser: user name for DB access

      - dbPassword: password for DB access

    **Note**

      - format of simpleList:
         a list of 2-tuples containing:

           1) name of the atomic descriptor

           2) a list of operations on that descriptor (e.g. NonZero, Max, etc.)
              These must correspond to the *Calculator Method* names.

        Both the names and the operations are stored upper-cased.

      - format of compoundList:
         a list of 4-tuples containing:

           1) name of the descriptor to be calculated

           2) list of selected atomic descriptor names (define $1, $2, etc.)

           3) list of selected compound descriptor names (define $a, $b, etc.)

           4) text formula defining the calculation (see _Parser_)

    """
    dbName = RDConfig.RDDataDatabase if dbName is None else dbName

    Descriptors.DescriptorCalculator.__init__(self)

    normalized = []
    for item in simpleList:
        operations = [operation.upper() for operation in item[1]]
        normalized.append((item[0].upper(), operations))
    self.simpleList = normalized

    # the descriptor names are computed lazily by GetDescriptorNames()
    self.descriptorNames = None
    self.compoundList = [] if compoundList is None else compoundList

    self.dbName = dbName
    self.dbTable = dbTable
    self.dbUser = dbUser
    self.dbPassword = dbPassword
455
456
def _exampleCode():
    """ Small demo: builds a calculator, loads the atomic data and prints
    the simple descriptors for a few compositions.

    """
    simple = [
        ('DED', ['NonZero', 'Mean', 'Dev']),
        ('M_B_electroneg', ['NonZero']),
        ('Cov_rad', ['Max', 'Min']),
    ]
    calc = CompoundDescriptorCalculator(simple)
    calc.BuildAtomDict()
    print('len:', len(calc.atomDict.keys()))
    for symbol in list(calc.atomDict)[-4:-1]:
        print(symbol, calc.atomDict[symbol])

    print('descriptors:', calc.GetDescriptorNames())
    for formula in ['Nb', 'Nb3', 'NbPt', 'Nb2Pt']:
        values = calc.CalcSimpleDescriptorsForComposition(formula)
        print(formula, values)


if __name__ == '__main__':
    _exampleCode()
475