1 '''
2 Importing pandasTools enables several features that allow for using RDKit molecules as columns of a
3 Pandas dataframe.
4 If the dataframe contains a molecule format in a column (e.g. smiles), like in this example:
5 >>> from rdkit.Chem import PandasTools
6 >>> import pandas as pd
7 >>> import os
8 >>> from rdkit import RDConfig
9 >>> antibiotics = pd.DataFrame(columns=['Name','Smiles'])
10 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C',
11 ... 'Name':'Penicilline G'}, ignore_index=True)#Penicilline G
12 >>> antibiotics = antibiotics.append({
13 ... 'Smiles':'CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O',
14 ... 'Name':'Tetracycline'}, ignore_index=True)#Tetracycline
15 >>> antibiotics = antibiotics.append({
16 ... 'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C',
17 ... 'Name':'Ampicilline'}, ignore_index=True)#Ampicilline
18 >>> print([str(x) for x in antibiotics.columns])
19 ['Name', 'Smiles']
20 >>> print(antibiotics)
21 Name Smiles
22 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
23 1 Tetracycline CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...
24 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
25
26 a new column can be created holding the respective RDKit molecule objects. The fingerprint can be
27 included to accelerate substructure searches on the dataframe.
28
29 >>> PandasTools.AddMoleculeColumnToFrame(antibiotics,'Smiles','Molecule',includeFingerprints=True)
30 >>> print([str(x) for x in antibiotics.columns])
31 ['Name', 'Smiles', 'Molecule']
32
33 A substructure filter can be applied on the dataframe using the RDKit molecule column,
34 because the ">=" operator has been modified to work as a substructure check.
35 Thus, the antibiotics containing the beta-lactam ring "C1C(=O)NC1" can be obtained by
36
37 >>> beta_lactam = Chem.MolFromSmiles('C1C(=O)NC1')
38 >>> beta_lactam_antibiotics = antibiotics[antibiotics['Molecule'] >= beta_lactam]
39 >>> print(beta_lactam_antibiotics[['Name','Smiles']])
40 Name Smiles
41 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
42 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
43
44
45 It is also possible to load an SDF file into a dataframe.
46
47 >>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
48 >>> frame = PandasTools.LoadSDF(sdfFile,smilesName='SMILES',molColName='Molecule',
49 ... includeFingerprints=True)
50 >>> frame.info # doctest: +SKIP
51 <bound method DataFrame.info of <class 'pandas.core.frame.DataFrame'>
52 Int64Index: 200 entries, 0 to 199
53 Data columns:
54 AMW 200 non-null values
55 CLOGP 200 non-null values
56 CP 200 non-null values
57 CR 200 non-null values
58 DAYLIGHT.FPG 200 non-null values
59 DAYLIGHT_CLOGP 200 non-null values
60 FP 200 non-null values
61 ID 200 non-null values
62 ISM 200 non-null values
63 LIPINSKI_VIOLATIONS 200 non-null values
64 NUM_HACCEPTORS 200 non-null values
65 NUM_HDONORS 200 non-null values
66 NUM_HETEROATOMS 200 non-null values
67 NUM_LIPINSKIHACCEPTORS 200 non-null values
68 NUM_LIPINSKIHDONORS 200 non-null values
69 NUM_RINGS 200 non-null values
70 NUM_ROTATABLEBONDS 200 non-null values
71 P1 30 non-null values
72 SMILES 200 non-null values
73 Molecule 200 non-null values
74 dtypes: object(20)>
75
76 Conversion to html is quite easy:
77 >>> htm = frame.to_html()
78 >>> str(htm[:36])
79 '<table border="1" class="dataframe">'
80
81 In order to support rendering the molecules as images in the HTML export of the dataframe,
82 the __str__ method is monkey-patched to return a base64 encoded PNG:
83 >>> molX = Chem.MolFromSmiles('Fc1cNc2ccccc12')
84 >>> print(molX) # doctest: +SKIP
85 <img src="data:image/png;base64,..." alt="Mol"/>
86 This can be reverted using the ChangeMoleculeRendering method
87 >>> ChangeMoleculeRendering(renderer='String')
88 >>> print(molX) # doctest: +SKIP
89 <rdkit.Chem.rdchem.Mol object at 0x10d179440>
90 >>> ChangeMoleculeRendering(renderer='PNG')
91 >>> print(molX) # doctest: +SKIP
92 <img src="data:image/png;base64,..." alt="Mol"/>
93
94 '''
95 from __future__ import print_function
96
97 from base64 import b64encode
98 import sys
99 import types
100
101 import numpy as np
102 from rdkit import Chem
103 from rdkit import DataStructs
104 from rdkit.Chem import AllChem
105 from rdkit.Chem import Draw
106 from rdkit.Chem.Draw import rdMolDraw2D
107 from rdkit.Chem import SDWriter
108 from rdkit.Chem import rdchem
109 from rdkit.Chem.Scaffolds import MurckoScaffold
110 from rdkit.six import BytesIO, string_types, PY3
111
112 try:
113 import pandas as pd
116 """ Get the pandas version as a tuple """
117 import re
118 try:
119 v = pd.__version__
120 except AttributeError:
121 v = pd.version.version
122 v = re.split(r'[^0-9,.]', v)[0].split('.')
123 return tuple(int(vi) for vi in v)
124
125 if _getPandasVersion() < (0, 10):
126 print("Pandas version {0} not compatible with tests".format(_getPandasVersion()),
127 file=sys.stderr)
128 pd = None
129 else:
130 if 'display.width' in pd.core.config._registered_options:
131 pd.set_option('display.width', 1000000000)
132 if 'display.max_rows' in pd.core.config._registered_options:
133 pd.set_option('display.max_rows', 1000000000)
134 elif 'display.height' in pd.core.config._registered_options:
135 pd.set_option('display.height', 1000000000)
136 if 'display.max_colwidth' in pd.core.config._registered_options:
137 pd.set_option('display.max_colwidth', 1000000000)
138
139 defPandasRendering = pd.core.frame.DataFrame.to_html
140 except ImportError:
141 import traceback
142 traceback.print_exc()
143 pd = None
144
145 except Exception as e:
146 import traceback
147 traceback.print_exc()
148 pd = None
149
# Locate the pandas formatting module. Its import path moved between pandas
# releases, so the known locations are tried in turn. Only ImportError is
# treated as "not at this location"; any other failure is allowed to surface
# instead of being silently swallowed.
if pd:
    try:
        from pandas.io.formats import format as fmt
    except ImportError:
        try:
            from pandas.formats import format as fmt
        except ImportError:
            from pandas.core import format as fmt
else:
    # Placeholder used when pandas could not be imported.
    fmt = 'Pandas not available'

# Module-level rendering settings read by the molecule rendering helpers.
highlightSubstructures = True  # highlight atoms matched by the last substructure check
molRepresentation = 'png'  # 'svg' selects SVG rendering, anything else PNG
molSize = (200, 200)  # size passed to the drawing code
167 '''
168 Patched default escaping of HTML control characters to allow molecule image rendering dataframes
169 '''
170 formatter = fmt.DataFrameFormatter(
171 self, buf=None, columns=None, col_space=None, colSpace=None, header=True, index=True,
172 na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None,
173 force_unicode=None, bold_rows=True, classes=None, escape=False)
174 formatter.to_html()
175 html = formatter.buf.getvalue()
176 return html
177
180 '''Ensure inheritance of patched to_html in "head" subframe
181 '''
182 df = self[:n]
183 df.to_html = types.MethodType(patchPandasHTMLrepr, df)
184 df.head = types.MethodType(patchPandasHeadMethod, df)
185 return df
186
189 """displayhook function for PNG data"""
190 s = b64encode(x).decode('ascii')
191 pd.set_option('display.max_columns', len(s) + 1000)
192 pd.set_option('display.max_rows', len(s) + 1000)
193 if len(s) + 100 > pd.get_option("display.max_colwidth"):
194 pd.set_option("display.max_colwidth", len(s) + 1000)
195 return s
196
199 """ mol rendered as SVG """
200 from IPython.display import SVG
201 from rdkit.Chem import rdDepictor
202 from rdkit.Chem.Draw import rdMolDraw2D
203 try:
204
205 mol.GetConformer(-1)
206 except ValueError:
207 rdDepictor.Compute2DCoords(mol)
208 drawer = rdMolDraw2D.MolDraw2DSVG(*size)
209 drawer.DrawMolecule(mol, highlightAtoms=highlightAtoms)
210 drawer.FinishDrawing()
211 svg = drawer.GetDrawingText().replace('svg:', '')
212 return SVG(svg).data
213
214
# Choose the fingerprint used for substructure pre-screening: the Avalon
# fingerprint when the Avalon tools can be imported, otherwise the RDKit
# pattern fingerprint.
try:
    from rdkit.Avalon import pyAvalonTools as pyAvalonTools
except ImportError:

    def _fingerprinter(x, y):
        """Return the 2048-bit pattern fingerprint of x (y is ignored)."""
        return Chem.PatternFingerprint(x, fpSize=2048)
else:

    def _fingerprinter(x, y):
        """Return the Avalon substructure-screening fingerprint of x; y is passed as isQuery."""
        return pyAvalonTools.GetAvalonFP(x, isQuery=y, bitFlags=pyAvalonTools.avalonSSSBits)
225 """Allows for substructure check using the >= operator (X has substructure Y -> X >= Y) by
226 monkey-patching the __ge__ function
227 This has the effect that the pandas/numpy rowfilter can be used for substructure filtering
228 (filtered = dframe[dframe['RDKitColumn'] >= SubstructureMolecule])
229 """
230 if x is None or y is None:
231 return False
232 if hasattr(x, '_substructfp'):
233 if not hasattr(y, '_substructfp'):
234 y._substructfp = _fingerprinter(y, True)
235 if not DataStructs.AllProbeBitsMatch(y._substructfp, x._substructfp):
236 return False
237 match = x.GetSubstructMatch(y)
238 x.__sssAtoms = []
239 if match:
240 if highlightSubstructures:
241 x.__sssAtoms = list(match)
242 return True
243 else:
244 return False
245
248 '''returns the molecules as base64 encoded PNG image
249 '''
250 if highlightSubstructures and hasattr(x, '__sssAtoms'):
251 highlightAtoms = x.__sssAtoms
252 else:
253 highlightAtoms = []
254 if molRepresentation.lower() == 'svg':
255 from IPython.display import SVG
256 svg = Draw._moltoSVG(x, molSize, highlightAtoms, "", True)
257 return SVG(svg).data
258 else:
259 data = Draw._moltoimg(x,molSize,highlightAtoms,"",returnPNG=True, kekulize=True)
260 return '<img src="data:image/png;base64,%s" alt="Mol"/>' % _get_image(data)
261
265
268 '''Precomputes fingerprints and stores results in molecule objects to accelerate
269 substructure matching
270 '''
271 if m is not None:
272 m._substructfp = _fingerprinter(m, False)
273 return m
274
277 '''Changes the default dataframe rendering to not escape HTML characters, thus allowing
278 rendered images in all dataframes.
279 IMPORTANT: THIS IS A GLOBAL CHANGE THAT WILL AFFECT THE COMPLETE PYTHON SESSION. If you want
280 to change the rendering only for a single dataframe use the "ChangeMoleculeRendering" method
281 instead.
282 '''
283 if images:
284 pd.core.frame.DataFrame.to_html = patchPandasHTMLrepr
285 else:
286 pd.core.frame.DataFrame.to_html = defPandasRendering
287
290 '''Converts the molecules contained in "smilesCol" to RDKit molecules and appends them to the
291 dataframe "frame" using the specified column name.
292 If desired, a fingerprint can be computed and stored with the molecule objects to accelerate
293 substructure matching
294 '''
295 if not includeFingerprints:
296 frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)
297 else:
298 frame[molCol] = frame[smilesCol].map(
299 lambda smiles: _MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
300 RenderImagesInAllDataFrames(images=True)
301
304 '''Allows to change the rendering of the molecules between base64 PNG images and string
305 representations.
306 This serves two purposes: First it allows to avoid the generation of images if this is
307 not desired and, secondly, it allows to enable image rendering for newly created dataframe
308 that already contains molecules, without having to rerun the time-consuming
309 AddMoleculeColumnToFrame. Note: this behaviour is, because some pandas methods, e.g. head()
310 returns a new dataframe instance that uses the default pandas rendering (thus not drawing
311 images for molecules) instead of the monkey-patched one.
312 '''
313 if renderer == 'String':
314 Chem.Mol.__str__ = PrintDefaultMolRep
315 else:
316 Chem.Mol.__str__ = PrintAsBase64PNGString
317 if frame is not None:
318 frame.to_html = types.MethodType(patchPandasHTMLrepr, frame)
319
320
def LoadSDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
            isomericSmiles=False, smilesName=None, embedProps=False):
    '''Read file in SDF format and return as Pandas data frame.

    If embedProps=True all properties also get embedded in Mol objects in the molecule column.
    If molColName=None molecules would not be present in resulting DataFrame (only properties
    would be read).

    Arguments:
      - filename: path of the SD file (a name ending in ".gz" is opened with gzip) or an
        already opened file-like object, which is used as is and left open.
      - idName: column that receives the molecule title (the "_Name" property).
      - molColName: column that receives the molecule objects; None to omit them. When
        None, the supplier is also told not to sanitize.
      - includeFingerprints: if True, molecules are passed through _MolPlusFingerprint
        before being stored.
      - isomericSmiles: forwarded to Chem.MolToSmiles when smilesName is given.
      - smilesName: column that receives a SMILES string for each molecule; None to omit.
      - embedProps: if False (and molecules are kept), the SD properties are cleared from
        the molecule objects after being copied into the row.

    Returns a DataFrame with one row per readable record, indexed by the record's position
    in the file (records the supplier returns as None are skipped, leaving gaps).
    As a side effect, image rendering is switched on for all dataframes.
    '''
    if isinstance(filename, string_types):
        if filename.lower().endswith(".gz"):
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        close = f.close
    else:
        # Caller-supplied file object: the caller stays responsible for closing it.
        f = filename
        close = None

    records = []
    indices = []
    try:
        supplier = Chem.ForwardSDMolSupplier(f, sanitize=(molColName is not None))
        for i, mol in enumerate(supplier):
            if mol is None:
                continue
            row = {k: mol.GetProp(k) for k in mol.GetPropNames()}
            if molColName is not None and not embedProps:
                for prop in mol.GetPropNames():
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                row[smilesName] = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
            if molColName is not None:
                row[molColName] = _MolPlusFingerprint(mol) if includeFingerprints else mol
            records.append(row)
            indices.append(i)
    finally:
        # Release a handle opened here even if reading fails part-way through.
        if close is not None:
            close()

    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)
362
363
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as
    SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export
    all columns.
    The "allNumeric" flag allows to automatically include all numeric columns in the output.
    User has to make sure that correct data type is assigned to column.
    "idName" can be used to select a column to serve as molecule title. It can be set to
    "RowID" to use the dataframe row key as title.

    Arguments:
      - df: dataframe holding the molecules and the columns to export.
      - out: output file name or object handed to SDWriter; a name ending in ".gz" is
        opened with gzip here and closed again before returning.
      - molColName: column holding the molecule objects.
      - idName: column used as molecule title, "RowID" for the row key, or None.
      - properties: iterable of column names to write as SD tags (not modified).
      - allNumeric: additionally export every column with a floating point or integer dtype.

    The molecule and id columns are never written as SD tags.
    '''
    close = None
    if isinstance(out, string_types) and out.lower().endswith(".gz"):
        import gzip
        out = gzip.open(out, "wt" if PY3 else "wb")
        close = out.close

    try:
        writer = SDWriter(out)
        try:
            properties = [] if properties is None else list(properties)
            if allNumeric:
                # np.floating / np.integer match every float and integer width; the
                # Python builtins float / int only match the default 64-bit dtypes on
                # current NumPy. Columns already requested are not added twice.
                for col in df.dtypes.keys():
                    dtype = df.dtypes[col]
                    isNumeric = (np.issubdtype(dtype, np.floating) or
                                 np.issubdtype(dtype, np.integer))
                    if isNumeric and col not in properties:
                        properties.append(col)

            if molColName in properties:
                properties.remove(molColName)
            if idName in properties:
                properties.remove(idName)
            writer.SetProps(properties)

            for rowId, row in df.iterrows():
                # Build a new Mol from the cell so that the properties set below
                # are not written onto the object stored in the dataframe.
                mol = Chem.Mol(row[molColName])

                if idName is not None:
                    if idName == 'RowID':
                        mol.SetProp('_Name', str(rowId))
                    else:
                        mol.SetProp('_Name', str(row[idName]))

                for p in properties:
                    cell_value = row[p]
                    if np.issubdtype(type(cell_value), np.floating):
                        # Fixed-point notation with trailing zeros stripped, keeping
                        # at least one digit after the decimal point.
                        s = '{:f}'.format(cell_value).rstrip("0")
                        if s[-1] == ".":
                            s += "0"
                        mol.SetProp(p, s)
                    else:
                        mol.SetProp(p, str(cell_value))
                writer.write(mol)
        finally:
            writer.close()
    finally:
        # Close the gzip handle opened above even if writing failed.
        if close is not None:
            close()
421
422
423 _saltRemover = None
435
436
def SaveSMILESFromFrame(frame, outFile, molCol='ROMol', NamesCol='', isomericSmiles=False):
    '''
    Writes the molecules of column "molCol" to "outFile" as a smi file.
    If "NamesCol" is given, the string form of each value in that column is set as
    the "_Name" property of the corresponding molecule before it is written.
    '''
    writer = Chem.SmilesWriter(outFile, isomericSmiles=isomericSmiles)
    if NamesCol == '':
        for mol in frame[molCol]:
            writer.write(mol)
    else:
        titles = (str(value) for value in frame[NamesCol])
        for mol, title in zip(frame[molCol], titles):
            mol.SetProp('_Name', title)
            writer.write(mol)
    writer.close()
452
455 """
456 Saves pandas DataFrame as a xlsx file with embedded images.
457 It maps numpy data types to excel cell types:
458 int, float -> number
459 datetime -> datetime
460 object -> string (limited to 32k character - xlsx limitations)
461
462 Cells with compound images are a bit larger than images due to excel.
463 Column width weirdness explained (from xlsxwriter docs):
464 The width corresponds to the column width value that is specified in Excel.
465 It is approximately equal to the length of a string in the default font of Calibri 11.
466 Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
467 This feature is only available at runtime from within Excel.
468 """
469
470 import xlsxwriter
471
472 cols = list(frame.columns)
473 cols.remove(molCol)
474 dataTypes = dict(frame.dtypes)
475
476 workbook = xlsxwriter.Workbook(outFile)
477 worksheet = workbook.add_worksheet()
478 worksheet.set_column('A:A', size[0] / 6.)
479
480
481 c2 = 1
482 for x in cols:
483 worksheet.write_string(0, c2, x)
484 c2 += 1
485
486 c = 1
487 for _, row in frame.iterrows():
488 image_data = BytesIO()
489 img = Draw.MolToImage(row[molCol], size=size)
490 img.save(image_data, format='PNG')
491
492 worksheet.set_row(c, height=size[1])
493 worksheet.insert_image(c, 0, "f", {'image_data': image_data})
494
495 c2 = 1
496 for x in cols:
497 if str(dataTypes[x]) == "object":
498 worksheet.write_string(c, c2, str(row[x])[:32000])
499 elif ('float' in str(dataTypes[x])) or ('int' in str(dataTypes[x])):
500 if (row[x] != np.nan) or (row[x] != np.inf):
501 worksheet.write_number(c, c2, row[x])
502 elif 'datetime' in str(dataTypes[x]):
503 worksheet.write_datetime(c, c2, row[x])
504 c2 += 1
505 c += 1
506
507 workbook.close()
508 image_data.close()
509
512 '''
513 Draw grid image of mols in pandas DataFrame.
514 '''
515 if legendsCol:
516 if legendsCol == frame.index.name:
517 kwargs['legends'] = [str(c) for c in frame.index]
518 else:
519 kwargs['legends'] = [str(c) for c in frame[legendsCol]]
520 return Draw.MolsToGridImage(frame[column], **kwargs)
521
522
523 -def AddMurckoToFrame(frame, molCol='ROMol', MurckoCol='Murcko_SMILES', Generic=False):
535
538 """
539 Aligns mol (RDKit mol object) to scaffold (SMILES string)
540 """
541 scaffold = Chem.MolFromSmiles(scaffold)
542 AllChem.Compute2DCoords(scaffold)
543 AllChem.GenerateDepictionMatching2DStructure(mol, scaffold)
544 return mol
545
546
def AlignToScaffold(frame, molCol='ROMol', scaffoldCol='Murcko_SMILES'):
    '''
    Replaces column "molCol" with the result of calling AlignMol row by row on the
    molecule in "molCol" and the scaffold in "scaffoldCol".
    '''

    def _alignRow(row):
        return AlignMol(row[molCol], row[scaffoldCol])

    frame[molCol] = frame.apply(_alignRow, axis=1)
552
564
571
572
573 _originalSettings = {}
574 InstallPandasTools()
582 import doctest
583 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS + doctest.NORMALIZE_WHITESPACE,
584 verbose=verbose)
585 if(failed):
586 sys.exit(failed)
587
588 if __name__ == '__main__':
589 import unittest
590 try:
591 import xlsxwriter
592 except ImportError:
593 print('not there')
594 xlsxwriter = None
596 @unittest.skipIf(xlsxwriter is None,'xlsxwriter not installed')
598 import os
599 from rdkit import RDConfig
600 sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
601 frame = LoadSDF(sdfFile)
602 SaveXlsxFromFrame(frame,'foo.xlsx')
603
604 if pd is None:
605 print("pandas installation not found, skipping tests", file=sys.stderr)
606 elif _getPandasVersion() < (0, 10):
607 print("pandas installation >=0.10 not found, skipping tests", file=sys.stderr)
608 else:
609 _runDoctests();
610 unittest.main()
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646