Package rdkit :: Package ML :: Package Data :: Module MLData
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.Data.MLData

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  #    All Rights Reserved 
  4  # 
  5  """ classes to be used to help work with data sets 
  6   
  7  """ 
  8  from __future__ import print_function 
  9   
 10  import copy 
 11  import math 
 12   
 13  import numpy 
 14   
 15  from rdkit.six import integer_types 
 16   
 17   
 18  numericTypes = integer_types + (float, )  # exact types MLDataSet._CalcNPossible accepts as numeric (checked via type(d) in numericTypes)
 19   
 20   
class MLDataSet(object):
    """ A data set for holding general data (floats, ints, and strings)

    **Note**
      this is intended to be a read-only data structure
      (i.e. after calling the constructor you cannot touch it)
    """

    def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None, qBounds=None,
                 varNames=None, ptNames=None, nResults=1):
        """ Constructor

        **Arguments**

          - data: a list of lists containing the data. The data are copied, so don't worry
                about us overwriting them.

          - nVars: the number of variables

          - nPts: the number of points

          - nPossibleVals: a list containing the number of possible values
                         for each variable (should contain 0 when not relevant)
                         This is _nVars_ long

          - qBounds: a list of lists containing quantization bounds for variables
                   which are to be quantized (note, this class does not quantize
                   the variables itself, it merely stores quantization bounds.
                   an empty sublist indicates no quantization for a given variable
                   This is _nVars_ long

          - varNames: a list of the names of the variables.
                   This is _nVars_ long

          - ptNames: the names (labels) of the individual data points
                   This is _nPts_ long

          - nResults: the number of results columns in the data lists.  This is usually
                      1, but can be higher.
        """
        # shallow-copy every example so later edits to the caller's rows do not leak in
        self.data = [x[:] for x in data]
        self.nResults = nResults
        # width of a single example; an empty data set has no columns at all
        nCols = len(self.data[0]) if self.data else 0
        if nVars is None:
            nVars = nCols - self.nResults if self.data else 0
        self.nVars = nVars
        if nPts is None:
            nPts = len(data)
        self.nPts = nPts
        if qBounds is None:
            # one *independent* empty list per column; [[]] * n would make every
            # entry the same list object, so mutating one would change them all
            qBounds = [[] for _ in range(nCols)]
        self.qBounds = qBounds
        if nPossibleVals is None:
            nPossibleVals = self._CalcNPossible(self.data)
        self.nPossibleVals = nPossibleVals
        if varNames is None:
            varNames = [''] * self.nVars
        self.varNames = varNames
        if ptNames is None:
            ptNames = [''] * self.nPts
        self.ptNames = ptNames

    def _CalcNPossible(self, data):
        """calculates the number of possible values of each variable (where possible)

        **Arguments**

           -data: a list of examples to be used

        **Returns**

           a list of nPossible values for each variable

        **Notes**

          - columns with quantization bounds report _len(bounds)+1_

          - columns holding only integral numeric values report _max value + 1_

          - any other column reports 0

        """
        nCols = self.GetNVars() + self.nResults
        nPossible = [-1] * nCols
        # columns that still need to be scanned in the data
        cols = list(range(nCols))
        for i, bounds in enumerate(self.qBounds):
            if len(bounds) > 0:
                nPossible[i] = len(bounds)
                cols.remove(i)

        for pt in data:
            if not cols:
                # every column has been resolved, nothing left to learn
                break
            # iterate over a copy because columns are dropped while scanning
            for col in cols[:]:
                d = pt[col]
                # exact type match is intentional: subclasses (e.g. bool) are not numeric here
                if type(d) in numericTypes and math.floor(d) == d:
                    nPossible[col] = max(math.floor(d), nPossible[col])
                else:
                    # non-numeric or non-integral value: the column is not countable
                    nPossible[col] = -1
                    cols.remove(col)
        return [int(x) + 1 for x in nPossible]

    def GetNResults(self):
        """ returns the number of result columns """
        return self.nResults

    def GetNVars(self):
        """ returns the number of variables (non-result columns) """
        return self.nVars

    def GetNPts(self):
        """ returns the number of data points """
        return self.nPts

    def GetNPossibleVals(self):
        """ returns the list with the number of possible values per column """
        return self.nPossibleVals

    def GetQuantBounds(self):
        """ returns the stored quantization bounds (not a copy) """
        return self.qBounds

    def __getitem__(self, idx):
        """ returns the named example at _idx_: the point name followed by a copy of its data """
        res = [self.ptNames[idx]] + self.data[idx][:]
        return res

    def __setitem__(self, idx, val):
        """ replaces the named example at _idx_

        _val_ must hold the point name followed by _nVars + nResults_ values,
        otherwise a ValueError is raised.
        """
        if len(val) != self.GetNVars() + self.GetNResults() + 1:
            raise ValueError('bad value in assignment')
        self.ptNames[idx] = val[0]
        self.data[idx] = val[1:]
        return val

    def GetNamedData(self):
        """ returns a list of named examples

         **Note**

           a named example is the result of prepending the example
            name to the data list

        """
        return [[self.ptNames[i]] + self.data[i][:] for i in range(self.nPts)]

    def GetAllData(self):
        """ returns a *copy* of the data

        """
        return copy.deepcopy(self.data)

    def GetInputData(self):
        """ returns the input data

         **Note**

           _inputData_ means the examples without their result fields
            (the last _NResults_ entries)

        """
        v = self.GetNResults()
        if not v:
            # x[:-0] is x[:0], i.e. empty; with no result columns every field is input
            return [x[:] for x in self.data]
        return [x[:-v] for x in self.data]

    def GetResults(self):
        """ Returns the result fields from each example

          a list of lists when there is more than one result column,
          otherwise a flat list holding the last entry of each example

        """
        if self.GetNResults() > 1:
            v = self.GetNResults()
            res = [x[-v:] for x in self.data]
        else:
            res = [x[-1] for x in self.data]
        return res

    def GetVarNames(self):
        """ returns the variable names (not a copy) """
        return self.varNames

    def GetPtNames(self):
        """ returns the point names (not a copy) """
        return self.ptNames

    def AddPoint(self, pt):
        """ appends one named example: _pt[0]_ is the name, _pt[1:]_ the data """
        self.data.append(pt[1:])
        self.ptNames.append(pt[0])
        self.nPts += 1

    def AddPoints(self, pts, names):
        """ appends several examples (_pts_) and their names (_names_)

          raises a ValueError if the two sequences differ in length
        """
        if len(pts) != len(names):
            raise ValueError("input length mismatch")
        self.data += pts
        self.ptNames += names
        self.nPts = len(self.data)
202 203
class MLQuantDataSet(MLDataSet):
    """ a data set for holding quantized data


      **Note**

        this is intended to be a read-only data structure
        (i.e. after calling the constructor you cannot touch it)

      **Big differences to MLDataSet**

        1) data are stored in a numpy array since they are homogeneous

        2) results are assumed to be quantized (i.e. no qBounds entry is required)

    """

    def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None, qBounds=None,
                 varNames=None, ptNames=None, nResults=1):
        """ Constructor

        **Arguments**

          - data: a list of lists containing the data. The data are copied, so don't worry
                about us overwriting them.

          - nVars: the number of variables

          - nPts: the number of points

          - nPossibleVals: a list containing the number of possible values
                         for each variable (should contain 0 when not relevant)
                         This is _nVars_ long

          - qBounds: a list of lists containing quantization bounds for variables
                   which are to be quantized (note, this class does not quantize
                   the variables itself, it merely stores quantization bounds.
                   an empty sublist indicates no quantization for a given variable
                   This is _nVars_ long

          - varNames: a list of the names of the variables.
                   This is _nVars_ long

          - ptNames: the names (labels) of the individual data points
                   This is _nPts_ long

          - nResults: the number of results columns in the data lists.  This is usually
                      1, but can be higher.
        """
        # numpy.array() copies its input, so the caller's data are left alone
        self.data = numpy.array(data)
        self.nResults = nResults
        if nVars is None:
            nVars = len(data[0]) - self.nResults
        self.nVars = nVars
        if nPts is None:
            nPts = len(data)
        self.nPts = nPts
        if qBounds is None:
            # one *independent* empty list per variable; [[]] * n would make every
            # entry the same list object, so mutating one would change them all
            qBounds = [[] for _ in range(self.nVars)]
        self.qBounds = qBounds
        if nPossibleVals is None:
            nPossibleVals = self._CalcNPossible(data)
        self.nPossibleVals = nPossibleVals
        if varNames is None:
            varNames = [''] * self.nVars
        self.varNames = varNames
        if ptNames is None:
            ptNames = [''] * self.nPts
        self.ptNames = ptNames

    def _CalcNPossible(self, data):
        """calculates the number of possible values of each variable

        **Arguments**

           -data: a list of examples to be used

        **Returns**

           a list of nPossible values for each variable
           (the largest value found in each column, plus one)

        """
        # transposing turns the list of examples into a list of columns
        return [max(x) + 1 for x in numpy.transpose(data)]

    def GetNamedData(self):
        """ returns a list of named examples

         **Note**

           a named example is the result of prepending the example
            name to the data list

        """
        return [[self.ptNames[i]] + self.data[i].tolist() for i in range(self.nPts)]

    def GetAllData(self):
        """ returns a *copy* of the data

        """
        return self.data.tolist()

    def GetInputData(self):
        """ returns the input data

         **Note**

           _inputData_ means the examples without their result fields
            (the last _NResults_ entries)

        """
        if not self.nResults:
            # [:, :-0] is [:, :0], i.e. no columns; with no result columns every field is input
            return self.data.tolist()
        return (self.data[:, :-self.nResults]).tolist()

    def GetResults(self):
        """ Returns the result fields from each example

          the entries are taken straight from the rows of the numpy array
          (they are not converted with _tolist()_)

        """
        if self.GetNResults() > 1:
            v = self.GetNResults()
            res = [x[-v:] for x in self.data]
        else:
            res = [x[-1] for x in self.data]
        return res
if __name__ == '__main__':
    from . import DataUtils

    def _dumpSummary(ds):
        """ prints each accessor of the data set _ds_, one labelled line per accessor """
        report = (
            ('nVars:', ds.GetNVars),
            ('nPts:', ds.GetNPts),
            ('nPoss:', ds.GetNPossibleVals),
            ('qBounds:', ds.GetQuantBounds),
            ('data:', ds.GetAllData),
            ('Input data:', ds.GetInputData),
            ('results:', ds.GetResults),
            ('nameddata:', ds.GetNamedData),
        )
        for label, getter in report:
            print(label, getter())

    # --- quantized data set: pickle it, then show its contents
    examples = [
        [0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [1, 0, 0, 0, 1],
        [2, 1, 0, 0, 1],
        [2, 2, 1, 0, 1],
    ]
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    dataset = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl', dataset)
    _dumpSummary(dataset)

    # --- general data set with quantization bounds on the last column only
    examples = [
        ['foo', 1, 1.0, 1, 1.1],
        ['foo', 2, 1.0, 1, 2.1],
        ['foo', 3, 1.2, 1.1, 3.1],
        ['foo', 4, 1.0, 1, 4.1],
        ['foo', 5, 1.1, 1, 5.1],
    ]
    qBounds = [[], [], [], [], [2, 4]]
    # the names below are assigned but not handed to the constructor
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    dataset = MLDataSet(examples, qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl', dataset)
    _dumpSummary(dataset)