BALL  1.4.79
QSARData.h
Go to the documentation of this file.
1 /* QSARData.h
2  *
3  * Copyright (C) 2009 Marcel Schumann
4  *
5  * This file is part of QuEasy -- A Toolbox for Automated QSAR Model
6  * Construction and Validation.
7  * QuEasy is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3 of the License, or (at
10  * your option) any later version.
11  *
12  * QuEasy is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 // -*- Mode: C++; tab-width: 2; -*-
22 // vi: set ts=2:
23 //
24 //
25 
26 #ifndef QSARH
27 #define QSARH
28 
29 #include <iostream>
30 #include <BALL/KERNEL/system.h>
31 #include <BALL/FORMAT/SDFile.h>
32 #include <BALL/FORMAT/PDBFile.h>
33 #include <BALL/FORMAT/HINFile.h>
34 #include <BALL/FORMAT/MOLFile.h>
35 #include <vector>
36 #include <list>
37 #include <set>
38 #include <map>
39 #include <math.h>
40 #include <sstream>
41 #include <fstream>
42 #include <limits>
43 #include <fstream>
48 #include <BALL/COMMON/exception.h>
49 #include <cstring>
50 
51 #ifndef STATISTICS
52 #include <BALL/QSAR/statistics.h>
53 #endif
54 
55 #ifndef QSAR_EXCEPTION
56 #include <BALL/QSAR/exception.h>
57 #endif
58 
59 #include <BALL/CONCEPT/timeStamp.h>
60 
61 // #ifndef MODEL
62 // #include "Model.h"
63 // #endif
64 
65 namespace BALL
66 {
67  class MolecularSimilarity;
68 
69  namespace QSAR
70  {
/** Column is a vector of double; VMatrix is a vector of Columns.
 *  NOTE(review): `vector` is used unqualified and no using-declaration for
 *  it is visible in this listing; presumably one of the included BALL
 *  headers brings it into scope -- confirm. */
71  typedef vector<double> Column;
72  typedef vector<Column> VMatrix;
73 
76  {
77  public:
78 
/** Default constructor and destructor.
 *  NOTE(review): the class-head line is not present in this listing; the
 *  constructor/destructor names show that these members belong to QSARData. */
79  QSARData();
80 
81  ~QSARData();
82 
/** Const query returning bool. Presumably reports whether the descriptor
 *  data has been centered -- confirm against the implementation. */
86  bool isDataCentered() const;
87 
/** Const query returning bool. Presumably the response-variable counterpart
 *  of isDataCentered() -- confirm against the implementation. */
89  bool isResponseCentered() const;
90 
/** Takes the SD file name by value and returns a raw pointer to a
 *  vector<String>.
 *  NOTE(review): ownership of the returned vector is not visible from this
 *  declaration -- confirm whether the caller is expected to delete it. */
95  vector<String>* readPropertyNames(String sd_file);
96 
/** Overload that takes only the file path. */
100  void readSDFile(const char* file);
101 
/** Overload that additionally takes a multiset<int> named act by non-const
 *  reference. The bool defaults are spelled as integer literals:
 *  useExDesc=1 (true), append=0 (false), translate_class_labels=0 (false). */
107  void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
108 
/** Overload that takes a set<String> named activity_names instead of the
 *  multiset<int>. Compared with the overload above it adds two flags,
 *  calc_phychem_properties and calc_topological_properties, both defaulting
 *  to 1 (true). */
109  void readSDFile(const char* file, std::set<String>& activity_names, bool useExDesc=1, bool append=0, bool translate_class_labels=0, bool calc_phychem_properties=1, bool calc_topological_properties=1);
110 
/** Non-const member with no parameters and no return value.
 *  NOTE(review): the output destination is not visible here -- confirm. */
112  void displayMatrix();
113 
/** @param center_Y defaults to 0 (false); presumably selects whether the
 *         response values are centered as well -- confirm. */
116  void centerData(bool center_Y=0);
117 
/** Non-const member with no parameters and no return value. */
119  void scaleAllDescriptors();
120 
/** Const accessor returning an unsigned count (substances, per the name). */
122  unsigned int getNoSubstances() const;
123 
/** Const accessor returning an unsigned count (descriptors, per the name). */
125  unsigned int getNoDescriptors() const;
126 
/** @param file   path of the CSV file
 *  @param no_y   presumably the number of response columns -- confirm
 *  @param xlabels, ylabels  bool flags without defaults; presumably indicate
 *         whether label rows/columns are present -- confirm
 *  @param sep    separator string, defaults to ","
 *  @param appendDescriptors       defaults to 0 (false)
 *  @param translate_class_labels  defaults to 0 (false) */
134  void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
135 
/** Overload taking a vector<String> by value (the vector is copied on call). */
137  void manipulateY(std::vector<String> v);
138 
/** Overload taking a single String by value. */
141  void manipulateY(String v);
142 
/** Takes the thresholds vector by value (copied on call). */
145  void discretizeY(std::vector<double> thresholds);
146 
/** Takes a vector<String> by value, mirroring manipulateY(std::vector<String>). */
147  void transformX(std::vector<String> v);
148 
/** Returns a vector of raw QSARData pointers; non-const member.
 *  NOTE(review): ownership of the returned objects is not visible from this
 *  declaration -- confirm who deletes them. */
150  std::vector<QSARData*> partitionInputData(int p);
151 
/** Const member; takes the file name by value. */
153  void saveToFile(string filename) const;
154 
/** Non-const counterpart of saveToFile(); takes the file name by value. */
156  void readFromFile(string filename);
157 
/** Const member returning a vector of raw QSARData pointers.
 *  NOTE(review): ownership of the returned objects and the valid range of
 *  fraction are not visible here -- confirm. */
160  std::vector<QSARData*> generateExternalSet(double fraction) const;
161 
/** Const member returning a vector of raw QSARData pointers.
 *  @param response_id defaults to 0
 *  NOTE(review): ownership of the returned objects is not visible here --
 *  confirm. */
166  std::vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
167 
/** Const member returning a raw pointer to a non-const vector<double>.
 *  NOTE(review): whether the result is newly allocated (caller-owned) or
 *  points into internal storage is not visible here -- confirm. */
169  std::vector<double>* getSubstance(int s) const;
170 
/** Same signature shape as getSubstance(); the same ownership question
 *  applies -- confirm. */
172  std::vector<double>* getActivity(int s) const;
173 
/** Const accessor returning an unsigned count. */
175  unsigned int getNoResponseVariables() const;
176 
/** Const accessor returning a pointer to a const vector<string>. */
177  const std::vector<string>* getSubstanceNames() const;
178 
/** Const query returning bool; overload without arguments. */
180  bool checkforDiscreteY() const;
181 
182 
/** Const overload taking a file path and a multiset<int> by non-const
 *  reference. */
184  bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
185 
/** Non-const setter taking a C string. */
187  void setDataFolder(const char* folder);
188 
/** Both thresholds are taken by non-const reference.
 *  NOTE(review): whether they are modified by the call is not visible
 *  here -- confirm. */
191  void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
192 
/** Const member; similar_descriptor_IDs is a non-const reference to a list
 *  of (uint, String) pairs, presumably filled as the result -- confirm.
 *  NOTE(review): `uint` is not a standard C++ type; presumably a typedef
 *  supplied by a platform or BALL header -- confirm. */
198  void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
200 
201 
202  protected:
203 
207 
/** Protected helper; takes the Molecule by non-const reference. */
211  void calculateBALLDescriptors(Molecule& m);
212 
/** Protected helper; the molecule and the MolecularSimilarity object are
 *  non-const references, descriptor_map is read-only (const reference). */
214  void calculateTopologicalDescriptors(Molecule& mol, MolecularSimilarity& molsim, const std::map<String,int>& descriptor_map);
215 
/** Protected helper. useExDesc and resize both default to 1 (true). */
217  void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1, bool resize=1);
218 
/** Protected helper; takes the multiset by non-const reference. Note that
 *  the parameter name differs only by the trailing underscore from the
 *  member invalidDescriptors_. */
221  void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
222 
/** Protected helper; takes the multiset by non-const reference. */
223  void removeInvalidSubstances(std::multiset<int>& inv);
224 
/** Protected helper taking the target matrix and the input stream by
 *  non-const reference. The parameter name is spelled "seperator" in the
 *  declaration. */
226  void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
227 
/** Protected helper; act is a non-const reference. */
230  void checkActivityIDs(std::multiset<int>& act, int no_properties);
231 
/** Protected helper; source is a pointer to const QSARData.
 *  @param backtransformation defaults to 0 (false) */
234  void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
235 
/** Protected const helper; writes to the given std::ostream. */
237  void printMatrix(const VMatrix& mat, std::ostream& out) const;
239 
245 
/** Data members.
 *  NOTE(review): the cross-reference index at the end of this listing names
 *  three further VMatrix members -- descriptor_matrix_ (listing line 244),
 *  descriptor_transformations_ (250) and y_transformations_ (253) -- whose
 *  declaration lines are absent from this listing. */
247  VMatrix Y_;
248 
251 
254 
/** vector<string>; presumably one name per descriptor column -- confirm. */
256  vector<string> column_names_;
257 
/** vector<string>; getSubstanceNames() returns a pointer of this type. */
259  vector<string> substance_names_;
260 
/** multiset<int>; same type as the argument of removeInvalidDescriptors(). */
262  std::multiset<int> invalidDescriptors_;
263 
/** multiset<int>; same type as the argument of removeInvalidSubstances(). */
264  std::multiset<int> invalidSubstances_;
265 
267 
/** Maps String to int; presumably class label to numeric ID, in connection
 *  with the translate_class_labels flags -- confirm. */
269  std::map<String,int> class_names_;
271 
272 
273 
/** Friend classes: each gets access to the protected section declared
 *  above. None of these classes is declared in this listing. */
275  friend class RegressionValidation;
276  friend class Validation;
277  friend class Model;
278  friend class FitModel;
279  friend class FeatureSelection;
280 
281  };
282 
283  }
284 }
285 
286 #endif // QSARH
std::map< String, int > class_names_
Definition: QSARData.h:269
vector< string > substance_names_
Definition: QSARData.h:259
vector< string > column_names_
Definition: QSARData.h:256
VMatrix y_transformations_
Definition: QSARData.h:253
vector< Column > VMatrix
Definition: QSARData.h:72
vector< double > Column
Definition: QSARData.h:71
VMatrix descriptor_transformations_
Definition: QSARData.h:250
std::multiset< int > invalidDescriptors_
Definition: QSARData.h:262
VMatrix descriptor_matrix_
Definition: QSARData.h:244
-*- Mode: C++; tab-width: 2; -*-
Definition: constants.h:12
#define BALL_EXPORT
Definition: COMMON/global.h:50
std::multiset< int > invalidSubstances_
Definition: QSARData.h:264