1
2
3
4
5 from rdkit import DataStructs
6 from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
7
# Both the default clustering method and the hierarchical cluster picker used
# further down come from rdSimDivPickers, so refuse to load without it.
# NOTE(review): the plain `from ... import` above already raises ImportError
# when the module cannot be found; this check only fires if the name is bound
# to None -- confirm whether that can still happen.
if rdsimdiv is None:
    raise ImportError('rdSimDivPickers not built')
10
11
13 """ Class to cluster a set of bits based on their correlation
14
15 The correlation matrix is first built by reading the fingerprints
16 from a database or a list of fingerprints
17 """
18
def __init__(self, idList, nCluster, type=rdsimdiv.ClusterMethod.WARD):
    """ Set up a clusterer for a fixed collection of bit ids

    No clustering is performed here; the cluster list starts out empty.

    ARGUMENTS:
      - idList : ids of the bits to be clustered; stored as given and later
                 indexed by position to translate cluster members into bit ids
      - nCluster : number of clusters to produce
      - type : clustering method handed to the hierarchical cluster picker
               (defaults to rdsimdiv.ClusterMethod.WARD)
    """
    # clustering configuration
    self._type = type
    self._nClusters = nCluster

    # bit ids under consideration and the (initially empty) cluster assignment
    self._bidList = idList
    self._clusters = []
25
27
28 distMat = 1 / corrMat
29
30 pkr = rdsimdiv.HierarchicalClusterPicker(self._type)
31
32 cls = pkr.Cluster(distMat, len(self._bidList), self._nClusters)
33
34 self._clusters = []
35 for cl in cls:
36 self._clusters.append([self._bidList[i] for i in cl])
37
39 assert len(clusters) == self._nClusters
40 self._clusters = clusters
41
44
46 """ Map the fingerprint to a real-valued vector of scores based on the bit clusters
47
48 The dimension of the vector is the same as the number of clusters. Each value in the
49 vector corresponds to the number of bits in the corresponding cluster
50 that are turned on in the fingerprint
51
52 ARGUMENTS:
53 - fp : the fingerprint
54 """
55 scores = [0] * self._nClusters
56 for i, cls in enumerate(self._clusters):
57 for bid in cls:
58 if fp[bid]:
59 scores[i] += 1
60 return scores
61
63 """ Map the fingerprint to a smaller sized (= number of clusters) fingerprint
64
65 Each cluster gets a bit in the new fingerprint, and that bit is turned on if any of the bits in
66 the cluster are turned on in the original fingerprint"""
67
68 ebv = DataStructs.ExplicitBitVect(self._nClusters)
69 for i, cls in enumerate(self._clusters):
70 for bid in cls:
71 if fp[bid]:
72 ebv.SetBit(i)
73 break
74 return ebv
75