forked from njohner/ost_pymodules
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.py
115 lines (108 loc) · 4.17 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#------------------------------------------------------------------------------------------------
#This file is part of the ost_pymodules project (https://github.com/njohner/ost_pymodules).
#
#Copyright 2015 Niklaus Johner
#
#ost_pymodules is free software: you can redistribute it and/or modify
#it under the terms of the GNU Lesser General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#ost_pymodules is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU Lesser General Public License for more details.
#
#You should have received a copy of the GNU Lesser General Public License
#along with ost_pymodules. If not, see <http://www.gnu.org/licenses/>.
#------------------------------------------------------------------------------------------------
"""
.. codeauthor: Niklaus Johner <[email protected]>
This module contains functions for performing distance-based clustering.
It can be used to perform hierarchical clustering or clustering using the
Hoshen-Kopelman algorithm.
"""
from ost import *
import time
import numpy as npy
import os,math
def HierarchicalClusteringOnPairwiseDistance(view,dist_cutoff,prop_name='cluster',method='single'):
"""
This function uses the scipy hierarchical clustering to perform clustering of atoms based on their
pairwise distance. The same methods as in the scipy.cluster.hierarchy.linkage toolbox are available
('single', 'average', 'full').
"""
import numpy as npy
import scipy
import scipy.cluster
dist_mat=mol.alg.PariwiseDistanceMatrix(view,view)
dm=[]
for i,el in enumerate(dist_mat):
for j,el2 in enumerate(el):
if i>=j:continue
dm.append(el2)
dm=npy.array(dm)
print 'starting clustering'
z=scipy.cluster.hierarchy.linkage(dm,method=method)
m=scipy.cluster.hierarchy.fcluster(z,dist_cutoff,criterion='distance')
c_list=[[] for i in range(max(m))]
for i,el in enumerate(m):c_list[el-1].append(i)
for a,c in zip(v.atoms,m):
a.SetIntProp(prop_name,int(c))
return (z,c_list)
def HoshenKopelman(neighbor_list):
"""
This function performs a clustering using the Hoshen-Kopelman algorithm.
It takes as input a list of neighbors, specifically for node i,
neighbor_list[i] is a list of all the neighbors of node i (a list of integers). So if
neighbor_list[i]=[2,4,7] means that node i has nodes 2, 4, and 7 as neighbors.
It returns a list of integers representing the cluster number for each node
"""
node_list=[-1 for el in neighbor_list]
node_list_labels=[]
clusters=0
for i,nl in enumerate(neighbor_list):
#print i,node_list_labels,nl,node_list
nl2=[node_list_labels[node_list[el]] for el in nl if node_list[el]!=-1]
if len(nl2)==0:
node_list_labels.append(clusters)
node_list[i]=clusters
clusters+=1
continue
nlp=min([node_list_labels[el] for el in nl2])
node_list[i]=nlp
#print 'nlp',nlp,node_list_labels
for j in nl2:node_list_labels[j]=nlp
#Correct the labels
for i in range(len(node_list_labels)):
N=i
while (node_list_labels[N]<N):
N=node_list_labels[N]
node_list_labels[i]=N
#Renumber labels
ld={}
clusters=1
for el in node_list_labels:
if not el in ld:
ld[el]=clusters
clusters+=1
for i in range(len(node_list_labels)):
node_list_labels[i]=ld[node_list_labels[i]]
#Apply labels to array
for i,el in enumerate(node_list):
node_list[i]=node_list_labels[el]
return node_list
def ClusterOnPairwiseDistance(view,dist_cutoff=3,prop_name='cluster'):
"""
This function clusters the atoms in the view based on the pairwise distances.
A pair of atom with a distance lower than dist_cutoff will be part of the same cluster.
The clustering scales linearly with the number of atoms, unlike hierarchical clustering.
"""
an_l=[view.FindWithin(a.pos,dist_cutoff) for a in view.atoms]
d={}
for i,a in enumerate(view.atoms):d[a]=i
an_l=[[d[a] for a in el] for el in an_l]
cl=HoshenKopelman(an_l)
for i in range(len(cl)):
view.atoms[i].SetIntProp(prop_name,cl[i])
return max(cl)