-
Notifications
You must be signed in to change notification settings - Fork 0
/
link_prediction.py
executable file
·265 lines (211 loc) · 9.42 KB
/
link_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import os
import pickle
import networkx as nx
import numpy as np
from sklearn import metrics, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
default_params = {
'edge_function': "hadamard", # Default edge function to use
"prop_pos": 0.5, # Proportion of edges to remove nad use as positive samples
"prop_neg": 0.5, # Number of non-edges to use as negative samples
# (as a proportion of existing edges, same as prop_pos)
}
edge_functions = {
"hadamard": lambda a, b: a * b,
"average": lambda a, b: 0.5 * (a + b),
"l1": lambda a, b: np.abs(a - b),
"l2": lambda a, b: np.abs(a - b) ** 2,
}
class Graph():
def __init__(self,
nx_G=None, is_directed=False,
prop_pos=0.5, prop_neg=0.5,
workers=1,
random_seed=None):
self.G = nx_G
self.is_directed = is_directed
self.prop_pos = prop_neg
self.prop_neg = prop_pos
self.wvecs = None
self.workers = workers
self._rnd = np.random.RandomState(seed=random_seed)
def read_graph(self, input, enforce_connectivity=True, weighted=False, directed=False):
'''
Reads the input network in networkx.
'''
# if weighted:
# G = nx.read_edgelist(input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph())
# else:
G = nx.read_edgelist(input, nodetype=int, create_using=nx.DiGraph())
for edge in G.edges():
G.adj[edge[0]][edge[1]]['weight'] = 1
print("Read graph, nodes: %d, edges: %d" % (G.number_of_nodes(), G.number_of_edges()))
G1 = G.to_undirected()
self.G = G1
def generate_pos_neg_links(self):
"""
Select random existing edges in the graph to be postive links,
and random non-edges to be negative links.
Modify graph by removing the postive links.
"""
# Select n edges at random (positive samples)
n_edges = self.G.number_of_edges()
n_nodes = self.G.number_of_nodes()
npos = int(self.prop_pos * n_edges)
nneg = int(self.prop_neg * n_edges)
if not nx.is_connected(self.G):
raise RuntimeError("Input graph is not connected")
n_neighbors = [len(list(self.G.neighbors(v))) for v in self.G.nodes()] ##
n_non_edges = n_nodes - 1 - np.array(n_neighbors)
non_edges = [e for e in nx.non_edges(self.G)]
print("Finding %d of %d non-edges" % (nneg, len(non_edges)))
# Select m pairs of non-edges (negative samples)
rnd_inx = self._rnd.choice(len(non_edges), nneg, replace=False)
neg_edge_list = [non_edges[ii] for ii in rnd_inx]
if len(neg_edge_list) < nneg:
raise RuntimeWarning(
"Only %d negative edges found" % (len(neg_edge_list))
)
print("Finding %d positive edges of %d total edges" % (npos, n_edges))
# Find positive edges, and remove them.
edges = self.G.edges()
edges = list(edges)
pos_edge_list = []
n_count = 0
n_ignored_count = 0
rnd_inx = self._rnd.permutation(n_edges)
for eii in rnd_inx.tolist():
edge = edges[eii]
# Remove edge from graph
data = self.G[edge[0]][edge[1]]
self.G.remove_edge(*edge)
# Check if graph is still connected
# TODO: We shouldn't be using a private function for bfs
reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0])
if edge[1] not in reachable_from_v1:
self.G.add_edge(*edge, **data)
n_ignored_count += 1
else:
pos_edge_list.append(edge)
print("Found: %d " % (n_count), end="\r")
n_count += 1
# Exit if we've found npos nodes or we have gone through the whole list
if n_count >= npos:
print("")
break
if len(pos_edge_list) < npos:
raise RuntimeWarning("Only %d positive edges found." % (n_count))
self._pos_edge_list = pos_edge_list
self._neg_edge_list = neg_edge_list
def get_selected_edges(self):
edges = self._pos_edge_list + self._neg_edge_list
labels = np.zeros(len(edges))
labels[:len(self._pos_edge_list)] = 1
return edges, labels
def edges_to_features(self, edge_list, edge_function, emb_size, model):
"""
Given a list of edge lists and a list of labels, create
an edge feature array using binary_edge_function and
create a label array matching the label in the list to all
edges in the corresponding edge list
:param edge_function:
Function of two arguments taking the node features and returning
an edge feature of given dimension
:param dimension:
Size of returned edge feature vector, if None defaults to
node feature size.
:param k:
Partition number. If None use all positive & negative edges
:return:
feature_vec (n, dimensions), label_vec (n)
"""
n_tot = len(edge_list)
feature_vec = np.empty((n_tot, emb_size), dtype='f')
for ii in range(n_tot):
v1, v2 = edge_list[ii]
# Edge-node features
emb1 = np.asarray(model[str(v1)])
emb2 = np.asarray(model[str(v2)])
# Calculate edge feature
feature_vec[ii] = edge_function(emb1, emb2)
return feature_vec
def create_train_test_graphs(input='Facebook.edges', regen='regen', workers=8):
"""
Create and cache train & test graphs.
Will load from cache if exists unless --regen option is given.
# :param args:
# :return:
Gtrain, Gtest: Train & test graphs
"""
# Remove half the edges, and the same number of "negative" edges
prop_pos = default_params['prop_pos']
prop_neg = default_params['prop_neg']
# Create random training and test graphs with different random edge selections
cached_fn = "%s.graph" % (os.path.basename(input))
if os.path.exists(cached_fn) and not regen:
print("Loading link prediction graphs from %s" % cached_fn)
with open(cached_fn, 'rb') as f:
cache_data = pickle.load(f)
Gtrain = cache_data['g_train']
Gtest = cache_data['g_test']
else:
print("Regenerating link prediction graphs")
# Train graph embeddings on graph with random links
Gtrain = Graph(is_directed=False,
prop_pos=prop_pos,
prop_neg=prop_neg,
workers=workers)
Gtrain.read_graph(input)
Gtrain.generate_pos_neg_links()
# Generate a different random graph for testing
Gtest = Graph(is_directed=False,
prop_pos=prop_pos,
prop_neg=prop_neg,
workers=workers)
Gtest.read_graph(input)
Gtest.generate_pos_neg_links()
# Cache generated graph
#cache_data = {'g_train': Gtrain, 'g_test': Gtest}
#with open(cached_fn, 'wb') as f:
# pickle.dump(cache_data, f)
return Gtrain, Gtest
def test_edge_functions(Gtrain, Gtest, workers=8, num_experiments=1, emb_size=128, model=None,
edges_train=None, edges_test=None, labels_train=None, labels_test=None,lin_clf=LogisticRegression(C=1)):
# With fixed test & train graphs (these are expensive to generate)
# we perform k iterations of the algorithm
# TODO: It would be nice if the walks had a settable random seed
aucs = {func: [] for func in edge_functions}
for iter in range(num_experiments):
for edge_fn_name, edge_fn in edge_functions.items():
# Calculate edge embeddings using binary function
edge_features_train = Gtrain.edges_to_features(edges_train, edge_fn, emb_size, model)
edge_features_test = Gtest.edges_to_features(edges_test, edge_fn, emb_size, model)
# Linear classifier
scaler = StandardScaler()
clf = pipeline.make_pipeline(scaler, lin_clf)
# Train classifier
clf.fit(edge_features_train, labels_train)
auc_train = metrics.scorer.roc_auc_scorer(clf, edge_features_train, labels_train)
# Test classifier
auc_test = metrics.scorer.roc_auc_scorer(clf, edge_features_test, labels_test)
aucs[edge_fn_name].append(auc_test)
return aucs
if __name__ == "__main__": ### 'protein', 'karate', 'Facebook' 'blog',
from dataset import read_df
input = 'facebookRF.txt'
Gtrain, Gtest = create_train_test_graphs(input, regen='regen', workers=8)
# Train and test graphs, with different edges
edges_train, labels_train = Gtrain.get_selected_edges()
edges_test, labels_test = Gtest.get_selected_edges()
ds = read_df('HALKLINK2')
ds['node'] = ds['node'].map(lambda x: str(x))
ds = zip(ds['node'], ds['embedding'])
ds = dict(ds)
functions = {"hadamard", "average", "l1", "l2"}
auc = test_edge_functions(input, 'regen', workers=8, num_experiments=1, emb_size=128,
model=ds, edges_train=edges_train, edges_test=edges_test)
with open('HALKLINK2result.txt', 'w+') as f:
for i in functions:
f.write(str(i) + ' AUC: ' + str('%.3f' % np.mean(auc[i])))
f.write('\n')