-
Notifications
You must be signed in to change notification settings - Fork 2
/
textrank2.py
executable file
·76 lines (59 loc) · 2.24 KB
/
textrank2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Cornell Data Science Fall 2018
Text summarization group: Wes Gurnee, Qian Huang, Jane Zhang
This is an implementation of the textrank algorithm, an adaptation of the
pagerank algorithm for keyword extraction and extractive summarization.
To use: call get_summary2(text, numSentences) -> returns a numSentences-sentence
summary of text
"""
import editdistance
import itertools
import networkx as nx
import nltk
def build_graph(nodes):
    """
    Build and return an undirected networkx graph over the given nodes.

    Every unordered pair of nodes is joined by an edge whose 'weight'
    attribute is their Levenshtein (edit) distance, as computed by
    editdistance.eval.
    """
    graph = nx.Graph()
    graph.add_nodes_from(nodes)
    # Fully connect the graph: one weighted edge per unordered node pair.
    for first, second in itertools.combinations(nodes, 2):
        graph.add_edge(first, second, weight=editdistance.eval(first, second))
    return graph
def extract_sentences(text, clean_sentences=False, language='english'):
    """
    Return the sentences of *text* ranked by importance, most important first.

    Sentences are split with nltk's punkt tokenizer for *language*, scored
    by running PageRank over the edit-distance sentence graph, and sorted
    in descending score order.  NOTE: *clean_sentences* is accepted for
    interface compatibility but is currently unused.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
    sentence_tokens = tokenizer.tokenize(text.strip())
    # Score each sentence node with weighted PageRank.
    scores = nx.pagerank(build_graph(sentence_tokens), weight='weight')
    # Highest-scoring (most important) sentences first.
    return sorted(scores, key=scores.get, reverse=True)
def get_summary2(text, numSentences=1):
    '''
    Print and return a summary of *text* made of its numSentences
    highest-ranked sentences (default: 1).

    For a single-sentence request — or a request for more sentences than
    the text contains — only the single top-ranked sentence is returned.
    Otherwise the chosen sentences are joined and trimmed after the last
    word containing a '.', so the summary ends on a sentence boundary.
    '''
    ranked = extract_sentences(text)
    if numSentences == 1 or numSentences > len(ranked):
        # Degenerate case: just the single most important sentence.
        summary = ranked[0]
    else:
        words = ' '.join(ranked[:numSentences]).split()
        # Positions of words that contain a period.
        period_positions = [i for i, word in enumerate(words) if '.' in word]
        if period_positions:
            # Cut everything after the final period-bearing word.
            summary = ' '.join(words[:max(period_positions) + 1])
        else:
            summary = ' '.join(words)
    print(summary)
    return summary
if __name__ == "__main__":
    print("###########Input text here:############")
    text = input()
    print("#########Generated Summary:############")
    # Bug fix: the original called get_summary(text), but no such function
    # exists in this module — the summarizer is named get_summary2, so the
    # script crashed with a NameError on every run.
    get_summary2(text)