import numpy as np

from .rank import pagerank
from .sentence import sent_graph
from .word import word_graph


class KeywordSummarizer:
"""
Arguments
---------
sents : list of str
Sentence list
tokenize : callable
Tokenize function: tokenize(str) = list of str
min_count : int
Minumum frequency of words will be used to construct sentence graph
window : int
Word cooccurrence window size. Default is -1.
'-1' means there is cooccurrence between two words if the words occur in a sentence
min_cooccurrence : int
Minimum cooccurrence frequency of two words
vocab_to_idx : dict or None
Vocabulary to index mapper
df : float
PageRank damping factor
max_iter : int
Number of PageRank iterations
verbose : Boolean
If True, it shows training progress
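
    Usage
    -----
    A minimal sketch; `tokenize` may be any callable mapping a sentence to
    a list of tokens, and the whitespace split below is only illustrative.
    `sents` is assumed to be a list of sentence strings.

    >>> from textrank import KeywordSummarizer
    >>> summarizer = KeywordSummarizer(tokenize=lambda sent: sent.split(), min_count=2)
    >>> keywords = summarizer.summarize(sents, topk=20)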
"""
def __init__(self, sents=None, tokenize=None, min_count=2,
window=-1, min_cooccurrence=2, vocab_to_idx=None,
df=0.85, max_iter=30, verbose=False):
self.tokenize = tokenize
self.min_count = min_count
self.window = window
self.min_cooccurrence = min_cooccurrence
self.vocab_to_idx = vocab_to_idx
self.df = df
self.max_iter = max_iter
self.verbose = verbose
if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
"""
Arguments
---------
sents : list of str
Sentence list
bias : None or numpy.ndarray
PageRank bias term
Returns
-------
None
"""
        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
"""
Arguments
---------
topk : int
Number of keywords selected from TextRank
Returns
-------
keywords : list of tuple
Each tuple stands for (word, rank)
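
        Usage
        -----
        A sketch assuming the model has already been trained, for example
        through the constructor or `summarize`.

        >>> summarizer.keywords(topk=10)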
"""
        if not hasattr(self, 'R'):
            raise RuntimeError('Train TextRank first, or use the summarize method')
        # argsort is ascending, so slice the topk largest ranks from the tail
        # and reverse to list keywords in decreasing rank order
        idxs = self.R.argsort()[-topk:]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30):
"""
Arguments
---------
sents : list of str
Sentence list
topk : int
Number of keywords selected from TextRank
Returns
-------
keywords : list of tuple
Each tuple stands for (word, rank)
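
        Usage
        -----
        A minimal sketch assuming `sents` is a list of raw sentence strings.

        >>> keywords = summarizer.summarize(sents, topk=20)
        >>> top_word, top_rank = keywords[0]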
"""
self.train_textrank(sents)
        return self.keywords(topk)


class KeysentenceSummarizer:
"""
Arguments
---------
sents : list of str
Sentence list
tokenize : callable
Tokenize function: tokenize(str) = list of str
min_count : int
Minumum frequency of words will be used to construct sentence graph
min_sim : float
Minimum similarity between sentences in sentence graph
similarity : str
available similarity = ['cosine', 'textrank']
vocab_to_idx : dict or None
Vocabulary to index mapper
df : float
PageRank damping factor
max_iter : int
Number of PageRank iterations
verbose : Boolean
If True, it shows training progress
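
    Usage
    -----
    A minimal sketch; the whitespace tokenizer is only illustrative and
    `sents` is assumed to be a list of sentence strings.

    >>> from textrank import KeysentenceSummarizer
    >>> summarizer = KeysentenceSummarizer(tokenize=lambda sent: sent.split(), min_sim=0.3)
    >>> keysents = summarizer.summarize(sents, topk=5)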
"""
def __init__(self, sents=None, tokenize=None, min_count=2,
min_sim=0.3, similarity=None, vocab_to_idx=None,
df=0.85, max_iter=30, verbose=False):
self.tokenize = tokenize
self.min_count = min_count
self.min_sim = min_sim
self.similarity = similarity
self.vocab_to_idx = vocab_to_idx
self.df = df
self.max_iter = max_iter
self.verbose = verbose
if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
"""
Arguments
---------
sents : list of str
Sentence list
bias : None or numpy.ndarray
PageRank bias term
Shape must be (n_sents,)
Returns
-------
None
"""
g = sent_graph(sents, self.tokenize, self.min_count,
self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
"""
Arguments
---------
sents : list of str
Sentence list
topk : int
Number of key-sentences to be selected.
bias : None or numpy.ndarray
PageRank bias term
Shape must be (n_sents,)
Returns
-------
keysents : list of tuple
Each tuple stands for (sentence index, rank, sentence)
        Usage
        -----
        >>> from textrank import KeysentenceSummarizer
        >>> summarizer = KeysentenceSummarizer(tokenize=tokenizer, min_sim=0.5)
        >>> keysents = summarizer.summarize(texts, topk=30)
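
        A sketch of biased ranking; the weight 10 is illustrative. Larger
        bias values pull PageRank mass toward the corresponding sentences.

        >>> bias = np.ones(len(texts))
        >>> bias[0] = 10  # emphasize the first sentence
        >>> keysents = summarizer.summarize(texts, topk=30, bias=bias)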
"""
n_sents = len(sents)
if isinstance(bias, np.ndarray):
if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be ({},) but got {}'.format(n_sents, bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but got {}'.format(type(bias)))
self.train_textrank(sents, bias)
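        # argsort is ascending, so slice the topk largest ranks from the tail
        # and reverse to list sentences in decreasing rank order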
idxs = self.R.argsort()[-topk:]
keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
return keysents