Showing 1 changed file with 195 additions and 0 deletions
src/textrank/sentence.py
new file mode 100644
from collections import Counter
import math

import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

from .utils import scan_vocabulary
from .utils import tokenize_sents

def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
               similarity=None, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str
    min_count : int
        Minimum term frequency
    min_sim : float
        Minimum similarity between sentences
    similarity : callable or str
        similarity(s1, s2) returns float, where s1 and s2 are lists of str.
        Available similarity : callable, 'cosine', or 'textrank'
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scans the vocabulary first.
    verbose : bool
        If True, verbose mode on

    Returns
    -------
    sentence similarity graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """

    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x: x[1])]

    # A user-defined similarity function falls back to the pure-Python pairwise path.
    if callable(similarity):
        tokens = tokenize_sents(sents, tokenize)
        return graph_with_python_sim(tokens, verbose, similarity, min_sim)

    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        x = numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    else:
        x = numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return x
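
# Usage sketch (the whitespace tokenizer is a stand-in; any callable that
# maps a sentence to a list of tokens works):
#
#     tokenize = lambda sent: sent.split()
#     g = sent_graph(sents, tokenize, min_count=2, min_sim=0.3, similarity='textrank')
#     # g : scipy.sparse.csr_matrix of shape (n sents, n sents)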

def vectorize_sents(sents, tokenize, vocab_to_idx):
    """Encode sentences as a sparse term-frequency matrix of shape (n sents, n vocabs)"""
    rows, cols, data = [], [], []
    for i, sent in enumerate(sents):
        counter = Counter(tokenize(sent))
        for token, count in counter.items():
            j = vocab_to_idx.get(token, -1)
            if j == -1:
                # skip tokens that were filtered out of the vocabulary
                continue
            rows.append(i)
            cols.append(j)
            data.append(count)
    n_rows = len(sents)
    n_cols = len(vocab_to_idx)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    """Pairwise cosine similarity, computed in row batches to bound memory usage"""
    n_rows = x.shape[0]
    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))
        psim = 1 - pairwise_distances(x[b:e], x, metric='cosine')
        # keep only the pairs whose similarity reaches the threshold
        rows, cols = np.where(psim >= min_sim)
        data = psim[rows, cols]
        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(b, n_rows), end='')
    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcosine sentence similarity was calculated from {} sents'.format(n_rows))
    return mat

def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    """TextRank sentence similarity, computed in row batches with sparse matrix products"""
    n_rows, n_cols = x.shape

    # Boolean occurrence matrix; z[i,j] = 1 if term j appears in sentence i
    rows, cols = x.nonzero()
    data = np.ones(rows.shape[0])
    z = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    # Log of sentence length. Sentences that are too short get a large
    # sentinel length (10000) so their similarity is pushed toward zero.
    size = np.asarray(x.sum(axis=1)).reshape(-1)
    size[np.where(size <= min_length)] = 10000
    size = np.log(size)

    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):

        # slicing
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))

        # dot product; inner[i,j] = number of terms shared by sentences i and j
        inner = z[b:e, :] * z.transpose()

        # norm[i,j] = 1 / (log |sent i| + log |sent j|)
        norm = size[b:e].reshape(-1, 1) + size.reshape(1, -1)
        norm = norm ** (-1)
        norm[np.where(norm == np.inf)] = 0

        # normalize and keep only pairs above the threshold
        sim = inner.multiply(norm).tocsr()
        rows, cols = (sim >= min_sim).nonzero()
        data = np.asarray(sim[rows, cols]).reshape(-1)

        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(b, n_rows), end='')

    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rtextrank sentence similarity was calculated from {} sents'.format(n_rows))

    return mat
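
# Note: up to vocabulary filtering, the batched computation above reproduces
# `textrank_sent_sim` below: `inner[i,j]` counts the vocabulary terms shared
# by sentences i and j, and `norm[i,j] = 1 / (log |s_i| + log |s_j|)`, so
# `sim[i,j] = |common terms| / (log |s_i| + log |s_j|)`.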

def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    if similarity == 'cosine':
        similarity = cosine_sent_sim
    elif not callable(similarity):
        similarity = textrank_sent_sim

    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        for j, tokens_j in enumerate(tokens):
            # compute each unordered pair once (i < j)
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    if verbose:
        print('\rsentence graph was constructed from {} sents'.format(n_sents))
    # only the upper triangle was filled; symmetrize to match the batched paths
    mat = csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))
    return mat + mat.transpose()

def textrank_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences

    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base
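
# Worked example of the measure above: for s1 = ['a', 'b', 'c', 'd'] and
# s2 = ['b', 'c', 'e', 'f', 'g'], the sentences share 2 tokens, so
# sim = 2 / (log 4 + log 5) = 2 / (1.386 + 1.609) ≈ 0.668.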

def cosine_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences

    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    if (not s1) or (not s2):
        return 0

    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = 0
    for k, v in s1.items():
        prod += v * s2.get(k, 0)
    return prod / (norm1 * norm2)
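
# Minimal smoke test (a sketch, not part of the library API). It assumes
# `scan_vocabulary` behaves as used in `sent_graph` above; the whitespace
# tokenizer is a stand-in. Because of the relative imports, run it as a
# module, e.g. `python -m textrank.sentence` with `src` on PYTHONPATH.
if __name__ == '__main__':
    example_sents = [
        'the cat sat on the mat',
        'the cat lay on the mat',
        'dogs chase cats in the yard',
    ]
    whitespace_tokenize = lambda sent: sent.split()
    g = sent_graph(example_sents, whitespace_tokenize,
                   min_count=1, min_sim=0.1, similarity='textrank')
    print(g.shape)    # expected: (3, 3)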