Showing 1 changed file with 195 additions and 0 deletions
src/textrank/sentence.py
new file mode 100644
from collections import Counter
import math
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

from .utils import scan_vocabulary
from .utils import tokenize_sents


def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
    similarity=None, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str
    min_count : int
        Minimum term frequency
    min_sim : float
        Minimum similarity between sentences
    similarity : callable or str
        similarity(s1, s2) returns float, where s1 and s2 are lists of str.
        Available options: callable, 'cosine', or 'textrank' (default)
    vocab_to_idx : dict
        Vocabulary-to-index mapper.
        If None, this function scans the vocabulary first.
    verbose : bool
        If True, verbose mode on

    Returns
    -------
    sentence similarity graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """

    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x: x[1])]

    # A user-provided similarity function is evaluated pairwise in pure Python,
    # as documented above
    if callable(similarity):
        tokens = tokenize_sents(sents, tokenize)
        return graph_with_python_sim(tokens, verbose, similarity, min_sim)

    # Otherwise vectorize once and use the batched sparse/numpy computation
    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        x = numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    else:
        x = numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return x

def vectorize_sents(sents, tokenize, vocab_to_idx):
    rows, cols, data = [], [], []
    for i, sent in enumerate(sents):
        counter = Counter(tokenize(sent))
        for token, count in counter.items():
            j = vocab_to_idx.get(token, -1)
            if j == -1:
                # skip out-of-vocabulary tokens
                continue
            rows.append(i)
            cols.append(j)
            data.append(count)
    n_rows = len(sents)
    n_cols = len(vocab_to_idx)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
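
# Example: vectorize_sents(['a b b', 'b c'], lambda s: s.split(), {'a': 0, 'b': 1, 'c': 2})
# yields a 2 x 3 sparse term-frequency matrix, dense form [[1, 2, 0], [0, 1, 1]].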

def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    n_rows = x.shape[0]
    mat = []
    # compute similarities batch-by-batch to bound memory usage
    for bidx in range(math.ceil(n_rows / batch_size)):
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))
        psim = 1 - pairwise_distances(x[b:e], x, metric='cosine')
        # keep only the pairs whose similarity passes the threshold
        rows, cols = np.where(psim >= min_sim)
        data = psim[rows, cols]
        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(b, n_rows), end='')
    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcosine sentence similarity was calculated for {} sents'.format(n_rows))
    return mat

def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    n_rows, n_cols = x.shape

    # Boolean occurrence matrix; z[i,j] = 1 iff term j occurs in sentence i
    rows, cols = x.nonzero()
    data = np.ones(rows.shape[0])
    z = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    # Log of sentence length (total token count). Sentences at or below
    # min_length get a large sentinel length so log() never produces -inf
    # or a zero denominator; such sentences rarely pass min_sim.
    size = np.asarray(x.sum(axis=1)).reshape(-1)
    size[np.where(size <= min_length)] = 10000
    size = np.log(size)

    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):

        # slicing
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))

        # dot product; counts the distinct terms shared by each sentence pair
        inner = z[b:e,:] * z.transpose()

        # TextRank normalizer: norm[i,j] = log(len(s_i)) + log(len(s_j))
        norm = size[b:e].reshape(-1, 1) + size.reshape(1, -1)
        norm = norm ** (-1)
        norm[np.where(norm == np.inf)] = 0

        # normalize and threshold
        sim = inner.multiply(norm).tocsr()
        rows, cols = (sim >= min_sim).nonzero()
        data = np.asarray(sim[rows, cols]).reshape(-1)

        # append
        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(b, n_rows), end='')

    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rtextrank sentence similarity was calculated for {} sents'.format(n_rows))

    return mat

def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    # resolve the similarity argument to a callable
    if similarity == 'cosine':
        similarity = cosine_sent_sim
    elif not callable(similarity):
        similarity = textrank_sent_sim

    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        for j, tokens_j in enumerate(tokens):
            # only the upper triangle (i < j) is stored
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    if verbose:
        print('\rsentence graph was constructed from {} sents'.format(n_sents))
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))
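
# Any callable over two token lists can be plugged in as `similarity`;
# for example, a Jaccard coefficient (a sketch, not part of this module):
#
#   def jaccard_sent_sim(s1, s2):
#       s1, s2 = set(s1), set(s2)
#       return len(s1 & s2) / max(1, len(s1 | s2))
#
#   g = sent_graph(sents, tokenize=tokenize, similarity=jaccard_sent_sim)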

def textrank_sent_sim(s1, s2):
    """
    TextRank sentence similarity:
    sim(s1, s2) = |set(s1) & set(s2)| / (log |s1| + log |s2|)

    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences

    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base
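
# Worked example: ['a', 'b', 'c', 'd'] and ['b', 'c', 'e'] share two tokens,
# so sim = 2 / (log 4 + log 3) = 2 / 2.485 ≈ 0.80.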

def cosine_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences

    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    if (not s1) or (not s2):
        return 0

    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = 0
    for k, v in s1.items():
        prod += v * s2.get(k, 0)
    return prod / (norm1 * norm2)
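
A minimal usage sketch, assuming this module is importable as `textrank.sentence` and that `scan_vocabulary` in `.utils` returns the `(idx_to_vocab, vocab_to_idx)` pair used above. The whitespace tokenizer is a stand-in; any callable mapping a sentence to a list of tokens works.

from textrank.sentence import sent_graph

sents = [
    'the quick brown fox jumps over the lazy dog',
    'the lazy dog sleeps while the quick fox runs',
    'sparse matrices keep the sentence graph small',
]
tokenize = lambda sent: sent.split()

# default TextRank similarity; low thresholds suit this tiny corpus
g = sent_graph(sents, tokenize=tokenize, min_count=1, min_sim=0.1)
print(g.shape)  # (3, 3) sparse sentence-to-sentence adjacency
print(g.nnz)    # number of pairs kept above min_sim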