Showing
1 changed file
with
101 additions
and
0 deletions
src/textrank/word.py
0 → 100644
| 1 | +from collections import defaultdict | ||
| 2 | +from scipy.sparse import csr_matrix | ||
| 3 | + | ||
| 4 | +from .utils import scan_vocabulary | ||
| 5 | +from .utils import tokenize_sents | ||
| 6 | + | ||
| 7 | + | ||
def word_graph(sents, tokenize=None, min_count=2, window=2,
        min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Build a word co-occurrence graph from raw sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum co-occurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from `sents` first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding row and column
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        # Recover the index-ordered word list from the provided mapper
        sorted_pairs = sorted(vocab_to_idx.items(), key=lambda pair: pair[1])
        idx_to_vocab = [word for word, _ in sorted_pairs]

    sent_tokens = tokenize_sents(sents, tokenize)
    graph = cooccurrence(sent_tokens, vocab_to_idx, window, min_cooccurrence, verbose)
    return graph, idx_to_vocab
| 42 | + | ||
def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Count word co-occurrences within a sliding window over each sentence.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    window : int
        Co-occurrence window size. Two words co-occur when their positions
        in a sentence differ by at most `window`. If `window <= 0`, every
        pair of words in the same sentence co-occurs.
    min_cooccurrence : int
        Minimum cooccurrence frequency; rarer pairs are dropped.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs), symmetric
    """
    counter = defaultdict(int)
    for s, tokens_i in enumerate(tokens):
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        # Keep only in-vocabulary words, mapped to their row/column indices
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                # +1 keeps the window symmetric: the previous bound
                # (i + window) looked only `window - 1` positions ahead
                # while `b` looked `window` positions behind, so pairs
                # exactly `window` apart were undercounted.
                e = min(i + window + 1, n)
            for j in range(b, e):
                if i == j:
                    continue
                # Increment both directions so the matrix stays symmetric
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k: v for k, v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        # len(tokens) instead of s+1: `s` is unbound when `tokens` is empty
        print('\rword cooccurrence counting from {} sents was done'.format(len(tokens)))
    return dict_to_mat(counter, n_vocabs, n_vocabs)
| 84 | + | ||
def dict_to_mat(d, n_rows, n_cols):
    """
    Convert a {(row, col): value} dict into a sparse CSR matrix.

    Arguments
    ---------
    d : dict
        key : (i,j) tuple
        value : float value
    n_rows : int
        Number of rows of the result
    n_cols : int
        Number of columns of the result

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_rows, n_cols)
    """
    entries = [(i, j, v) for (i, j), v in d.items()]
    if entries:
        rows, cols, data = map(list, zip(*entries))
    else:
        rows, cols, data = [], [], []
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment