Showing 1 changed file with 68 additions and 0 deletions
src/textrank/utils.py
new file mode 100644
from collections import Counter

from scipy.sparse import csr_matrix


def scan_vocabulary(sents, tokenize=None, min_count=2):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency

    Returns
    -------
    idx_to_vocab : list of str
        Vocabulary list
    vocab_to_idx : dict
        Vocabulary to index mapper
    """
    if tokenize is None:
        # Fall back to whitespace tokenization when no tokenizer is given
        tokenize = lambda sent: sent.split()
    counter = Counter(w for sent in sents for w in tokenize(sent))
    counter = {w: c for w, c in counter.items() if c >= min_count}
    # Most frequent terms receive the smallest indices
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x: -x[1])]
    vocab_to_idx = {vocab: idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx


def tokenize_sents(sents, tokenize):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str (word sequence)

    Returns
    -------
    tokenized sentence list : list of list of str
    """
    return [tokenize(sent) for sent in sents]


def vectorize(tokens, vocab_to_idx):
    """
    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper

    Returns
    -------
    sentence bow : scipy.sparse.csr_matrix
        shape = (n_sents, n_terms)
    """
    rows, cols, data = [], [], []
    for i, tokens_i in enumerate(tokens):
        # Count term frequencies within each sentence
        for t, c in Counter(tokens_i).items():
            j = vocab_to_idx.get(t, -1)
            if j == -1:
                # Skip terms that were filtered out by min_count
                continue
            rows.append(i)
            cols.append(j)
            data.append(c)
    n_sents = len(tokens)
    n_terms = len(vocab_to_idx)
    x = csr_matrix((data, (rows, cols)), shape=(n_sents, n_terms))
    return x
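Below is a minimal usage sketch of the three utilities chained together. The toy sentences and the whitespace-splitting `tokenize` are illustrative assumptions, not part of this commit, and the import assumes `src/` is on the Python path:

from textrank.utils import scan_vocabulary, tokenize_sents, vectorize

# Illustrative toy corpus; a real pipeline would supply a proper tokenizer
sents = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat chased the dog",
]
tokenize = lambda sent: sent.split()  # assumed whitespace tokenizer

# Build the vocabulary from terms appearing at least min_count times
idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize=tokenize, min_count=2)
tokens = tokenize_sents(sents, tokenize)
x = vectorize(tokens, vocab_to_idx)

print(idx_to_vocab)  # e.g. ['the', 'dog', 'cat', 'sat', 'on'] (most frequent first)
print(x.shape)       # (3, 5): one bag-of-words row per sentence
print(x.toarray())   # raw term counts; terms below min_count are simply dropped

Note that vectorize keeps raw counts rather than weighted values, so downstream code sees plain per-sentence term frequencies.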
| ... | \ No newline at end of file | ... | \ No newline at end of file |