Showing
1 changed file
with
101 additions
and
0 deletions
src/textrank/word.py
0 → 100644
| 1 | +from collections import defaultdict | ||
| 2 | +from scipy.sparse import csr_matrix | ||
| 3 | + | ||
| 4 | +from .utils import scan_vocabulary | ||
| 5 | +from .utils import tokenize_sents | ||
| 6 | + | ||
| 7 | + | ||
def word_graph(sents, tokenize=None, min_count=2, window=2,
        min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Build a word co-occurrence graph from raw sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum co-occurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from `sents` first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding row and column
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        # Recover the index-ordered word list from the provided mapper
        sorted_pairs = sorted(vocab_to_idx.items(), key=lambda pair: pair[1])
        idx_to_vocab = [word for word, _ in sorted_pairs]

    sent_tokens = tokenize_sents(sents, tokenize)
    graph = cooccurrence(sent_tokens, vocab_to_idx, window, min_cooccurrence, verbose)
    return graph, idx_to_vocab
| 42 | + | ||
def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Count word co-occurrences within a sliding window over each sentence.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    window : int
        Co-occurrence window size. Two words co-occur when their positions
        in a sentence differ by at most `window`. If `window <= 0`, every
        pair of words in the same sentence co-occurs.
    min_cooccurrence : int
        Minimum cooccurrence frequency; rarer pairs are dropped.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs), symmetric
    """
    counter = defaultdict(int)
    for s, tokens_i in enumerate(tokens):
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        # Keep only in-vocabulary words, mapped to their row/column indices
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                # +1 keeps the window symmetric: the previous bound
                # (i + window) looked only `window - 1` positions ahead
                # while `b` looked `window` positions behind, so pairs
                # exactly `window` apart were undercounted.
                e = min(i + window + 1, n)
            for j in range(b, e):
                if i == j:
                    continue
                # Increment both directions so the matrix stays symmetric
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k: v for k, v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        # len(tokens) instead of s+1: `s` is unbound when `tokens` is empty
        print('\rword cooccurrence counting from {} sents was done'.format(len(tokens)))
    return dict_to_mat(counter, n_vocabs, n_vocabs)
| 84 | + | ||
def dict_to_mat(d, n_rows, n_cols):
    """
    Convert a {(row, col): value} dict into a sparse CSR matrix.

    Arguments
    ---------
    d : dict
        key : (i,j) tuple
        value : float value
    n_rows : int
        Number of rows of the result
    n_cols : int
        Number of columns of the result

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_rows, n_cols)
    """
    entries = [(i, j, v) for (i, j), v in d.items()]
    if entries:
        rows, cols, data = map(list, zip(*entries))
    else:
        rows, cols, data = [], [], []
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment