GyuhoLee

[Add] Functions for string processing

from collections import Counter
import math
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

from .utils import scan_vocabulary
from .utils import tokenize_sents

def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
               similarity=None, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str
    min_count : int
        Minimum term frequency
    min_sim : float
        Minimum similarity between sentences
    similarity : callable or str
        similarity(s1, s2) returns float, where s1 and s2 are lists of str.
        Available similarity: [callable, 'cosine', 'textrank']
    vocab_to_idx : dict
        Vocabulary-to-index mapper.
        If None, this function scans the vocabulary first.
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    sentence similarity graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x: x[1])]

    # A user-provided similarity function is evaluated pairwise in pure Python.
    if callable(similarity):
        tokens = tokenize_sents(sents, tokenize)
        return graph_with_python_sim(tokens, verbose, similarity, min_sim)

    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        x = numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    else:
        x = numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return x

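# A minimal usage sketch (the tokenizer and sentences below are hypothetical,
# not part of this module):
#
#   >>> sents = ['the first sentence', 'the second sentence']
#   >>> g = sent_graph(sents, tokenize=lambda s: s.split(), min_count=1,
#   ...                similarity='textrank', verbose=False)
#   >>> g.shape   # (n sents, n sents) sparse csr_matrix
#   (2, 2)
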
def vectorize_sents(sents, tokenize, vocab_to_idx):
    rows, cols, data = [], [], []
    for i, sent in enumerate(sents):
        counter = Counter(tokenize(sent))
        for token, count in counter.items():
            j = vocab_to_idx.get(token, -1)
            if j == -1:
                # skip out-of-vocabulary tokens
                continue
            rows.append(i)
            cols.append(j)
            data.append(count)
    n_rows = len(sents)
    n_cols = len(vocab_to_idx)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

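# Sketch of the expected behavior (toy vocabulary for illustration):
#
#   >>> vocab_to_idx = {'a': 0, 'b': 1}
#   >>> x = vectorize_sents(['a a b', 'b c'], lambda s: s.split(), vocab_to_idx)
#   >>> x.toarray()   # 'c' is out-of-vocabulary, so it is dropped
#   array([[2, 1],
#          [0, 1]])
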
def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    n_rows = x.shape[0]
    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):
        # slice one batch of rows and compare it against all sentences
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))
        psim = 1 - pairwise_distances(x[b:e], x, metric='cosine')
        # keep only the pairs whose similarity passes the threshold
        rows, cols = np.where(psim >= min_sim)
        data = psim[rows, cols]
        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(b, n_rows), end='')
    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating cosine sentence similarity was done with {} sents'.format(n_rows))
    return mat

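# The batch loop keeps the dense (batch, n sents) similarity block small while
# the stacked result remains sparse. A sanity-check sketch (toy matrix, values
# are illustrative):
#
#   >>> x = csr_matrix(np.array([[1, 1, 0], [1, 1, 0], [0, 0, 1]]))
#   >>> numpy_cosine_similarity_matrix(x, min_sim=0.5, verbose=False).toarray()
#   array([[1., 1., 0.],
#          [1., 1., 0.],
#          [0., 0., 1.]])
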
def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    n_rows, n_cols = x.shape

    # Boolean term-occurrence matrix; counts are replaced with 1s
    rows, cols = x.nonzero()
    data = np.ones(rows.shape[0])
    z = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    # Log of sentence lengths; too-short sentences get a large dummy length
    # so their similarity to any other sentence falls below min_sim
    size = np.asarray(x.sum(axis=1)).reshape(-1)
    size[np.where(size <= min_length)] = 10000
    size = np.log(size)

    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):

        # slicing
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))

        # dot product counts the shared vocabulary of each sentence pair
        inner = z[b:e, :] * z.transpose()

        # norm[i,j] = 1 / (log |s_i| + log |s_j|)
        norm = size[b:e].reshape(-1, 1) + size.reshape(1, -1)
        norm = norm ** (-1)
        norm[np.where(norm == np.inf)] = 0

        # normalize and keep pairs that pass the threshold
        sim = inner.multiply(norm).tocsr()
        rows, cols = (sim >= min_sim).nonzero()
        data = np.asarray(sim[rows, cols]).reshape(-1)

        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(b, n_rows), end='')

    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating textrank sentence similarity was done with {} sents'.format(n_rows))

    return mat

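# The function above is a vectorized form of the TextRank sentence similarity
# (Mihalcea & Tarau, 2004):
#
#   sim(s1, s2) = |{w : w in s1 and w in s2}| / (log |s1| + log |s2|)
#
# `inner` counts the shared vocabulary of every pair at once via the boolean
# term-matrix product, and `norm` supplies the 1 / (log |s_i| + log |s_j|) factor.
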
def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    if similarity == 'cosine':
        similarity = cosine_sent_sim
    elif not callable(similarity):
        similarity = textrank_sent_sim

    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        for j, tokens_j in enumerate(tokens):
            # compute each pair only once (i < j)
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    if verbose:
        print('\rsentence graph was constructed from {} sents'.format(n_sents))
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

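# Note that the double loop fills only i < j, so the returned graph is strictly
# upper-triangular. A symmetric graph, if needed, is one transpose away
# (a follow-up sketch, not part of this diff):
#
#   >>> g = graph_with_python_sim(tokens, verbose=False,
#   ...                           similarity='cosine', min_sim=0.3)
#   >>> g = g + g.T   # undirected similarity graph
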
def textrank_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base

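# Worked example: sentences of 4 and 5 tokens sharing 2 tokens give
# 2 / (log 4 + log 5) = 2 / (1.386 + 1.609) ≈ 0.668:
#
#   >>> textrank_sent_sim(['a', 'b', 'c', 'd'], ['a', 'b', 'e', 'f', 'g'])
#   0.6676...
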
def cosine_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    if (not s1) or (not s2):
        return 0

    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = sum(v * s2.get(k, 0) for k, v in s1.items())
    return prod / (norm1 * norm2)
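# Worked example: with counts s1 = {a: 2, b: 1} and s2 = {a: 1, c: 1}, the dot
# product is 2 and the norms are sqrt(5) and sqrt(2), so 2 / sqrt(10) ≈ 0.632:
#
#   >>> cosine_sent_sim(['a', 'a', 'b'], ['a', 'c'])
#   0.6324...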