Showing
1 changed file
with
46 additions
and
0 deletions
src/pickle_to_csv.py
0 → 100644
1 | +import re, csv, pickle | ||
2 | +from song import * | ||
3 | +from PyKomoran import * | ||
4 | +from textrank import KeywordSummarizer | ||
5 | + | ||
def komoran_tokenize(sent):
    """Select the keyword-candidate tokens from a POS-tagged sentence.

    The sentence is split on whitespace and a token is kept when its text
    contains '/NNP', '/NNG' or '/SL' (presumably Komoran's proper-noun,
    common-noun and foreign-word tags -- confirm against the tag set).

    Args:
        sent: Whitespace-separated string of tagged tokens.

    Returns:
        List of the matching tokens, in their original order.
    """
    wanted_tags = ('/NNP', '/NNG', '/SL')
    return [
        token
        for token in sent.split()
        if any(tag in token for tag in wanted_tags)
    ]
10 | + | ||
# Load every pickled batch of song records into a single list.
# The range below yields the file stems 1112, 1314, 1516, 1718 and 1920.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only run this against pickle files that were produced locally.
data = []
for filename in range(1112, 2122, 202):
    with open(str(filename) + '.pickle', 'rb') as f:
        data.extend(pickle.load(f))

komoran = Komoran('STABLE')

# The summarizer takes the same configuration for every song, so it is built
# once here instead of once per song.
# NOTE(review): assumes summarize() retrains from the sentences it is given on
# each call -- confirm against the textrank package.
keyword_extractor = KeywordSummarizer(
    tokenize=komoran_tokenize,
    window=-1,
    verbose=False,
)

# The context manager closes data.csv even if a song fails midway, so rows
# already written are not lost in an unflushed buffer.
with open('data.csv', 'w', newline='', encoding='UTF-8') as f:
    wr = csv.writer(f)

    for song in data:
        # Clean the title: drop everything from the first '(' onwards.
        idx = song.title.find('(')
        if idx != -1:
            song.title = song.title[:idx]

        # Clean the lyrics: tag each line with Komoran and keep non-empty
        # results. Songs without lyrics, and the one hard-coded title below,
        # are skipped.
        if song.lyrics != '' and song.title != '거꾸로 걷는다':
            sents = []
            for text in song.lyrics.split('\n'):
                tokened_text = komoran.get_plain_text(text)
                if tokened_text != '':
                    sents.append(tokened_text)

            if sents:
                keywords = keyword_extractor.summarize(sents, topk=5)
                # The first element of each keyword entry is a tagged token;
                # strip the trailing '/TAG'. rsplit leaves a token without any
                # '/' untouched, whereas slicing at find('/') == -1 would chop
                # off its last character.
                song.keywords = [kw[0].rsplit('/', 1)[0] for kw in keywords]

        # Every song is written out and has its image saved, whether or not
        # keywords were extracted for it.
        wr.writerow(song.getRow())
        song.saveImg()
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment