GyuhoLee

[Add] data preprocessing code

1 +import re, csv, pickle
2 +from song import *
3 +from PyKomoran import *
4 +from textrank import KeywordSummarizer
5 +
def komoran_tokenize(sent):
    """Keep only the NNP-, NNG- and SL-tagged tokens of a tagged sentence.

    Args:
        sent: A string of whitespace-separated tokens, each expected to look
            like ``morpheme/TAG``.

    Returns:
        A list of the tokens (tag still attached) whose tag is ``NNP``,
        ``NNG`` or ``SL``, in their original order. Empty input gives ``[]``.
    """
    # The tag is the suffix after the last '/', so match on the suffix: a
    # substring test would also accept a token whose morpheme text merely
    # contains e.g. "/SL" while carrying a different tag.
    return [w for w in sent.split() if w.endswith(('/NNP', '/NNG', '/SL'))]
10 +
# Load every pickled song list (1112.pickle, 1314.pickle, 1516.pickle,
# 1718.pickle, 1920.pickle) into one flat list.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this on pickle files you produced yourself.
data = []
for filename in range(1112, 2122, 202):
    with open(str(filename) + '.pickle', 'rb') as f:
        data.extend(pickle.load(f))

komoran = Komoran('STABLE')

# The extractor's configuration never changes, so build it once instead of
# once per song. NOTE(review): relies on summarize() recomputing everything
# from the sentences it is given on each call — confirm against the installed
# textrank version.
keyword_extractor = KeywordSummarizer(
    tokenize=komoran_tokenize,
    window=-1,
    verbose=False,
)

# The context manager closes data.csv even if a song raises midway.
with open('data.csv', 'w', newline='', encoding='UTF-8') as f:
    wr = csv.writer(f)
    for entry in data:
        # Clean the title: drop everything from the first '(' onwards.
        entry.title = entry.title.partition('(')[0]

        # Clean the lyrics and extract keywords. The song titled
        # '거꾸로 걷는다' is skipped explicitly; the reason is not recorded
        # in this file.
        if entry.lyrics != '' and entry.title != '거꾸로 걷는다':
            sents = []
            for text in entry.lyrics.split('\n'):
                tokened_text = komoran.get_plain_text(text)
                if tokened_text != '':
                    sents.append(tokened_text)
            if sents:
                keywords = keyword_extractor.summarize(sents, topk=5)
                # Strip the trailing "/TAG" from each keyword. rsplit keeps
                # the whole word when there is no '/', whereas slicing with
                # find() would drop the last character (find() == -1) or cut
                # a morpheme that itself contains '/'.
                entry.keywords = [kw[0].rsplit('/', 1)[0] for kw in keywords]

        wr.writerow(entry.getRow())
        entry.saveImg()
...\ No newline at end of file ...\ No newline at end of file