pickle_to_csv_song.py
1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re, csv, pickle, nltk
from song import *
from PyKomoran import *
from textrank import KeywordSummarizer
#nltk.download('averaged_perceptron_tagger')
def komoran_tokenize(sent):
    """Filter a Komoran-tagged sentence down to content-word tokens.

    Tokens are 'word/TAG' strings. English tokens tagged '/SL' are
    re-tagged with NLTK's POS tagger, and a plain 'NN' tag is promoted
    to 'NNP' so it survives the filter below. Only tokens whose tag
    contains NNP, NNG, FW, or JJ are kept.
    """
    tokens = sent.split()
    for pos, token in enumerate(tokens):
        # Re-tag foreign (usually English) tokens; the length guard skips
        # degenerate one-character words such as 'a/SL'.
        if token.endswith('/SL') and len(token) > 4:
            word, tag = nltk.pos_tag(nltk.word_tokenize(token[:-3]))[0]
            retagged = word + '/' + tag
            if retagged.endswith('/NN'):
                retagged += 'P'
            tokens[pos] = retagged
    wanted = ('/NNP', '/NNG', '/FW', '/JJ')
    return [t for t in tokens if any(tag in t for tag in wanted)]
# Collect Song objects from the numbered pickle files under data/.
data = []
for stem in range(1112, 2122, 202):
    # NOTE(review): pickle.load on these files assumes they are trusted.
    with open('data/' + str(stem) + '.pickle', 'rb') as fh:
        data.extend(pickle.load(fh))
# Clean each song's title and lyrics, extract keywords via TextRank,
# then write one CSV row per song and save its album image.
komoran = Komoran('STABLE')
# The extractor holds only configuration, so build it once instead of
# re-creating it on every loop iteration (loop-invariant hoist).
keyword_extractor = KeywordSummarizer(
    tokenize=komoran_tokenize,
    window=-1,
    verbose=False,
)
# 'with' guarantees the CSV handle is closed even if an iteration raises;
# the original open()/close() pair leaked the handle on any exception.
with open('dataaaa.csv', 'w', newline='', encoding='UTF-8') as f:
    wr = csv.writer(f)
    for song in data:
        # Title cleanup: drop everything from the first '(' onward.
        idx = song.title.find('(')
        if idx != -1:
            song.title = song.title[:idx]
        # Lyrics cleanup: skip empty lyrics and one known-bad title.
        if song.lyrics != '' and song.title != '거꾸로 걷는다':
            # POS-tag each lyric line, keeping non-empty results.
            sents = []
            for line in song.lyrics.split('\n'):
                tagged = komoran.get_plain_text(line)
                if tagged != '':
                    sents.append(tagged)
            # TextRank needs more than 5 sentences to give stable keywords.
            if len(sents) > 5:
                keywords = keyword_extractor.summarize(sents, topk=5)
                # Each keyword is ('word/TAG', rank); keep the bare word.
                song.keywords = [kw[:kw.find('/')] for kw, _rank in keywords]
        wr.writerow(song.getRow())
        song.saveImg()