[Add] data preprocessing code

GyuhoLee
Commit 0c3fc8179d9937f4dae76620cf44a2e7c4b715e7 0c3fc817 1 parent a95a62fb
Showing 1 changed file with 46 additions and 0 deletions
src/pickle_to_csv.py
--- a/src/pickle_to_csv.py 0 → 100644
View file @0c3fc81
+++ b/src/pickle_to_csv.py 0 → 100644
View file @0c3fc81
+import re, csv, pickle
+from song import *
+from PyKomoran import *
+from textrank import KeywordSummarizer
+
+def komoran_tokenize(sent):
+    """Return the tokens of a tagged sentence that carry a wanted tag.
+
+    ``sent`` is split on whitespace and a token is kept, in its original
+    order and with its tag still attached, when it contains any of the
+    substrings ``/NNP``, ``/NNG`` or ``/SL`` (presumably Komoran's
+    proper-noun, common-noun and foreign-word tags -- confirm against the
+    Komoran tag set).
+    """
+    wanted_tags = ('/NNP', '/NNG', '/SL')
+    kept = []
+    for token in sent.split():
+        # Substring match, not suffix match, to mirror the original filter.
+        if any(tag in token for tag in wanted_tags):
+            kept.append(token)
+    return kept
+
+# Load every pickled batch of song records into one flat list.
+# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
+# only run this script against pickle files produced by this project.
+data = []
+for filename in range(1112, 2122, 202):
+    # The range yields 1112, 1314, 1516, 1718 and 1920, i.e. the files
+    # 1112.pickle ... 1920.pickle in the current working directory.
+    with open(str(filename)+'.pickle', 'rb') as f:
+        tmp = pickle.load(f)
+    data.extend(tmp)
+
+# The context manager guarantees data.csv is flushed and closed even when
+# tokenizing, summarizing or saveImg() raises part-way through the run.
+with open('data.csv', 'w', newline='', encoding='UTF-8') as f:
+    wr = csv.writer(f)
+    komoran = Komoran('STABLE')
+    # The summarizer configuration is the same for every song, so build it
+    # once instead of once per loop iteration.
+    # NOTE(review): this relies on summarize() retraining from scratch on
+    # each call -- confirm against the textrank version in use.
+    keyword_extractor = KeywordSummarizer(
+        tokenize = komoran_tokenize,
+        window = -1,
+        verbose = False
+    )
+
+    for entry in data:
+        # Clean up the title: drop everything from the first '(' onwards.
+        # NOTE(review): whitespace preceding the '(' is kept as-is.
+        idx = entry.title.find('(')
+        if idx != -1:
+            entry.title = entry.title[:idx]
+
+        # Clean up the lyrics and derive keywords from them. Songs without
+        # lyrics, and the one hard-coded title below, are skipped.
+        if entry.lyrics != '' and entry.title != '거꾸로 걷는다':
+            sents = []
+            for text in entry.lyrics.split('\n'):
+                tokened_text = komoran.get_plain_text(text)
+                if tokened_text != '':
+                    sents.append(tokened_text)
+            if sents:
+                keywords = keyword_extractor.summarize(sents, topk=5)
+                # Each keyword is a "surface/TAG" token; cut at the LAST
+                # slash so a surface form that itself contains '/' is not
+                # truncated.
+                entry.keywords = [kw[0].rpartition('/')[0] for kw in keywords]
+
+        # Every record is written and its image saved, with or without
+        # extracted keywords.
+        wr.writerow(entry.getRow())
+        entry.saveImg()
\ No newline at end of file