
[Update] 자막 요약 알고리즘 구현 완료

from pytube import YouTube
from PyKomoran import *
from xml.etree import ElementTree
from textrank import KeywordSummarizer
from textrank import KeysentenceSummarizer
import numpy as np
from operator import itemgetter
def komoran_tokenize(sent):
    """Keep only the content-word tokens of a Komoran-tagged sentence.

    Args:
        sent: A string of whitespace-separated tokens, each expected to
            look like ``morpheme/TAG`` (the format this script gets from
            ``Komoran.get_plain_text``).

    Returns:
        The tokens, in their original order, that contain at least one of
        the tag markers ``/NN``, ``/XR``, ``/VA`` or ``/VV`` (presumably
        the noun, root, adjective and verb tag families of the Komoran
        tagset -- confirm against the tagset table). An empty or
        whitespace-only input yields an empty list.
    """
    # Substring match on purpose: '/NN' also accepts '/NNG', '/NNP', ...
    kept_tags = ('/NN', '/XR', '/VA', '/VV')
    return [w for w in sent.split() if any(tag in w for tag in kept_tags)]
# Fetch the Korean captions of a YouTube video as XML.
topk_size = 30
video_url = ''  # placeholder: set to the URL of the video to summarize
yt = YouTube(video_url)
title = yt.title
caption = yt.captions.get_by_language_code('ko')
if caption is None:
    # Without this guard the next line fails with an opaque AttributeError.
    raise ValueError('No Korean (ko) captions are available for this video')
caption_xml = caption.xml_captions

# Parse the XML into a list of strings, one per <text> element.
root = ElementTree.fromstring(caption_xml)
print(root.tag, root.attrib)
texts = []
for child in root.findall("text"):
    # An empty <text/> element has ``text`` set to None; skip it instead of
    # crashing on ``None.replace``.
    if child.text:
        texts.append(child.text.replace('\n', ' '))
if not texts:
    # ``bias[0] = 5`` below would otherwise raise IndexError on an empty array.
    raise ValueError('The caption track contains no text to summarize')
# Same caption lines as ``texts``; kept as a separate list under the older name.
sentences = list(texts)

# NOTE(review): a list has no ``.size()``; ``len()`` is used instead.
# NOTE(review): with topk_size == 30 this evaluates to roughly 3.3x the number
# of caption lines, so every line ends up in the summary. If 30 is meant as a
# percentage, the intended formula is probably
# ``len(texts) * topk_size // 100`` -- confirm before changing.
topk_size = len(texts) * 100 // topk_size

# Split each caption line into morphemes and tag them with Komoran.
komoran = Komoran('STABLE')
sents = [komoran.get_plain_text(text) for text in texts]

# Keyword extraction over the tagged sentences.
keyword_extractor = KeywordSummarizer(
    tokenize=komoran_tokenize,
    window=-1,
    verbose=False,
)
keywords = keyword_extractor.summarize(sents, topk=30)

# Key-sentence extraction over the raw caption lines.
summarizer = KeysentenceSummarizer(
    tokenize=lambda x: x.split(),
    min_sim=0.5,
    verbose=False,
)
# One bias weight per caption line; the first line is weighted 5x the others.
bias = np.ones(len(texts))
bias[0] = 5
keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
for _, _, sent in keysents:
    # The captions still carry the HTML entity for an apostrophe.
    # NOTE(review): the original loop discarded the cleaned sentence; printing
    # it is assumed to be the intended output.
    print(sent.replace('&#39;', "'"))
......@@ -183,7 +183,6 @@ class KeysentenceSummarizer:
raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
elif bias is not None:
raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
self.train_textrank(sents, bias)
idxs = self.R.argsort()[-topk:]
keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]