GyuhoLee

[Update] 자막 요약 알고리즘 구현 완료

...@@ -19,7 +19,11 @@ ...@@ -19,7 +19,11 @@
19 <select /> 19 <select />
20 </component> 20 </component>
21 <component name="ChangeListManager"> 21 <component name="ChangeListManager">
22 - <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" /> 22 + <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="">
23 + <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
24 + <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" />
25 + <change beforePath="$PROJECT_DIR$/textrank/summarizer.py" beforeDir="false" afterPath="$PROJECT_DIR$/textrank/summarizer.py" afterDir="false" />
26 + </list>
23 <option name="SHOW_DIALOG" value="false" /> 27 <option name="SHOW_DIALOG" value="false" />
24 <option name="HIGHLIGHT_CONFLICTS" value="true" /> 28 <option name="HIGHLIGHT_CONFLICTS" value="true" />
25 <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> 29 <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
...@@ -103,21 +107,25 @@ ...@@ -103,21 +107,25 @@
103 <screen x="0" y="0" width="1920" height="1040" /> 107 <screen x="0" y="0" width="1920" height="1040" />
104 </state> 108 </state>
105 <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" /> 109 <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" />
106 - <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1604306110978"> 110 + <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1607070330004">
107 - <screen x="1920" y="0" width="1920" height="1040" /> 111 + <screen x="0" y="0" width="1920" height="1040" />
108 </state> 112 </state>
113 + <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
109 <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> 114 <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
110 - <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1604306110978"> 115 + <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1607070330004">
111 - <screen x="1920" y="0" width="1920" height="1040" /> 116 + <screen x="0" y="0" width="1920" height="1040" />
112 </state> 117 </state>
118 + <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
113 <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> 119 <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
114 - <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1604306110978"> 120 + <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1607070330004">
115 - <screen x="1920" y="0" width="1920" height="1040" /> 121 + <screen x="0" y="0" width="1920" height="1040" />
116 </state> 122 </state>
123 + <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
117 <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> 124 <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
118 - <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1604306110978"> 125 + <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1607070330004">
119 - <screen x="1920" y="0" width="1920" height="1040" /> 126 + <screen x="0" y="0" width="1920" height="1040" />
120 </state> 127 </state>
128 + <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
121 <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> 129 <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
122 <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485"> 130 <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485">
123 <screen x="1920" y="0" width="1920" height="1040" /> 131 <screen x="1920" y="0" width="1920" height="1040" />
......
1 from pytube import YouTube 1 from pytube import YouTube
2 +from PyKomoran import *
2 from xml.etree import ElementTree 3 from xml.etree import ElementTree
4 +from textrank import KeywordSummarizer
5 +from textrank import KeysentenceSummarizer
6 +import numpy as np
7 +from operator import itemgetter
8 +
9 +def komoran_tokenize(sent):
10 + words = sent.split()
11 + words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)]
12 + return words
3 13
4 #youtube url의 자막 -> xml으로 가져오기 14 #youtube url의 자막 -> xml으로 가져오기
15 +topk_size = 30
5 video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318' 16 video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318'
6 yt = YouTube(video_url) 17 yt = YouTube(video_url)
7 title = yt.title 18 title = yt.title
...@@ -9,10 +20,38 @@ description = yt.description ...@@ -9,10 +20,38 @@ description = yt.description
9 caption = yt.captions.get_by_language_code('ko') 20 caption = yt.captions.get_by_language_code('ko')
10 caption_xml = caption.xml_captions 21 caption_xml = caption.xml_captions
11 22
12 -#xml -> string list로 파싱(문장별) 23 +#xml -> string list로 파싱
13 root = ElementTree.fromstring(caption_xml) 24 root = ElementTree.fromstring(caption_xml)
14 -sentences = [] 25 +texts = []
15 -print(root.tag, root.attrib) 26 +texts.append(title)
16 for child in root.findall("text"): 27 for child in root.findall("text"):
17 - sentences.append(child.text.replace('\n', ' '))
18 -print(sentences)
...\ No newline at end of file ...\ No newline at end of file
28 + text = child.text.replace('\n', ' ')
29 + texts.append(text)
30 +topk_size = texts.size() * 100 // topk_size
31 +
32 +#Komoran을 통해 형태소 단위로 분리 후 태깅
33 +komoran = Komoran('STABLE')
34 +sents = []
35 +for text in texts:
36 + tokened_text = komoran.get_plain_text(text)
37 + sents.append(tokened_text)
38 +
39 +keyword_extractor = KeywordSummarizer(
40 + tokenize = komoran_tokenize,
41 + window = -1,
42 + verbose = False
43 +)
44 +keywords = keyword_extractor.summarize(sents, topk=30)
45 +
46 +summarizer = KeysentenceSummarizer(
47 + tokenize = lambda x:x.split(),
48 + min_sim = 0.5,
49 + verbose = False
50 +)
51 +bias = np.ones(len(texts))
52 +bias[0] = 5
53 +keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
54 +keysents.sort(key=itemgetter(0))
55 +for _, _, sent in keysents:
56 + sent = sent.replace('&#39;', "'")
57 + print(sent)
......
No preview for this file type
No preview for this file type
No preview for this file type
...@@ -183,7 +183,6 @@ class KeysentenceSummarizer: ...@@ -183,7 +183,6 @@ class KeysentenceSummarizer:
183 raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape)) 183 raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
184 elif bias is not None: 184 elif bias is not None:
185 raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias))) 185 raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
186 -
187 self.train_textrank(sents, bias) 186 self.train_textrank(sents, bias)
188 idxs = self.R.argsort()[-topk:] 187 idxs = self.R.argsort()[-topk:]
189 keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)] 188 keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
......