Toggle navigation
Toggle navigation
This project
Loading...
Sign in
2020-2-capstone-design1
/
HCG_project
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
GyuhoLee
2020-12-06 15:52:10 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
bfc394330c3d7b141641da625dceefd4292b6ab4
bfc39433
1 parent
e7624830
[Update] subtitle 메소드화
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
41 deletions
src/.idea/workspace.xml
src/subtitle.py
src/.idea/workspace.xml
View file @
bfc3943
...
...
@@ -19,7 +19,9 @@
<select
/>
</component>
<component
name=
"ChangeListManager"
>
<list
default=
"true"
id=
"b9decb0c-dc9e-4239-bdad-09ea8dd5179d"
name=
"Default Changelist"
comment=
""
/>
<list
default=
"true"
id=
"b9decb0c-dc9e-4239-bdad-09ea8dd5179d"
name=
"Default Changelist"
comment=
""
>
<change
beforePath=
"$PROJECT_DIR$/subtitle.py"
beforeDir=
"false"
afterPath=
"$PROJECT_DIR$/subtitle.py"
afterDir=
"false"
/>
</list>
<option
name=
"SHOW_DIALOG"
value=
"false"
/>
<option
name=
"HIGHLIGHT_CONFLICTS"
value=
"true"
/>
<option
name=
"HIGHLIGHT_NON_ACTIVE_CHANGELIST"
value=
"false"
/>
...
...
src/subtitle.py
View file @
bfc3943
...
...
@@ -11,47 +11,56 @@ def komoran_tokenize(sent):
words
=
[
w
for
w
in
words
if
(
'/NN'
in
w
or
'/XR'
in
w
or
'/VA'
in
w
or
'/VV'
in
w
)]
return
words
#youtube url의 자막 -> xml으로 가져오기
topk_size
=
30
video_url
=
'https://www.youtube.com/watch?v=ecUWKU_v318'
yt
=
YouTube
(
video_url
)
title
=
yt
.
title
description
=
yt
.
description
caption
=
yt
.
captions
.
get_by_language_code
(
'ko'
)
caption_xml
=
caption
.
xml_captions
def
subtitle
(
video_url
,
topk_size
):
#youtube url의 자막 -> xml으로 가져오기
topk_size
=
30
video_url
=
'https://www.youtube.com/watch?v=ecUWKU_v318'
yt
=
YouTube
(
video_url
)
title
=
yt
.
title
description
=
yt
.
description
caption
=
yt
.
captions
.
get_by_language_code
(
'ko'
)
caption_xml
=
caption
.
xml_captions
#xml -> string list로 파싱
root
=
ElementTree
.
fromstring
(
caption_xml
)
texts
=
[]
texts
.
append
(
title
)
for
child
in
root
.
findall
(
"text"
):
text
=
child
.
text
.
replace
(
'
\n
'
,
' '
)
texts
.
append
(
text
)
topk_size
=
texts
.
size
()
*
100
//
topk_size
#xml -> string list로 파싱
root
=
ElementTree
.
fromstring
(
caption_xml
)
texts
=
[]
texts
.
append
(
title
)
for
child
in
root
.
findall
(
"text"
):
text
=
child
.
text
.
replace
(
'
\n
'
,
' '
)
texts
.
append
(
text
)
topk_size
=
texts
.
size
()
*
100
//
topk_size
#Komoran을 통해 형태소 단위로 분리 후 태깅
komoran
=
Komoran
(
'STABLE'
)
sents
=
[]
for
text
in
texts
:
tokened_text
=
komoran
.
get_plain_text
(
text
)
sents
.
append
(
tokened_text
)
#Komoran을 통해 형태소 단위로 분리 후 태깅
komoran
=
Komoran
(
'STABLE'
)
sents
=
[]
for
text
in
texts
:
tokened_text
=
komoran
.
get_plain_text
(
text
)
sents
.
append
(
tokened_text
)
keyword_extractor
=
KeywordSummarizer
(
tokenize
=
komoran_tokenize
,
window
=
-
1
,
verbose
=
False
)
keywords
=
keyword_extractor
.
summarize
(
sents
,
topk
=
30
)
keyword_extractor
=
KeywordSummarizer
(
tokenize
=
komoran_tokenize
,
window
=
-
1
,
verbose
=
False
)
keywords
=
keyword_extractor
.
summarize
(
sents
,
topk
=
30
)
summarizer
=
KeysentenceSummarizer
(
tokenize
=
lambda
x
:
x
.
split
(),
min_sim
=
0.5
,
verbose
=
False
)
bias
=
np
.
ones
(
len
(
texts
))
bias
[
0
]
=
5
keysents
=
summarizer
.
summarize
(
texts
,
topk
=
topk_size
,
bias
=
bias
)
keysents
.
sort
(
key
=
itemgetter
(
0
))
for
_
,
_
,
sent
in
keysents
:
sent
=
sent
.
replace
(
'''
,
"'"
)
print
(
sent
)
summarizer
=
KeysentenceSummarizer
(
tokenize
=
lambda
x
:
x
.
split
(),
min_sim
=
0.5
,
verbose
=
False
)
bias
=
np
.
ones
(
len
(
texts
))
bias
[
0
]
=
5
keysents
=
summarizer
.
summarize
(
texts
,
topk
=
topk_size
,
bias
=
bias
)
keysents
.
sort
(
key
=
itemgetter
(
0
))
first
=
True
ret
=
''
for
_
,
_
,
sent
in
keysents
:
sent
=
sent
.
replace
(
'''
,
"'"
)
ret
=
ret
+
sent
if
first
:
ret
+=
'
\n
'
first
=
False
else
:
ret
+=
' '
return
ret
;
...
...
Please
register
or
login
to post a comment