2020-2-capstone-design1 / HCG_project
Authored by GyuhoLee, 2020-11-08 21:06:24 +0900
Commit 1686df14f4a18b37580825fec87df90597107d79 (1 parent ae99d65c)
[Add] Add the class-based Summarizer
Showing 1 changed file with 190 additions and 0 deletions
src/textrank/summarizer.py 0 → 100644
import numpy as np

from .rank import pagerank
from .sentence import sent_graph
from .word import word_graph


class KeywordSummarizer:
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words to be used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means two words cooccur if they appear in the same sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        window=-1, min_cooccurrence=2, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        # Train immediately if sentences are given at construction time
        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term

        Returns
        -------
        None
        """
        # Build the word cooccurrence graph and keep the index -> word mapping
        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        # Rank words with PageRank; R[i] is the rank of word i
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        # Indices of the topk words, sorted by increasing rank
        idxs = self.R.argsort()[-topk:]
        # Reverse so that the highest-ranked word comes first
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents)
        return self.keywords(topk)
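For context, here is a minimal usage sketch for KeywordSummarizer, in the same doctest style as the docstrings in this file. The whitespace tokenizer and sample sentences are illustrative assumptions, not part of this commit:

>>> from textrank import KeywordSummarizer
>>> sents = ['the first sentence', 'the second sentence']  # hypothetical input
>>> tokenizer = lambda s: s.split()  # hypothetical whitespace tokenizer
>>> summarizer = KeywordSummarizer(tokenize=tokenizer, min_count=1, window=-1)
>>> summarizer.summarize(sents, topk=5)  # list of (word, rank), highest rank first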
class KeysentenceSummarizer:
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words to be used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in the sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        min_sim=0.3, similarity=None, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        # Train immediately if sentences are given at construction time
        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        None
        """
        # Build the sentence similarity graph
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        # Rank sentences with (biased) PageRank; R[i] is the rank of sentence i
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence)

        Usage
        -----
        >>> from textrank import KeysentenceSummarizer

        >>> summarizer = KeysentenceSummarizer(tokenize=tokenizer, min_sim=0.5)
        >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        # Validate the bias argument before training
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
        self.train_textrank(sents, bias)
        # Select the topk sentences, highest rank first
        idxs = self.R.argsort()[-topk:]
        keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
        return keysents
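And a sketch of KeysentenceSummarizer with the optional bias term, reusing the hypothetical tokenizer and sentences above. The bias array is passed straight through to pagerank; presumably it weights the restart distribution, so a larger value at index 0 would favor the leading sentence:

>>> import numpy as np
>>> from textrank import KeysentenceSummarizer
>>> summarizer = KeysentenceSummarizer(tokenize=tokenizer, min_sim=0.3)
>>> bias = np.ones(len(sents))
>>> bias[0] = 10  # hypothetical: prefer the first sentence
>>> summarizer.summarize(sents, topk=2, bias=bias)  # [(index, rank, sentence), ...]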