util.py
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import re
from bs4 import BeautifulSoup
from newspaper import Article
import requests
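
# NOTE: word_tokenize and pos_tag require NLTK data packages; depending on the
# NLTK version these are installed with something like:
#   nltk.download('punkt')
#   nltk.download('averaged_perceptron_tagger')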

def get_HTML_from_url(url):
    return requests.get(url).text

def get_text_from_HTML(html):
    # An explicit parser avoids BeautifulSoup's "no parser specified" warning.
    soup = BeautifulSoup(html, 'html.parser')
    # Strip non-content elements before extracting the visible text.
    for tag in soup.find_all(['script', 'style', 'header', 'footer', 'form']):
        tag.extract()
    content = soup.get_text('\n', strip=True)
    return content
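
# Usage sketch (assumes network access; the URL is only an example):
#   html = get_HTML_from_url('https://example.com')
#   text = get_text_from_HTML(html)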
# def get_HTML_from_regexp_url(url_pattern):

def is_string(target):
    return isinstance(target, str)

def cut_corpus(corpus):
    if not is_string(corpus):
        return []
    # Naive split on periods; drops any trailing fragment after the last '.'.
    return corpus.split('.')[:-1]
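
# Note: sent_tokenize (imported above) handles abbreviations and other edge
# cases better than splitting on '.', e.g.:
#   sent_tokenize('Dr. Kim lives in Seoul. He works there.')
#   -> ['Dr. Kim lives in Seoul.', 'He works there.']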

def postag_sentence(sentence):
    if not is_string(sentence):
        return []
    tokens = word_tokenize(sentence)
    return nltk.pos_tag(tokens)
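
# Example: postag_sentence('Seoul is the capital of Korea.') returns roughly
#   [('Seoul', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'),
#    ('of', 'IN'), ('Korea', 'NNP'), ('.', '.')]
# Each element is a (word, tag) pair; Penn Treebank verb tags start with 'V'.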

# Returns the index of the first verb in a (word, tag) list.
# Returns -1 if no verb is found.
def find_verb_idx(tags):
    for idx, tag in enumerate(tags):
        # pos_tag yields (word, tag) pairs, so inspect the tag, not the word.
        if tag[1].startswith('V'):
            return idx
    return -1

def make_be_verb(subj):
    # Choose a present-tense form of 'be' to match the subject.
    if subj == 'I':
        return 'am'
    elif subj in ['You', 'you']:
        return 'are'
    else:
        return 'is'

def cut_quot(sentence):
    # Remove single quotes, double quotes, and backticks.
    return re.sub(r"['\"`]", '', sentence)

# Known edge cases:
# 1. the brace is never closed
# 2. target_str is empty
def make_brace_triple(target_str, brace_tags):
    if target_str == '':
        return []
    idx = find_verb_idx(brace_tags)
    subj = target_str
    pred = ''
    if idx != -1:
        # Use the verb itself as the predicate and everything after it as the object.
        pred = brace_tags[idx][0]
        obj = ' '.join([word for word, _ in brace_tags[idx + 1:]])
    else:
        # No verb found: synthesize a 'be' verb to link subject and object.
        pred = make_be_verb(subj)
        obj = ' '.join([word for word, _ in brace_tags])
    return [subj, pred, obj]
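
# Example: with no verb among the tags, a 'be' verb is synthesized:
#   make_brace_triple('Seoul', [('the', 'DT'), ('capital', 'NN')])
#   -> ['Seoul', 'is', 'the capital']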

# Sample URL for manual testing.
url = 'https://en.wikipedia.org/wiki/Korea'

def get_bodytext_from_url(url):
    news = Article(url, language='en')
    news.download()
    news.parse()
    text = news.text
    # Strip bracketed citation markers such as [1] or [citation needed].
    pattern = r'\[[^]]*\]'
    text = re.sub(pattern=pattern, repl='', string=text)
    return text
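
# Minimal end-to-end sketch; assumes network access and the NLTK data packages
# noted near the imports.
if __name__ == '__main__':
    body = get_bodytext_from_url(url)
    for sentence in cut_corpus(body)[:3]:
        print(postag_sentence(sentence))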