정의동

Add: source lib 추가

1 -# OS Generated files
2 -*.DS_Store
3 -*.DS_Store?
4 -._*
...\ No newline at end of file ...\ No newline at end of file
1 +**/__pycache__
......
1 +from nltk.tokenize import word_tokenize
2 +import nltk
3 +import re
4 +from bs4 import BeautifulSoup
5 +import requests
6 +
7 +
def get_HTML_from_url(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
10 +
11 +
def get_text_from_HTML(html):
    """Return the visible text of an HTML document.

    ``script``, ``style``, ``header``, ``footer`` and ``form`` elements are
    removed from the parsed tree before the text is collected.

    Args:
        html: HTML markup to parse.

    Returns:
        The remaining text, with each text fragment stripped of surrounding
        whitespace and the fragments joined by newlines.
    """
    # Name the parser explicitly: otherwise bs4 emits a warning and picks
    # whichever parser happens to be installed, so results vary by machine.
    soup = BeautifulSoup(html, 'html.parser')

    unwanted = soup.find_all(['script', 'style', 'header', 'footer', 'form'])
    for element in unwanted:
        element.extract()

    return soup.get_text('\n', strip=True)
20 +
21 +
22 +# def get_HTML_from_regexp_url(url_pattern):
23 +
24 +
def is_string(target):
    """Return True when *target* is a ``str`` (or an instance of a subclass).

    ``isinstance`` is used instead of an exact ``type(...) ==`` comparison so
    that ``str`` subclasses are accepted as strings too.
    """
    return isinstance(target, str)
27 +
28 +
def cut_corpus(corpus):
    """Split *corpus* on '.' and return every piece except the last one.

    The final piece (whatever follows the last period, possibly an empty
    string) is always discarded. A non-string *corpus* yields an empty list.
    """
    if is_string(corpus):
        pieces = corpus.split('.')
        # str.split always returns at least one element, so pop() is safe.
        pieces.pop()
        return pieces
    return []
33 +
34 +
def postag_sentence(sentence):
    """Tokenize *sentence* and return the result of ``nltk.pos_tag`` on it.

    The sentence is split with ``word_tokenize`` and the resulting tokens are
    passed to ``nltk.pos_tag``. A non-string *sentence* yields an empty list.
    """
    if is_string(sentence):
        return nltk.pos_tag(word_tokenize(sentence))
    return []
40 +
41 +
# Returns the index of the verb.
# If there is none, returns -1.
def find_verb_idx(tags):
    """Return the position of the first verb in *tags*, or -1 if none.

    Args:
        tags: Sequence of ``(word, pos_tag)`` pairs, the shape this file gets
            from ``nltk.pos_tag``.

    Returns:
        Index of the first pair whose POS tag starts with ``'V'``, or -1 when
        no such pair exists (including for an empty sequence).
    """
    for idx, (_, pos) in enumerate(tags):
        # Inspect the POS tag, not the word; a tag starting with 'V' counts
        # as a verb here.
        if pos.startswith('V'):
            return idx
    return -1
50 +
51 +
def make_be_verb(subj):
    """Return the present-tense form of "be" that agrees with *subj*.

    Args:
        subj: The subject of the sentence.

    Returns:
        ``'am'`` for ``'I'``; ``'are'`` for the pronouns "you", "we" and
        "they" in any letter case; ``'is'`` for everything else.
    """
    if subj == 'I':
        return 'am'
    # "we"/"they" take "are" just like "you"; compare case-insensitively so
    # sentence-initial capitals are covered as well.
    if isinstance(subj, str) and subj.lower() in ('you', 'we', 'they'):
        return 'are'
    return 'is'
59 +
60 +
def cut_quot(sentence):
    """Return *sentence* with every quote character removed.

    The characters removed are the single quote ('), the double quote (")
    and the backtick (`).
    """
    # Raw string: none of these characters needs escaping inside a character
    # class, and "\`" in a normal string is an invalid escape sequence.
    return re.sub(r"""['"`]""", '', sentence)
63 +
64 +
# Exceptional cases
# 1. the brace is not closed
# 2. target_str is missing
def make_brace_triple(target_str, brace_tags):
    """Build a ``[subject, predicate, object]`` triple.

    Args:
        target_str: The subject of the triple.
        brace_tags: Sequence of ``(word, pos_tag)`` pairs that supplies the
            predicate and object.

    Returns:
        An empty list when *target_str* is the empty string. Otherwise a
        three-element list of strings:

        * if ``find_verb_idx`` locates a verb in *brace_tags*, the predicate
          is that verb's word and the object is the words after it, joined
          by spaces;
        * otherwise the predicate comes from ``make_be_verb(target_str)`` and
          the object is all words of *brace_tags*, joined by spaces.
    """
    if target_str == '':
        return []

    verb_idx = find_verb_idx(brace_tags)
    if verb_idx == -1:
        pred = make_be_verb(target_str)
        obj_tokens = brace_tags
    else:
        # Keep only the word so the predicate is a string in both branches,
        # and start the object after the verb so the verb is not repeated.
        pred = brace_tags[verb_idx][0]
        obj_tokens = brace_tags[verb_idx + 1:]

    obj = ' '.join(word for word, _ in obj_tokens)
    return [target_str, pred, obj]