Showing
2 changed files
with
81 additions
and
4 deletions
src/lib/util.py
0 → 100644
| 1 | +from nltk.tokenize import word_tokenize | ||
| 2 | +import nltk | ||
| 3 | +import re | ||
| 4 | +from bs4 import BeautifulSoup | ||
| 5 | +import requests | ||
| 6 | + | ||
| 7 | + | ||
def get_HTML_from_url(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so an unresponsive host
            would block the caller forever.

    Returns:
        The response body decoded to ``str`` (``Response.text``). The HTTP
        status code is not checked, so error pages are returned as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses.
    """
    return requests.get(url, timeout=timeout).text
| 10 | + | ||
| 11 | + | ||
def get_text_from_HTML(html):
    """Return the text of an HTML document with boilerplate elements removed.

    ``script``, ``style``, ``header``, ``footer`` and ``form`` elements are
    removed from the tree before the text is collected.

    Args:
        html: HTML markup to parse.

    Returns:
        The remaining text, with each text fragment stripped of surrounding
        whitespace and the fragments joined by newlines.
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the same markup could
    # produce different trees on different machines.
    soup = BeautifulSoup(html, 'html.parser')

    unwanted = soup.find_all(['script', 'style', 'header', 'footer', 'form'])
    for node in unwanted:
        node.extract()

    return soup.get_text('\n', strip=True)
| 20 | + | ||
| 21 | + | ||
| 22 | +# def get_HTML_from_regexp_url(url_pattern): | ||
| 23 | + | ||
| 24 | + | ||
def is_string(target):
    """Return True when ``target`` is a ``str`` (or an instance of a subclass).

    ``isinstance`` is used instead of comparing ``type(target)`` so that
    ``str`` subclasses are also accepted.
    """
    return isinstance(target, str)
| 27 | + | ||
| 28 | + | ||
def cut_corpus(corpus):
    """Split ``corpus`` into sentence fragments on ``'.'``.

    Args:
        corpus: Text to split. Anything that is not a ``str`` yields ``[]``.

    Returns:
        The pieces between periods, in order, without the periods themselves.
        Whitespace inside the pieces is kept. The piece after the last period
        is dropped only when it is blank, so a final sentence that lacks a
        terminating period is not lost.
    """
    if not isinstance(corpus, str):
        return []

    fragments = corpus.split('.')
    # ``split`` always yields at least one item; when the text ends with a
    # period (optionally followed by whitespace) the last item is just the
    # blank remainder and carries no sentence.
    if not fragments[-1].strip():
        fragments.pop()
    return fragments
| 33 | + | ||
| 34 | + | ||
def postag_sentence(sentence):
    """Tokenize ``sentence`` and return its part-of-speech tags.

    Args:
        sentence: Text to tag.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word tokens of
        ``sentence``, or ``[]`` when ``sentence`` is not a string.
    """
    if is_string(sentence):
        tokens = word_tokenize(sentence)
        return nltk.pos_tag(tokens)
    return []
| 40 | + | ||
| 41 | + | ||
def find_verb_idx(tags):
    """Return the index of the first verb in ``tags``.

    Args:
        tags: Sequence of ``(word, pos_tag)`` pairs, as returned by
            ``postag_sentence``.

    Returns:
        The position of the first pair whose POS tag starts with ``'V'``,
        or ``-1`` when there is no such pair.
    """
    for idx, (_, pos) in enumerate(tags):
        # The tag, not the word, identifies a verb; tags such as VB/VBD/VBZ
        # all share the leading 'V'.
        if pos.startswith('V'):
            return idx
    return -1
| 50 | + | ||
| 51 | + | ||
def make_be_verb(subj):
    """Return the present-tense form of "be" that agrees with ``subj``.

    Args:
        subj: Subject of the sentence.

    Returns:
        ``'am'`` for ``'I'``; ``'are'`` for ``you``/``we``/``they`` (lower
        case or capitalized); ``'is'`` for every other subject.
    """
    if subj == 'I':
        return 'am'
    # Plural pronouns take "are" just like "you".
    if subj in ('You', 'you', 'We', 'we', 'They', 'they'):
        return 'are'
    return 'is'
| 59 | + | ||
| 60 | + | ||
def cut_quot(sentence):
    """Return ``sentence`` with every quote character removed.

    Single quotes (``'``), double quotes (``"``) and backticks are deleted;
    all other characters are left untouched.
    """
    # Raw string: a backslash-backtick in a normal string literal is an
    # invalid escape sequence and makes newer Python versions warn.
    return re.sub(r"['\"`]", '', sentence)
| 63 | + | ||
| 64 | + | ||
def make_brace_triple(target_str, brace_tags):
    """Build a ``[subject, predicate, object]`` triple for a bracketed phrase.

    Known exceptional inputs noted by the original author:
      1. The brace is never closed (not detected by this function).
      2. ``target_str`` is missing, in which case ``[]`` is returned.

    Args:
        target_str: Subject of the triple.
        brace_tags: Sequence of ``(word, pos_tag)`` pairs for the words of
            the bracketed phrase.

    Returns:
        ``[]`` when ``target_str`` is empty or ``None``. Otherwise a
        three-item list: the subject, the predicate word, and the object
        words joined by single spaces. When the phrase contains a verb, that
        verb is the predicate and the words after it form the object;
        otherwise the predicate is the matching form of "be" and the whole
        phrase is the object.
    """
    if not target_str:
        return []

    subj = target_str
    idx = find_verb_idx(brace_tags)
    if idx != -1:
        # Take only the word of the (word, tag) pair so the predicate is a
        # plain string in both branches, and start the object after the verb
        # so the verb is not repeated inside it.
        pred = brace_tags[idx][0]
        obj = ' '.join(word for word, _ in brace_tags[idx + 1:])
    else:
        pred = make_be_verb(subj)
        obj = ' '.join(word for word, _ in brace_tags)
    return [subj, pred, obj]
-
Please register or login to post a comment