jamo_utils.py
1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
from soynlp.hangle import compose, decompose, character_is_korean
# Matches any run of one or more whitespace characters. Written as a raw
# string so that ``\s`` reaches the regex engine untouched instead of being
# parsed as an (invalid) string escape, which newer Python versions warn about.
doublespace_pattern = re.compile(r'\s+')
def jamo_sentence(sent):
    """Return ``sent`` with each Korean character replaced by its jamo form.

    Every character for which ``character_is_korean`` is true is passed
    through ``decompose``; any ``' '`` element of the decomposition is
    rewritten as ``'-'`` and the elements are concatenated. All other
    characters are copied unchanged. Finally, every run of whitespace in the
    joined result is collapsed to a single space via ``doublespace_pattern``.

    NOTE(review): ``decompose`` presumably yields the character's jamo
    components with ``' '`` standing for an empty slot -- confirm against
    the soynlp implementation.
    """
    def _to_jamo(char):
        # A literal space is handed back untouched.
        if char == ' ':
            return char
        parts = decompose(char)
        # A one-element decomposition is returned exactly as received.
        if len(parts) == 1:
            return parts
        # Swap blank slots for '-' so they survive whitespace collapsing.
        return ''.join('-' if part == ' ' else part for part in parts)

    pieces = [
        _to_jamo(char) if character_is_korean(char) else char
        for char in sent
    ]
    return doublespace_pattern.sub(' ', ''.join(pieces))
def jamo_to_word(jamo):
    """Rebuild text from a jamo string by composing three-character chunks.

    The input is scanned left to right. A position whose character is not
    Korean (per ``character_is_korean``) contributes that single character;
    a Korean position starts a chunk of up to three characters. One-character
    chunks are copied to the output as they are; longer chunks are passed to
    ``compose``, with a ``'-'`` in the third position translated to ``' '``.

    A chunk of exactly two characters (possible only at the end of the
    input) raises ``IndexError`` when its third position is read.
    """
    # Phase 1: cut the input into single characters and Korean chunks.
    chunks = []
    pos = 0
    total = len(jamo)
    while pos < total:
        if character_is_korean(jamo[pos]):
            chunks.append(jamo[pos:pos + 3])
            pos += 3
        else:
            chunks.append(jamo[pos])
            pos += 1

    def _rebuild(chunk):
        # Single characters pass straight through.
        if len(chunk) == 1:
            return chunk
        last = chunk[2]
        # '-' is the placeholder for a blank third slot.
        return compose(chunk[0], chunk[1], " " if last == "-" else last)

    # Phase 2: turn every chunk back into output text.
    return "".join([_rebuild(chunk) for chunk in chunks])
def break_char(jamo_sentence):
    """Split a jamo string into a list of per-character pieces.

    Scanning left to right, a position whose character is Korean (per
    ``character_is_korean``) yields a slice of up to three characters and
    advances by three; any other position yields that single character and
    advances by one.

    Returns the list of pieces in input order.
    """
    pieces = []
    cursor = 0
    end = len(jamo_sentence)
    while cursor < end:
        if character_is_korean(jamo_sentence[cursor]):
            width = 3
            pieces.append(jamo_sentence[cursor:cursor + width])
        else:
            width = 1
            pieces.append(jamo_sentence[cursor])
        cursor += width
    return pieces