신은섭(Shin Eun Seop)

first commit

File mode changed
# -*- coding: utf-8 -*-

"""
Copyright 2018 NAVER Corp.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
19 +
# Jamo tables. Order matters: the decomposition functions in this module
# index them by position.
cho = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'  # 19 initial consonants
jung = 'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ'  # 21 medial vowels
# 27 final consonants; consonant clusters are spelled with two jamo each.
jong = [
    'ㄱ', 'ㄲ', 'ㄱㅅ', 'ㄴ', 'ㄴㅈ', 'ㄴㅎ', 'ㄷ', 'ㄹ', 'ㄹㄱ',
    'ㄹㅁ', 'ㄹㅂ', 'ㄹㅅ', 'ㄹㅌ', 'ㄹㅍ', 'ㄹㅎ', 'ㅁ', 'ㅂ', 'ㅂㅅ',
    'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
]
# All table entries concatenated into a single string.
test = ''.join([cho, jung] + jong)

# Combined size of the three tables.
hangul_length = sum(map(len, (cho, jung, jong)))  # 67
28 +
29 +
def is_valid_decomposition_atom(x):
    """Return True if *x* is a jamo (or final-consonant cluster) from the tables.

    ``test`` is one long string, so a bare ``x in test`` is a substring
    search: it accepts the empty string and arbitrary runs that merely happen
    to be adjacent in the tables (e.g. 'ㄱㄲ' or 'ㅎㅏ'). Only a single
    character found in ``test`` or an exact ``jong`` entry is accepted here.

    Args:
        x: Candidate string.

    Returns:
        bool: whether *x* is one of the decomposition units.
    """
    if x in jong:
        # Exact table entry, including two-jamo clusters such as 'ㄱㅅ'.
        return True
    return len(x) == 1 and x in test
32 +
33 +
def decompose(x):
    """Split one precomposed Hangul syllable into its jamo.

    Args:
        x: Unicode code point (int) of the character to decompose.

    Returns:
        str: for a code point in '가'..'힣', the initial, medial and (when
        present) final jamo concatenated; for anything else, ``chr(x)``.
    """
    first, last = ord('가'), ord('힣')
    if not first <= x <= last:
        return chr(x)
    # A syllable's offset from '가' is (initial * 21 + medial) * 28 + final,
    # where final == 0 means "no final consonant"; jong therefore starts at
    # final == 1.
    rest, final = divmod(x - first, 28)
    initial, medial = divmod(rest, 21)
    # The largest offset is ord('힣') - ord('가') == 11171, so initial is at
    # most 11171 // 28 // 21 == 18 and always a valid index into cho; no
    # range guard is needed.
    tail = jong[final - 1] if final else ''
    return cho[initial] + jung[medial] + tail
48 +
49 +
def decompose_as_one_hot(in_char, warning=True):
    """Map one code point to its index (or indices) in the one-hot vocabulary.

    Index layout (251 slots, 0..250):
        0..18     initial consonants (``cho``)
        19..39    medial vowels (``jung``)
        40..66    final consonants (``jong``)
        67..194   ASCII, code points 0..127
        195..245  standalone jamo 'ㄱ'..'ㅣ'
        246..249  '♡', '♥', '★', '☆'
        250       any other character

    Args:
        in_char: Unicode code point (int) of the character.
        warning: When true, print a message for a character that falls into
            the "any other character" slot.

    Returns:
        list[int]: two or three indices (initial, medial, optional final) for
        a precomposed syllable in '가'..'힣'; otherwise a one-element list.
    """
    syllable_first = ord('가')
    if syllable_first <= in_char <= ord('힣'):
        # Offset from '가' is (initial * 21 + medial) * 28 + final; final == 0
        # means the syllable has no final consonant. initial never exceeds 18,
        # so it is always a valid index.
        rest, final = divmod(in_char - syllable_first, 28)
        initial, medial = divmod(rest, 21)
        indices = [initial, len(cho) + medial]
        if final > 0:
            indices.append(len(cho) + len(jung) + (final - 1))
        return indices

    jamo_first = ord('ㄱ')
    jamo_last = ord('ㅣ')
    ascii_base = hangul_length                              # 67
    jamo_base = ascii_base + 128                            # 195
    symbol_base = jamo_base + (jamo_last - jamo_first + 1)  # 246

    if in_char < 128:
        return [ascii_base + in_char]
    if jamo_first <= in_char <= jamo_last:
        return [jamo_base + (in_char - jamo_first)]

    symbols = (ord('♡'), ord('♥'), ord('★'), ord('☆'))
    if in_char in symbols:
        return [symbol_base + symbols.index(in_char)]

    if warning:
        print('Unhandled character:', chr(in_char), in_char)
    # Shared slot for every character not covered above.
    return [symbol_base + len(symbols)]
94 +
95 +
def decompose_str(string):
    """Decompose every character of *string* and return the pieces joined.

    Each character is converted to its code point and passed to
    ``decompose``; the resulting strings are concatenated in order.
    """
    pieces = (decompose(code_point) for code_point in map(ord, string))
    return ''.join(pieces)
98 +
99 +
def decompose_str_as_one_hot(string, warning=True):
    """Encode *string* as one flat list of one-hot indices.

    Every character is converted to its code point and passed to
    ``decompose_as_one_hot`` (forwarding *warning*); the per-character index
    lists are concatenated in order.
    """
    return [
        index
        for char in string
        for index in decompose_as_one_hot(ord(char), warning=warning)
    ]