Showing
2 changed files
with
105 additions
and
0 deletions
movie/setpu.py
0 → 100644
File mode changed
utill/kor_char_parser.py
0 → 100644
1 | +# -*- coding: utf-8 -*- | ||
2 | + | ||
3 | +""" | ||
4 | +Copyright 2018 NAVER Corp. | ||
5 | +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and | ||
6 | +associated documentation files (the "Software"), to deal in the Software without restriction, including | ||
7 | +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
8 | +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to | ||
9 | +the following conditions: | ||
10 | +The above copyright notice and this permission notice shall be included in all copies or substantial | ||
11 | +portions of the Software. | ||
12 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | ||
13 | +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | ||
14 | +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | ||
15 | +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF | ||
16 | +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
17 | +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
18 | +""" | ||
19 | + | ||
# Initial consonants (choseong), in syllable-block order: 19 entries.
cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
# Medial vowels (jungseong), in syllable-block order: 21 entries.
jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"
# Final consonants (jongseong): 27 entries. Compound finals are spelled
# as two jamo, so the table is a list of strings rather than one string.
jong = "ㄱ/ㄲ/ㄱㅅ/ㄴ/ㄴㅈ/ㄴㅎ/ㄷ/ㄹ/ㄹㄱ/ㄹㅁ/ㄹㅂ/ㄹㅅ/ㄹㅌ/ㄹㅍ/ㄹㅎ/ㅁ/ㅂ/ㅂㅅ/ㅅ/ㅆ/ㅇ/ㅈ/ㅊ/ㅋ/ㅌ/ㅍ/ㅎ".split('/')
# All three tables flattened into a single string.
test = ''.join((cho, jung, *jong))

# Number of index slots taken by the three tables: 19 + 21 + 27 = 67.
hangul_length = sum(len(table) for table in (cho, jung, jong))
28 | + | ||
29 | + | ||
def is_valid_decomposition_atom(x):
    """Report whether *x* occurs in the module-level jamo string ``test``.

    ``test`` is a plain string, so this is a substring check: any run of
    characters that appears contiguously in ``test`` is accepted.
    """
    found = x in test
    return found
32 | + | ||
33 | + | ||
def decompose(x):
    """Split one code point into its Hangul jamo.

    Args:
        x: Integer Unicode code point (for example the result of ``ord``).

    Returns:
        For a code point in the range '가'..'힣': the initial consonant from
        ``cho``, the vowel from ``jung`` and, when the syllable has one, the
        final consonant from ``jong``, concatenated into one string.
        Any other code point is returned unchanged as ``chr(x)``.
    """
    first_syllable = ord('가')
    if x < first_syllable or x > ord('힣'):
        return chr(x)
    # The offset from '가' encodes (initial * 21 + vowel) * 28 + final.
    lead_and_vowel, final = divmod(x - first_syllable, 28)
    lead, vowel = divmod(lead_and_vowel, 21)
    # The range guard above caps `lead` at 18, so it always indexes `cho`.
    # final == 0 means "no final consonant", hence the 1-based lookup.
    tail = jong[final - 1] if final > 0 else ''
    return cho[lead] + jung[vowel] + tail
48 | + | ||
49 | + | ||
def decompose_as_one_hot(in_char, warning=True):
    """Map one code point to its index (or indices) in the character vocabulary.

    Despite the name, the result is a list of integer indices, not a
    one-hot vector. With the 19 + 21 + 27 = 67 jamo defined in this module
    the layout is:

        [0, 66]     jamo of a decomposed syllable ('가'..'힣')
        [67, 194]   code points below 128
        [195, 245]  standalone jamo 'ㄱ'..'ㅣ' (51 code points)
        [246, 249]  '♡', '♥', '★', '☆'
        250         any other character

    That is 251 distinct indices in total.

    Args:
        in_char: Integer Unicode code point (for example the result of ``ord``).
        warning: When true, print a message for a character that ends up
            in the "any other character" slot.

    Returns:
        A list of ints: two or three entries (initial, vowel, optional
        final) for a syllable, exactly one entry otherwise.
    """
    first_syllable = ord('가')
    if first_syllable <= in_char <= ord('힣'):
        # The offset from '가' encodes (initial * 21 + vowel) * 28 + final.
        lead_and_vowel, final = divmod(in_char - first_syllable, 28)
        lead, vowel = divmod(lead_and_vowel, 21)
        indices = [lead, len(cho) + vowel]
        # final == 0 means the syllable has no final consonant.
        if final > 0:
            indices.append(len(cho) + len(jung) + (final - 1))
        return indices

    first_jamo, last_jamo = ord('ㄱ'), ord('ㅣ')
    ascii_base = hangul_length
    jamo_base = ascii_base + 128
    special_base = jamo_base + (last_jamo - first_jamo + 1)
    # Position in this tuple is the offset from special_base.
    special_codes = (ord('♡'), ord('♥'), ord('★'), ord('☆'))

    if in_char < 128:
        return [ascii_base + in_char]
    if first_jamo <= in_char <= last_jamo:
        return [jamo_base + (in_char - first_jamo)]
    if in_char in special_codes:
        return [special_base + special_codes.index(in_char)]

    if warning:
        print('Unhandled character:', chr(in_char), in_char)
    # Everything else shares the single slot after the special characters.
    return [special_base + len(special_codes)]
94 | + | ||
95 | + | ||
def decompose_str(string):
    """Apply ``decompose`` to every character of *string* and join the results."""
    code_points = map(ord, string)
    return ''.join(map(decompose, code_points))
98 | + | ||
99 | + | ||
def decompose_str_as_one_hot(string, warning=True):
    """Return the vocabulary indices of every character of *string*, flattened.

    Each character is passed through ``decompose_as_one_hot`` (with the
    same *warning* flag) and the per-character index lists are
    concatenated in order.
    """
    return [
        index
        for char in string
        for index in decompose_as_one_hot(ord(char), warning=warning)
    ]
-
Please register or login to post a comment