Showing
2 changed files
with
105 additions
and
0 deletions
movie/setpu.py
0 → 100644
File mode changed
utill/kor_char_parser.py
0 → 100644
| 1 | +# -*- coding: utf-8 -*- | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +Copyright 2018 NAVER Corp. | ||
| 5 | +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and | ||
| 6 | +associated documentation files (the "Software"), to deal in the Software without restriction, including | ||
| 7 | +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 8 | +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to | ||
| 9 | +the following conditions: | ||
| 10 | +The above copyright notice and this permission notice shall be included in all copies or substantial | ||
| 11 | +portions of the Software. | ||
| 12 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | ||
| 13 | +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | ||
| 14 | +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | ||
| 15 | +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF | ||
| 16 | +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 17 | +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 18 | +""" | ||
| 19 | + | ||
# Initial consonants (choseong), in Unicode syllable order: 19 entries.
cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
# Medial vowels (jungseong), in Unicode syllable order: 21 entries.
jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"
# Final consonants (jongseong): 27 entries.  Compound finals are spelled as
# two compatibility jamo, so this has to be a list rather than a string.
jong = [
    "ㄱ", "ㄲ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ",
    "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ",
    "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
]
# Every decomposition atom concatenated into a single string.
test = "".join([cho, jung, *jong])

# Number of jamo slots at the start of the one-hot layout: 19 + 21 + 27 = 67.
hangul_length = sum(len(group) for group in (cho, jung, jong))
| 28 | + | ||
| 29 | + | ||
def is_valid_decomposition_atom(x):
    """Return True when ``x`` occurs in the concatenated jamo string ``test``.

    ``test`` is a plain string, so this is a substring check: single jamo and
    the two-jamo compound finals match, and so does the empty string.
    """
    return test.find(x) >= 0
| 32 | + | ||
| 33 | + | ||
def decompose(x):
    """Split a precomposed Hangul syllable code point into its jamo.

    Args:
        x: Integer Unicode code point of a single character.

    Returns:
        For a code point in '가'..'힣', the string made of its initial
        consonant, vowel and (if present) final consonant, looked up in
        ``cho``, ``jung`` and ``jong``.  Any other code point is returned
        unchanged as ``chr(x)``.
    """
    if x < ord('가') or x > ord('힣'):
        return chr(x)
    # Syllables are laid out as (initial * 21 + vowel) * 28 + final,
    # where final == 0 means "no final consonant".
    initial_vowel, final = divmod(x - ord('가'), 28)
    initial, vowel = divmod(initial_vowel, 21)
    # ``jong`` has no entry for "no final", hence the shift by one.
    tail = jong[final - 1] if final > 0 else ''
    # The range check above caps ``initial`` at 18, so it always indexes
    # ``cho`` (19 entries) safely; no further bounds check is needed.
    return cho[initial] + jung[vowel] + tail
| 48 | + | ||
| 49 | + | ||
def decompose_as_one_hot(in_char, warning=True):
    """Map one code point to its index (or indices) in the one-hot layout.

    Layout, 251 slots in total (with ``hangul_length`` == 67):
        [0, 66]     jamo of decomposed syllables: initials, vowels, finals
        [67, 194]   ASCII code points 0..127
        [195, 245]  standalone compatibility jamo 'ㄱ'..'ㅣ' (51 code points)
        [246, 249]  the special characters ♡, ♥, ★, ☆ (in that order)
        250         any other character

    Args:
        in_char: Integer Unicode code point of a single character.
        warning: When true, print a message for characters that fall into
            the "any other character" slot.

    Returns:
        A list of indices: two or three entries for a precomposed Hangul
        syllable ('가'..'힣'), exactly one entry for everything else.
    """
    if ord('가') <= in_char <= ord('힣'):  # 가: 44032, 힣: 55203
        # Syllables are laid out as (initial * 21 + vowel) * 28 + final,
        # where final == 0 means "no final consonant".
        initial_vowel, final = divmod(in_char - ord('가'), 28)
        initial, vowel = divmod(initial_vowel, 21)
        # The range check caps ``initial`` at 18, so it is always a valid
        # initial-consonant index and needs no extra bounds check.
        one_hot = [initial, len(cho) + vowel]
        if final > 0:
            # Finals are 1-based in the syllable layout; slot 0 of the
            # final section therefore corresponds to final == 1.
            one_hot.append(len(cho) + len(jung) + (final - 1))
        return one_hot

    if in_char < 128:
        return [hangul_length + in_char]  # 67..194

    jamo_base = hangul_length + 128  # 195
    if ord('ㄱ') <= in_char <= ord('ㅣ'):  # ㄱ: 12593, ㅣ: 12643 (51 values)
        return [jamo_base + (in_char - ord('ㄱ'))]  # 195..245

    special_base = jamo_base + 51  # 246
    special_codes = [ord('♡'), ord('♥'), ord('★'), ord('☆')]
    if in_char in special_codes:
        return [special_base + special_codes.index(in_char)]  # 246..249

    if warning:
        print('Unhandled character:', chr(in_char), in_char)
    return [special_base + 4]  # 250: shared slot for unknown characters
| 94 | + | ||
| 95 | + | ||
def decompose_str(string):
    """Return ``string`` with every character replaced by ``decompose`` output.

    Each character is converted to its code point, passed to ``decompose``,
    and the resulting pieces are concatenated in order.
    """
    pieces = []
    for code_point in map(ord, string):
        pieces.append(decompose(code_point))
    return ''.join(pieces)
| 98 | + | ||
| 99 | + | ||
def decompose_str_as_one_hot(string, warning=True):
    """Return the flat list of one-hot indices for every character of ``string``.

    Each character's code point is passed to ``decompose_as_one_hot`` (with
    the given ``warning`` flag) and the per-character index lists are
    concatenated in order.
    """
    return [
        index
        for char in string
        for index in decompose_as_one_hot(ord(char), warning=warning)
    ]
-
Please register or login to post a comment