# jamo.py 1.57 KB
import re 
from soynlp.hangle import compose, decompose, character_is_korean 


# Matches any run of one or more whitespace characters.
# Written as a raw string so that ``\s`` reaches the regex engine verbatim
# instead of relying on Python passing through an unrecognized string escape
# (which triggers an "invalid escape sequence" warning).
doublespace_pattern = re.compile(r'\s+')

def jamo_sentence(sent):
    """Rewrite the Korean characters of *sent* as their decomposed parts.

    Every character for which ``character_is_korean`` is true is replaced by
    the components returned by ``decompose``, joined into one string with any
    ``' '`` component written as ``'-'``. All other characters are kept
    unchanged. Finally, each run of whitespace in the result is collapsed to
    a single space.

    NOTE(review): ``decompose`` presumably yields the (initial, medial, final)
    jamo with ``' '`` marking an absent part -- confirm against soynlp.
    """
    def to_jamo(syllable):
        # Spaces are handed back untouched.
        if syllable == ' ':
            return syllable

        parts = decompose(syllable)
        # A one-element result is returned exactly as decompose produced it.
        if len(parts) == 1:
            return parts

        # '-' stands in for a blank component so it survives the whitespace
        # collapsing done below.
        return ''.join('-' if part == ' ' else part for part in parts)

    pieces = [
        to_jamo(ch) if character_is_korean(ch) else ch
        for ch in sent
    ]
    return doublespace_pattern.sub(' ', ''.join(pieces))
        
def jamo_to_word(jamo):
    """Rebuild text from a jamo string by composing three-character groups.

    The input is scanned left to right. A character for which
    ``character_is_korean`` is true starts a group of up to three characters;
    any other character is taken on its own. Each full three-character group
    is passed to ``compose`` as ``(first, second, third)``, with a third
    character of ``'-'`` replaced by ``' '``. Single characters are copied to
    the output unchanged.

    A group cut short by the end of the input (fewer than three characters)
    is also copied unchanged. Previously a one-character tail was copied but
    a two-character tail raised ``IndexError``.

    NOTE(review): the three characters are presumably the initial, medial and
    final jamo expected by soynlp's ``compose`` -- confirm against soynlp.

    Args:
        jamo: String of jamo groups mixed with non-Korean characters.

    Returns:
        The recomposed string.
    """
    groups, idx = [], 0
    while idx < len(jamo):
        if character_is_korean(jamo[idx]):
            groups.append(jamo[idx:idx + 3])
            idx += 3
        else:
            groups.append(jamo[idx])
            idx += 1

    pieces = []
    for group in groups:
        if len(group) < 3:
            # Either a single non-Korean character or an incomplete trailing
            # group; neither can be composed, so keep the text as-is.
            pieces.append(group)
            continue
        first, second, third = group
        # '-' is the placeholder for a blank third component.
        pieces.append(compose(first, second, " " if third == "-" else third))

    # Join once instead of growing a string with repeated +=.
    return "".join(pieces)

def break_char(jamo_sentence):
    """Split a jamo string into a list of per-character units.

    Scanning left to right, a character for which ``character_is_korean`` is
    true starts a slice of up to three characters (shorter only at the end of
    the input); any other character becomes a unit on its own.

    Returns:
        The list of units, in input order.
    """
    units = []
    pos = 0
    total = len(jamo_sentence)

    while pos < total:
        head = jamo_sentence[pos]
        if character_is_korean(head):
            # Korean-led group: take this character and the next two.
            units.append(jamo_sentence[pos:pos + 3])
            pos += 3
            continue
        units.append(head)
        pos += 1

    return units