# -*- coding: utf-8 -*-
import os
import re

import numpy as np

import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn, rnn

from gensim.models import FastText
from soynlp.hangle import character_is_korean

import kospacing.jamo as jamo
from kospacing.embedding_maker import (encoding_and_padding, load_embedding,
                                       load_vocab)

class korean_autospacing_base(gluon.HybridBlock):
    def __init__(self, n_hidden, vocab_size, embed_dim, max_seq_length,
                 **kwargs):
        super(korean_autospacing_base, self).__init__(**kwargs)
        # input sequence length
        self.in_seq_len = max_seq_length
        # output sequence length
        self.out_seq_len = max_seq_length
        # number of hidden units in the GRU
        self.n_hidden = n_hidden
        # number of unique characters (vocabulary size)
        self.vocab_size = vocab_size
        # maximum sequence length
        self.max_seq_length = max_seq_length
        # embedding dimension
        self.embed_dim = embed_dim

        with self.name_scope():
            self.embedding = nn.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embed_dim)

            # Parallel character n-gram convolutions (kernel heights 1-5).
            # The symmetric padding keeps every branch at or above
            # max_seq_length time steps; the forward pass slices them back.
            self.conv_unigram = nn.Conv2D(channels=128,
                                          kernel_size=(1, self.embed_dim))

            self.conv_bigram = nn.Conv2D(channels=256,
                                         kernel_size=(2, self.embed_dim),
                                         padding=(1, 0))

            self.conv_trigram = nn.Conv2D(channels=128,
                                          kernel_size=(3, self.embed_dim),
                                          padding=(1, 0))

            self.conv_forthgram = nn.Conv2D(channels=64,
                                            kernel_size=(4, self.embed_dim),
                                            padding=(2, 0))

            self.conv_fifthgram = nn.Conv2D(channels=32,
                                            kernel_size=(5, self.embed_dim),
                                            padding=(2, 0))

            self.bi_gru = rnn.GRU(hidden_size=self.n_hidden, layout='NTC', bidirectional=True)
            self.dense_sh = nn.Dense(100, activation='relu', flatten=False)
            self.dense = nn.Dense(1, activation='sigmoid', flatten=False)

    def hybrid_forward(self, F, inputs):
        # (N, T) -> (N, T, embed_dim) -> (N, 1, T, embed_dim) for Conv2D.
        embed = self.embedding(inputs)
        embed = F.expand_dims(embed, axis=1)
        unigram = self.conv_unigram(embed)
        bigram = self.conv_bigram(embed)
        trigram = self.conv_trigram(embed)
        forthgram = self.conv_forthgram(embed)
        fifthgram = self.conv_fifthgram(embed)

        # The even-sized kernels (2 and 4) emit one extra time step because
        # of their symmetric padding; slice every padded branch back to
        # max_seq_length (a no-op for the 5-gram branch) and concatenate
        # along the channel axis.
        grams = F.concat(unigram,
                         F.slice_axis(bigram, axis=2, begin=0,
                                      end=self.max_seq_length),
                         trigram,
                         F.slice_axis(forthgram, axis=2, begin=0,
                                      end=self.max_seq_length),
                         F.slice_axis(fifthgram, axis=2, begin=0,
                                      end=self.max_seq_length),
                         dim=1)

        # (N, C, T, 1) -> (N, T, 1, C) -> (N, T, C); the special value -3
        # tells MXNet's reshape to merge the last two axes.
        grams = F.transpose(grams, (0, 2, 3, 1))
        grams = F.reshape(grams, (-1, self.max_seq_length, -3))
        grams = self.bi_gru(grams)
        fc1 = self.dense_sh(grams)
        return self.dense(fc1)
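
# Shape walk-through of hybrid_forward for a batch of N sequences, with
# T = max_seq_length = 200 (a sketch of the block above, nothing extra):
#   inputs             (N, T)              character indices
#   embedding          (N, T, embed_dim)
#   expand_dims        (N, 1, T, embed_dim)
#   n-gram convs       (N, C_k, ~T, 1)     C_k in {128, 256, 128, 64, 32}
#   concat + slice     (N, 608, T, 1)      608 = 128 + 256 + 128 + 64 + 32
#   transpose/reshape  (N, T, 608)
#   bi_gru             (N, T, 2 * n_hidden)
#   dense_sh -> dense  (N, T, 1)           per-character space probability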
    
    
def break_len(word):
    """Count logical characters in a jamo-decomposed string.

    Korean syllables are assumed to occupy exactly three jamo each (as
    produced by kospacing.jamo), so they advance the index by 3; every
    other character counts as a single code point.
    """
    idx = 0
    cnt = 0
    while idx < len(word):
        if character_is_korean(word[idx]):
            idx += 3
        else:
            idx += 1
        cnt += 1
    return cnt
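
# A minimal example of break_len (illustrative; 'ㅎㅏㄴ' is the jamo
# decomposition of the syllable '한'):
#   break_len('«ㅎㅏㄴa»')  ->  4   # «, 한, a, »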

class pred_spacing:
    def __init__(self, model, w2idx):
        self.model = model
        self.w2idx = w2idx
        self.pattern = re.compile(r'\s+')

    def get_spaced_sent(self, raw_sent):
        # Wrap the sentence in sentinels and encode existing spaces as '^'
        # so the model sees them as ordinary symbols.
        raw_sent_ = '«' + raw_sent + '»'
        raw_sent_ = raw_sent_.replace(' ', '^')
        sents_in = [raw_sent_]
        # `fasttext` is the module-level FastText model loaded below.
        mat_in = encoding_and_padding(word2idx_dic=self.w2idx,
                                      sequences=sents_in,
                                      fasttext=fasttext,
                                      maxlen=200,
                                      padding='post',
                                      truncating='post')
        mat_in = mx.nd.array(mat_in, ctx=mx.cpu(0))
        results = self.model(mat_in)
        mat_set = results[0]

        # Cap at maxlen - 1 so the second-derivative lookahead below stays
        # inside the model's padded output.
        n_char = min(break_len(raw_sent_), mat_set.shape[0] - 1)
        # Log-scale the raw probabilities so that small values are spread
        # out: with r = 255, c = 1/ln(256) ≈ 0.18, the 0.09 threshold below
        # corresponds to a raw probability of only about 0.0025.
        r = 255
        c = 1 / np.log(1 + r)
        log_scaled = c * mx.nd.log(1 + r * mat_set[:n_char])
        # Discrete second derivative of the probability curve; a negative
        # value marks concavity, i.e. a local peak. The leading 1 keeps the
        # first position (the '«' sentinel) from being predicted as a space.
        d_2 = [1]
        for i in range(1, n_char):
            d_2.append(mat_set[i - 1] - (2 * mat_set[i]) + mat_set[i + 1])
        # Predict a space only where the scaled score clears the threshold
        # and the curve is at a local peak.
        preds = np.array(['1' if log_scaled[i] > 0.09 and d_2[i] < 0 else '0'
                          for i in range(n_char)])
        return self.make_pred_sents(raw_sent_, preds)

    def make_pred_sents(self, x_sents, y_pred):
        res_sent = []
        idx_x = 0
        for pred in y_pred:
            # Copy the next logical character: one code point for a
            # non-Korean symbol, three jamo for a Korean syllable.
            if not character_is_korean(x_sents[idx_x]):
                res_sent.append(x_sents[idx_x])
                idx_x += 1
            else:
                res_sent.append(x_sents[idx_x:idx_x + 3])
                idx_x += 3
            if pred == '1':
                res_sent.append(' ')

        # Restore the original spaces, collapse whitespace runs, strip the
        # sentinels, and recompose the jamo into syllables.
        subs = self.pattern.sub(' ', ''.join(res_sent).replace('^', ' '))
        subs = subs.replace('«', '').replace('»', '')
        return jamo.jamo_to_word(subs)
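
# Illustrative trace of make_pred_sents (the prediction vector here is
# hypothetical; 'ㅎㅏㄴㄱㅡㄹ' is the jamo decomposition of '한글'):
#   x_sents = '«ㅎㅏㄴㄱㅡㄹ»', y_pred = ['0', '1', '0', '0']
#   -> '«' + 'ㅎㅏㄴ' + ' ' + 'ㄱㅡㄹ' + '»'
#   -> after sentinel stripping and jamo_to_word: '한 글'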


__all__ = ['spacing']

# Load the vocabulary (character-to-index mapping).
w2idx, idx2w = load_vocab('./kospacing/model/w2idx.dic')
# Load the pretrained embedding matrix.
weights = load_embedding('./kospacing/model/kospacing_wv.np')
vocab_size = weights.shape[0]
embed_dim = weights.shape[1]
model = korean_autospacing_base(n_hidden=200,
                                vocab_size=vocab_size,
                                embed_dim=embed_dim,
                                max_seq_length=200)

# Inference runs on the CPU.
model.load_parameters('./kospacing/model/kospacing.params', ctx=mx.cpu(0))
predictor = pred_spacing(model, w2idx)
fasttext = FastText.load('./kospacing/model/fasttext')


def spacing(sent):
    """Insert spaces into a Korean sentence using the trained model."""
    sent = jamo.jamo_sentence(sent)
    spaced = predictor.get_spaced_sent(sent)
    return spaced.strip()
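

if __name__ == '__main__':
    # Minimal usage sketch; the input is the classic Korean spacing
    # example, whose correct segmentation is '아버지가 방에 들어가신다'.
    print(spacing('아버지가방에들어가신다'))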