# embedding_maker.py
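"""Utilities to train FastText embeddings and to encode and pad sequences."""
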
__all__ = [
    'create_embeddings', 'load_embedding', 'load_vocab',
    'encoding_and_padding', 'get_embedding_model'
]

import bz2
import json
import os

import numpy as np
import pkg_resources
from gensim.models import FastText
from tqdm import tqdm

from utils.jamo_utils import jamo_sentence, jamo_to_word
from utils.spacing_utils import sent_to_spacing_chars


def pad_sequences(sequences,
                  maxlen=None,
                  dtype='int32',
                  padding='pre',
                  truncating='pre',
                  value=0.):
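    """Pad each sequence to the same length, Keras-style.

    Sequences longer than `maxlen` are truncated on the `truncating` side;
    shorter ones are padded with `value` on the `padding` side. Returns a
    numpy array of shape (num_samples, maxlen) + sample_shape.
    """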

    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # Take the sample shape from the first non-empty sequence;
    # consistency is checked in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' %
                             truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError(
                'Shape of sample %s of sequence at position %s is different from expected shape %s'
                % (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
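
# A minimal usage sketch for pad_sequences (values illustrative):
#   >>> pad_sequences([[1, 2, 3], [4, 5]], maxlen=4, padding='pre')
#   array([[0, 1, 2, 3],
#          [0, 0, 4, 5]], dtype=int32)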


def create_embeddings(data_dir,
                      model_file,
                      embeddings_file,
                      vocab_file,
                      splitc=' ',
                      **params):
    """
    making embedding from files.
    :**params additional Word2Vec() parameters
    :splitc   char for splitting in  data_dir files
    :model_file output object from Word2Vec()
    :data_dir data dir to be process
    :embeddings_file numpy object file path from Word2Vec()
    :vocab_file item to index json dictionary
    """
    class SentenceGenerator(object):
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                print("processing~  '{}'".format(fname))
                for line in bz2.open(os.path.join(self.dirname, fname), "rt"):
                    yield sent_to_spacing_chars(line.strip()).split(splitc)

    sentences = SentenceGenerator(data_dir)

    # Train FastText on the corpus and persist the model.
    model = FastText(sentences, **params)
    model.save(model_file)
    weights = model.wv.syn0  # gensim < 4.0; use model.wv.vectors in 4.x
    # Append a mean (default/unknown) vector and an all-zero padding vector.
    default_vec = np.mean(weights, axis=0, keepdims=True)
    padding_vec = np.zeros((1, weights.shape[1]))

    weights_default = np.concatenate([weights, default_vec, padding_vec],
                                     axis=0)

    # A file object keeps np.save from appending '.npy' to the given name.
    np.save(open(embeddings_file, 'wb'), weights_default)

    # gensim < 4.0 vocabulary API; use model.wv.key_to_index in 4.x.
    vocab = {k: v.index for k, v in model.wv.vocab.items()}
    vocab['__PAD__'] = weights_default.shape[0] - 1
    with open(vocab_file, 'w') as f:
        json.dump(vocab, f)
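
# A hypothetical call (paths and hyperparameters are illustrative; `size` is
# the gensim 3.x keyword, renamed `vector_size` in 4.x):
#   create_embeddings('data/corpus', 'model/fasttext',
#                     'data/embedding/weights.np', 'data/embedding/idx.json',
#                     size=100, window=5, min_count=5)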


def load_embedding(embeddings_file):
    return np.load(embeddings_file)


def load_vocab(vocab_path):
    with open(vocab_path, 'r') as f:
        word2idx = json.load(f)
    idx2word = {v: k for k, v in word2idx.items()}
    return word2idx, idx2word
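
# Example round trip (contents illustrative): if idx.json holds {"김": 0},
#   word2idx, idx2word = load_vocab('data/embedding/idx.json')
#   word2idx['김'] == 0 and idx2word[0] == '김'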

def get_similar_char(word2idx_dic, model, jamo_model, text, try_cnt, OOV_CNT,
                     HIT_CNT):
    """Resolve an out-of-vocabulary token via the jamo-level FastText model.

    Decomposes `text` into jamo, takes its `try_cnt` nearest neighbours, and
    returns the first neighbour that recomposes to an in-vocabulary word.
    Falls back to the character-level model's nearest neighbour otherwise.
    """
    OOV_CNT += 1
    jamo_text = jamo_sentence(text)
    similar_list = jamo_model.wv.most_similar(jamo_text, topn=try_cnt)
    for char, _ in similar_list:
        result = jamo_to_word(char)
        if result in word2idx_dic:
            HIT_CNT += 1
            return OOV_CNT, HIT_CNT, result
    # No in-vocabulary neighbour was found.
    return OOV_CNT, HIT_CNT, model.wv.most_similar(text)[0][0]


def encoding_and_padding(word2idx_dic, sequences, **params):
    """
    Map items to indices, then pad.

    1. Convert each item to its index (resolving OOV items via
       get_similar_char).
    2. Pad the index sequences with pad_sequences.

    :param word2idx_dic: item-to-index dictionary (see load_vocab)
    :param sequences: list of lists where each element is a sequence
    :param maxlen: int, maximum length
    :param dtype: type to cast the resulting sequences to
    :param padding: 'pre' or 'post', pad either before or after each sequence
    :param truncating: 'pre' or 'post', remove values from sequences longer
        than maxlen either at the beginning or at the end of the sequence
    :param value: float, padding value (overridden with the '__PAD__' index)
    """
    # NOTE: the FastText models are loaded from hard-coded relative paths.
    model_file = 'model/fasttext'
    jamo_model_path = 'jamo_model/fasttext'
    print('seq_idx start')
    model = FastText.load(model_file)
    jamo_model = FastText.load(jamo_model_path)
    seq_idx = []
    OOV_CNT = 0
    HIT_CNT = 0
    TOTAL_CNT = 0
    
    for word in tqdm(sequences):
        temp = []
        for char in word:
            TOTAL_CNT += 1
            if char in word2idx_dic:
                temp.append(word2idx_dic[char])
            else:
                OOV_CNT, HIT_CNT, result = get_similar_char(
                    word2idx_dic, model, jamo_model, char, 3, OOV_CNT,
                    HIT_CNT)
                # The fallback may itself be OOV; map it to '__PAD__' then.
                temp.append(word2idx_dic.get(result, word2idx_dic['__PAD__']))
        seq_idx.append(temp)
    print('TOTAL CNT:', TOTAL_CNT, 'OOV CNT:', OOV_CNT, 'HIT CNT:', HIT_CNT)
    if OOV_CNT > 0:
        print('OOV RATE:', float(OOV_CNT) / TOTAL_CNT * 100, '%',
              'HIT RATE:', float(HIT_CNT) / OOV_CNT * 100, '%')
    
    params['value'] = word2idx_dic['__PAD__']
    return pad_sequences(seq_idx, **params)
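
# A hypothetical end-to-end call (the FastText models must exist on disk):
#   word2idx, _ = load_vocab('data/embedding/idx.json')
#   padded = encoding_and_padding(word2idx, [['안', '녕'], ['반', '갑', '다']],
#                                 maxlen=10, padding='post', truncating='post')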


def get_embedding_model(name='fee_prods', path='data/embedding'):
    weights = pkg_resources.resource_filename(
        'dsc', os.path.join(path, name, 'weights.np'))
    w2idx = pkg_resources.resource_filename(
        'dsc', os.path.join(path, name, 'idx.json'))
    return load_embedding(weights), load_vocab(w2idx)[0]
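
# Example (assumes the packaged 'dsc' data resources are installed):
#   weights, word2idx = get_embedding_model(name='fee_prods')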