# coding=utf-8
# Copyright 2019 SK T-Brain Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = '0.1.1'
# coding=utf-8
# Copyright 2019 SK T-Brain Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mxnet as mx
import gluonnlp as nlp
from gluonnlp.model import BERTModel, BERTEncoder
from .utils import download as _download
from .utils import tokenizer
mxnet_kobert = {
'url':
'https://kobert.blob.core.windows.net/models/kobert/mxnet/mxnet_kobert_45b6957552.params',
'fname': 'mxnet_kobert_45b6957552.params',
'chksum': '45b6957552'
}
def get_mxnet_kobert_model(use_pooler=True,
                           use_decoder=True,
                           use_classifier=True,
                           ctx=mx.cpu(0),
                           cachedir='~/kobert/'):
    """Download the pretrained KoBERT checkpoint and vocab if needed, then
    load them as a gluonnlp BERTModel / BERTVocab pair."""
    # download model
model_info = mxnet_kobert
model_path = _download(model_info['url'],
model_info['fname'],
model_info['chksum'],
cachedir=cachedir)
# download vocab
vocab_info = tokenizer
vocab_path = _download(vocab_info['url'],
vocab_info['fname'],
vocab_info['chksum'],
cachedir=cachedir)
return get_kobert_model(model_path, vocab_path, use_pooler, use_decoder,
use_classifier, ctx)
def get_kobert_model(model_file,
                     vocab_file,
                     use_pooler=True,
                     use_decoder=True,
                     use_classifier=True,
                     ctx=mx.cpu(0)):
    """Build the KoBERT architecture in gluonnlp, load the weights from
    model_file, and return (BERTModel, BERTVocab)."""
vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
padding_token='[PAD]')
predefined_args = {
'attention_cell': 'multi_head',
'num_layers': 12,
'units': 768,
'hidden_size': 3072,
'max_length': 512,
'num_heads': 12,
'scaled': True,
'dropout': 0.1,
'use_residual': True,
'embed_size': 768,
'embed_dropout': 0.1,
'token_type_vocab_size': 2,
'word_embed': None,
}
encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
num_layers=predefined_args['num_layers'],
units=predefined_args['units'],
hidden_size=predefined_args['hidden_size'],
max_length=predefined_args['max_length'],
num_heads=predefined_args['num_heads'],
scaled=predefined_args['scaled'],
dropout=predefined_args['dropout'],
output_attention=False,
output_all_encodings=False,
use_residual=predefined_args['use_residual'])
    # assemble the full BERT model on top of the encoder
net = BERTModel(
encoder,
len(vocab_b_obj.idx_to_token),
token_type_vocab_size=predefined_args['token_type_vocab_size'],
units=predefined_args['units'],
embed_size=predefined_args['embed_size'],
embed_dropout=predefined_args['embed_dropout'],
word_embed=predefined_args['word_embed'],
use_pooler=use_pooler,
use_decoder=use_decoder,
use_classifier=use_classifier)
net.initialize(ctx=ctx)
net.load_parameters(model_file, ctx, ignore_extra=True)
return (net, vocab_b_obj)
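
if __name__ == '__main__':
    # Minimal usage sketch, assuming mxnet and gluonnlp are installed and the
    # checkpoint URL above is reachable; the two-token input is illustrative.
    model, vocab = get_mxnet_kobert_model(use_decoder=False,
                                          use_classifier=False)
    tokens = mx.nd.array([[vocab['[CLS]'], vocab['[SEP]']]])
    token_types = mx.nd.zeros_like(tokens)  # single-segment input
    valid_length = mx.nd.array([2])
    sequence_output, pooled_output = model(tokens, token_types, valid_length)
    print(sequence_output.shape)  # (1, 2, 768)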
# coding=utf-8
# Copyright 2019 SK T-Brain Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import BertModel, BertConfig
import gluonnlp as nlp
from .utils import download as _download
from .utils import tokenizer
pytorch_kobert = {
'url':
'https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params',
'fname': 'pytorch_kobert_2439f391a6.params',
'chksum': '2439f391a6'
}
bert_config = {
'attention_probs_dropout_prob': 0.1,
'hidden_act': 'gelu',
'hidden_dropout_prob': 0.1,
'hidden_size': 768,
'initializer_range': 0.02,
'intermediate_size': 3072,
'max_position_embeddings': 512,
'num_attention_heads': 12,
'num_hidden_layers': 12,
'type_vocab_size': 2,
'vocab_size': 8002
}
def get_pytorch_kobert_model(ctx='cpu', cachedir='~/kobert/'):
    """Download the pretrained KoBERT checkpoint and vocab if needed, then
    load them as a transformers BertModel / gluonnlp BERTVocab pair."""
    # download model
model_info = pytorch_kobert
model_path = _download(model_info['url'],
model_info['fname'],
model_info['chksum'],
cachedir=cachedir)
# download vocab
vocab_info = tokenizer
vocab_path = _download(vocab_info['url'],
vocab_info['fname'],
vocab_info['chksum'],
cachedir=cachedir)
return get_kobert_model(model_path, vocab_path, ctx)
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    """Load KoBERT weights into a transformers BertModel in eval mode and
    return it together with its gluonnlp BERTVocab."""
    device = torch.device(ctx)
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    # map_location keeps CUDA-saved checkpoints loadable on CPU-only hosts
    bertmodel.load_state_dict(torch.load(model_file, map_location=device))
    bertmodel.to(device)
    bertmodel.eval()
vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
padding_token='[PAD]')
return bertmodel, vocab_b_obj
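
if __name__ == '__main__':
    # Minimal usage sketch; the sentence is illustrative only. The vocab file
    # is fetched with the tokenizer metadata already imported from utils.
    tok_path = _download(tokenizer['url'], tokenizer['fname'],
                         tokenizer['chksum'], cachedir='~/kobert/')
    model, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tok_path, vocab, lower=False)
    ids = [vocab[t] for t in ['[CLS]'] + tok('한국어 모델') + ['[SEP]']]
    input_ids = torch.tensor([ids])
    token_type_ids = torch.zeros_like(input_ids)
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=token_type_ids)
    print(outputs[0].shape)  # (1, seq_len, 768): the last hidden states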
# coding=utf-8
# Copyright 2019 SK T-Brain Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import requests
import hashlib
onnx_kobert = {
'url':
'https://kobert.blob.core.windows.net/models/kobert/onnx/onnx_kobert_44529811f0.onnx',
'fname': 'onnx_kobert_44529811f0.onnx',
'chksum': '44529811f0'
}
tokenizer = {
'url':
'https://kobert.blob.core.windows.net/models/kobert/tokenizer/kobert_news_wiki_ko_cased-ae5711deb3.spiece',
'fname': 'kobert_news_wiki_ko_cased-1087f8699e.spiece',
'chksum': 'ae5711deb3'
}
def download(url, filename, chksum, cachedir='~/kobert/'):
    """Download url into cachedir/filename unless a cached copy already
    matches chksum (the first 10 hex digits of the file's MD5)."""
    f_cachedir = os.path.expanduser(cachedir)
    os.makedirs(f_cachedir, exist_ok=True)
    file_path = os.path.join(f_cachedir, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'rb') as f:
            if hashlib.md5(f.read()).hexdigest()[:10] == chksum:
                print('using cached model')
                return file_path
    with open(file_path, 'wb') as f:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        total = response.headers.get('content-length')
        if total is None:
            f.write(response.content)
        else:
            # stream in ~0.1% chunks (at least 1 MiB) and draw a progress bar
            downloaded = 0
            total = int(total)
            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                f.write(data)
                done = int(50 * downloaded / total)
                sys.stdout.write('\r[{}{}]'.format('█' * done,
                                                   '.' * (50 - done)))
                sys.stdout.flush()
            sys.stdout.write('\n')
    # verify integrity of the fresh download
    with open(file_path, 'rb') as f:
        assert chksum == hashlib.md5(f.read()).hexdigest()[:10], \
            'corrupted file!'
    return file_path
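# Usage sketch for download(); the call mirrors what get_tokenizer() below
# does and is illustrative only. Any of the metadata dicts in this package
# can be passed the same way:
#   path = download(tokenizer['url'], tokenizer['fname'],
#                   tokenizer['chksum'], cachedir='~/kobert/')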
def get_onnx(cachedir='~/kobert/'):
"""Get KoBERT ONNX file path after downloading
"""
model_info = onnx_kobert
return download(model_info['url'],
model_info['fname'],
model_info['chksum'],
cachedir=cachedir)
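def _print_onnx_inputs(cachedir='~/kobert/'):
    """Sketch helper (hypothetical, not part of the original API): list the
    ONNX graph's declared inputs instead of hard-coding their names, since
    those depend on how the model was exported. Assumes the onnxruntime
    package is installed."""
    import onnxruntime
    sess = onnxruntime.InferenceSession(get_onnx(cachedir))
    print([(i.name, i.shape) for i in sess.get_inputs()])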
def get_tokenizer(cachedir='~/kobert/'):
"""Get KoBERT Tokenizer file path after downloading
"""
model_info = tokenizer
return download(model_info['url'],
model_info['fname'],
model_info['chksum'],
cachedir=cachedir)
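
if __name__ == '__main__':
    # Usage sketch, assuming gluonnlp is installed: build a SentencePiece
    # tokenizer from the downloaded .spiece model. The sample sentence is
    # illustrative only.
    import gluonnlp as nlp
    tok_path = get_tokenizer()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(tok_path,
                                                   padding_token='[PAD]')
    sp = nlp.data.BERTSPTokenizer(tok_path, vocab, lower=False)
    print(sp('한국어 자연어 처리'))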