tokenize_stastics.py
import torch
from tqdm import tqdm
from util.data_loader import ArticleDataset, ToTensor
from torch.utils.data import DataLoader
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer

max_len = 1024  # maximum sequence length used when tokenizing articles

ctx = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(ctx)

# load the pretrained KoGPT2 model, its vocab and the SentencePiece tokenizer
tokenizer_path = get_tokenizer(cachedir='/code/model')
model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir='/code/model')
tokenizer = SentencepieceTokenizer(tokenizer_path, num_best=0, alpha=0)

# ToTensor tokenizes each article; the dataset then yields (data, original_len) pairs
transform = ToTensor(tokenizer, vocab, max_len=max_len)

batch_size = 64
trainset = DataLoader(
    ArticleDataset('/dataset', label='train', transform=transform, use_cache=False),
    batch_size=batch_size, num_workers=32, shuffle=True)
# bucket upper bounds (256, 512, 768): count articles whose token length fits within each
count_dict = {bound: 0 for bound in range(256, max_len, 256)}
for (data, original_len) in tqdm(trainset):
    original_len = original_len.to(device)  # .to() is not in-place; assign the result
    for bound in count_dict:
        # articles in this batch whose original token length fits within the bound
        count_dict[bound] += torch.sum(original_len <= bound).item()
# report, for each bound, the fraction of training articles that fit within it
for bound in count_dict:
    print(f"count[{bound}]: {count_dict[bound]}/{len(trainset.dataset)} ({100*count_dict[bound]/len(trainset.dataset):.1f}%)")