final 1st

신은섭(Shin Eun Seop)
Commit 305a0a0f0d0f143f30c98690d2d5c68f08da4778 305a0a0f 1 parent 6c0ef0fd
Showing 8 changed files with 536 additions and 0 deletions
movie2/dataset.py
movie2/doc2vec.model
movie2/embadding.py
movie2/kor_char_parser.py
movie2/main.py
movie2/setup.py
movie2/test.txt
movie2/text_helpers.py
--- a/movie2/dataset.py 0 → 100644
View file @305a0a0
+++ b/movie2/dataset.py 0 → 100644
View file @305a0a0
+ """
+ kin dataset 
+ """
+ 
+ import os
+ import numpy as np
+ # from kor_char_parser import decompose_str_as_one_hot
+ 
+ import text_helpers
+ from konlpy.tag import Twitter
+ pos_tagger = Twitter()
+ 
+ class KinQueryDataset:
+     """
+         지식인 데이터를 읽어서, tuple (데이터, 레이블)의 형태로 리턴하는 파이썬 오브젝트 입니다.
+     """
+     def __init__(self, dataset_path: str, max_length: int):
+         """
+         :param dataset_path: 데이터셋 root path
+         :param max_length: 문자열의 최대 길이
+         """
+         # 데이터, 레이블 각각의 경로
+         queries_path = os.path.join(dataset_path, 'train', 'train_data')
+         labels_path = os.path.join(dataset_path, 'train', 'train_label')
+ 
+         # 지식인 데이터를 읽고 preprocess까지 진행합니다
+         with open(queries_path, 'rt', encoding='utf8') as f:
+             self.queries = preprocess(f.readlines(), max_length)
+         # 지식인 레이블을 읽고 preprocess까지 진행합니다.
+         with open(labels_path) as f:
+             self.labels = np.array([[np.float32(x)] for x in f.readlines()])
+ 
+     def __len__(self):
+         """
+         :return: 전체 데이터의 수를 리턴합니다
+         """
+         return len(self.queries)
+ 
+     def __getitem__(self, idx):
+         """
+         :param idx: 필요한 데이터의 인덱스
+         :return: 인덱스에 맞는 데이터, 레이블 pair를 리턴합니다
+         """
+         return self.queries[idx], self.labels[idx]
+ 
+ def tokenize(doc):
+     # norm, stem은 optional
+     return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
+ 
+ def preprocess(data: list, max_length: int):
+     # NOTE(review): this function looks unfinished. It has no return statement, so
+     # callers receive None (KinQueryDataset.__init__ stores the result in
+     # self.queries), and max_length is accepted but never used.
+     # NOTE(review): tokenize() is applied to row[0] and row[1]. If rows are the raw
+     # text lines passed in from readlines(), these are single characters -- confirm
+     # whether each row was meant to be split into two fields first.
+     train_docs = [(tokenize(row[0]), tokenize(row[1])) for row in data]
+ 
--- a/movie2/doc2vec.model 0 → 100644
View file @305a0a0
+++ b/movie2/doc2vec.model 0 → 100644
View file @305a0a0
--- a/movie2/embadding.py 0 → 100644
View file @305a0a0
+++ b/movie2/embadding.py 0 → 100644
View file @305a0a0
+ # -*- coding: utf-8 -*-
+ from konlpy.corpus import kolaw
+ def read_data(filename):
+     with open(filename, 'r') as f:
+         data = [line.split('\t') for line in f.read().splitlines()]
+         data = data[1:]   # header 제외
+     return data
+ 
+ # Load the text of 'constitution.txt' from the konlpy kolaw corpus.
+ train_data = kolaw.open('constitution.txt').read()
+ 
+ # NOTE(review): the "nrows" remark below looks carried over from a row-based dataset.
+ # If .read() returns the whole text as one string, len() counts characters and
+ # train_data[0] is a single character -- confirm.
+ print(len(train_data))      # nrows: 150000
+ print(len(train_data[0]))
+ 
+ from konlpy.tag import Twitter
+ # Shared tagger instance used by tokenize()
+ pos_tagger = Twitter()
+ 
+ def tokenize(doc):
+     # norm, stem은 optional
+     return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
+ 
+ # Build (tokens, tag) pairs; every document gets the same tag '0'.
+ # NOTE(review): the loop iterates train_data directly and tokenizes row[0]. If
+ # train_data is a single string, each row is one character -- confirm the intent.
+ train_docs = []
+ for row in train_data:
+     train_docs.append((tokenize(row[0]), '0'))
+     # train_docs.append((tokenize(row[1]), '0'))
+ 
+ # Check that the data went in correctly
+ from pprint import pprint
+ pprint(train_docs[0])
+ 
+ from gensim.models.doc2vec import TaggedDocument
+ # Wrap each (tokens, tag) pair in the structure Doc2Vec trains on
+ tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
+ 
+ from gensim.models import doc2vec
+ import multiprocessing
+ # One worker per CPU core
+ cores = multiprocessing.cpu_count()
+ 
+ # Build the vocabulary
+ # NOTE(review): alpha and min_alpha are both 0.025 here -- confirm that a constant
+ # learning rate is intended.
+ doc_vectorizer = doc2vec.Doc2Vec(vector_size=1000, alpha=0.025, min_alpha=0.025, seed=1234, epochs=100, workers=cores, hs=1)
+ doc_vectorizer.build_vocab(tagged_train_docs)
+ doc_vectorizer.train(tagged_train_docs, epochs=doc_vectorizer.epochs, total_examples=doc_vectorizer.corpus_count)
+ 
+ # To save
+ doc_vectorizer.save('doc2vec.model')
+ 
+ # Reload the saved model and print the nearest neighbours of one token as a check
+ doc_vectorizer = doc2vec.Doc2Vec.load('doc2vec.model')
+ pprint(doc_vectorizer.wv.most_similar('한국/Noun'))
--- a/movie2/kor_char_parser.py 0 → 100644
View file @305a0a0
+++ b/movie2/kor_char_parser.py 0 → 100644
View file @305a0a0
+ # -*- coding: utf-8 -*-
+ 
+ """
+ Copyright 2018 NAVER Corp.
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ associated documentation files (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ The above copyright notice and this permission notice shall be included in all copies or substantial
+ portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ """
+ 
+ # Jamo tables used to decompose a precomposed Hangul syllable:
+ # cho = initial consonants, jung = vowels, jong = final consonants.
+ cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"  # len = 19
+ jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"  # len = 21
+ # len = 27
+ # jong is a list (not a string) because some finals consist of two jamo
+ jong = "ㄱ/ㄲ/ㄱㅅ/ㄴ/ㄴㅈ/ㄴㅎ/ㄷ/ㄹ/ㄹㄱ/ㄹㅁ/ㄹㅂ/ㄹㅅ/ㄹㅌ/ㄹㅍ/ㄹㅎ/ㅁ/ㅂ/ㅂㅅ/ㅅ/ㅆ/ㅇ/ㅈ/ㅊ/ㅋ/ㅌ/ㅍ/ㅎ".split(
+     '/')
+ # All jamo concatenated into one string; searched by is_valid_decomposition_atom
+ test = cho + jung + ''.join(jong)
+ 
+ # Number of one-hot slots taken by the three tables together
+ hangul_length = len(cho) + len(jung) + len(jong)  # 67
+ 
+ 
+ def is_valid_decomposition_atom(x):
+     return x in test
+ 
+ 
+ def decompose(x):
+     in_char = x
+     if x < ord('가') or x > ord('힣'):
+         return chr(x)
+     x = x - ord('가')
+     y = x // 28
+     z = x % 28
+     x = y // 21
+     y = y % 21
+     # if there is jong, then is z > 0. So z starts from 1 index.
+     zz = jong[z - 1] if z > 0 else ''
+     if x >= len(cho):
+         print('Unknown Exception: ', in_char, chr(in_char), x, y, z, zz)
+     return cho[x] + jung[y] + zz
+ 
+ 
+ def decompose_as_one_hot(in_char, warning=True):
+     one_hot = []
+     # print(ord('ㅣ'), chr(0xac00))
+     # [0,66]: hangul / [67,194]: ASCII / [195,245]: hangul danja,danmo / [246,249]: special characters
+     # Total 250 dimensions.
+     if ord('가') <= in_char <= ord('힣'):  # 가:44032 , 힣: 55203
+         x = in_char - 44032  # in_char - ord('가')
+         y = x // 28
+         z = x % 28
+         x = y // 21
+         y = y % 21
+         # if there is jong, then is z > 0. So z starts from 1 index.
+         zz = jong[z - 1] if z > 0 else ''
+         if x >= len(cho):
+             if warning:
+                 print('Unknown Exception: ', in_char,
+                       chr(in_char), x, y, z, zz)
+ 
+         one_hot.append(x)
+         one_hot.append(len(cho) + y)
+         if z > 0:
+             one_hot.append(len(cho) + len(jung) + (z - 1))
+         return one_hot
+     else:
+         if in_char < 128:
+             result = hangul_length + in_char  # 67~
+         elif ord('ㄱ') <= in_char <= ord('ㅣ'):
+             # 194~ # [ㄱ:12593]~[ㅣ:12643] (len = 51)
+             result = hangul_length + 128 + (in_char - 12593)
+         elif in_char == ord('♡'):
+             result = hangul_length + 128 + 51  # 245~ # ♡
+         elif in_char == ord('♥'):
+             result = hangul_length + 128 + 51 + 1  # ♥
+         elif in_char == ord('★'):
+             result = hangul_length + 128 + 51 + 2  # ★
+         elif in_char == ord('☆'):
+             result = hangul_length + 128 + 51 + 3  # ☆
+         else:
+             if warning:
+                 print('Unhandled character:', chr(in_char), in_char)
+             # unknown character
+             result = hangul_length + 128 + 51 + 4  # for unknown character
+ 
+         return [result]
+ 
+ 
+ def decompose_str(string):
+     return ''.join([decompose(ord(x)) for x in string])
+ 
+ 
+ def decompose_str_as_one_hot(string, warning=True):
+     tmp_list = []
+     for x in string:
+         da = decompose_as_one_hot(ord(x), warning=warning)
+         tmp_list.extend(da)
+     return tmp_list
--- a/movie2/main.py 0 → 100644
View file @305a0a0
+++ b/movie2/main.py 0 → 100644
View file @305a0a0
+ # -*- coding: utf-8 -*-
+ 
+ """
+ Copyright 2018 NAVER Corp.
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ associated documentation files (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ The above copyright notice and this permission notice shall be included in all copies or substantial
+ portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ """
+ 
+ 
+ import argparse
+ import os
+ 
+ import numpy as np
+ import tensorflow as tf
+ 
+ import nsml
+ from nsml import DATASET_PATH, HAS_DATASET, IS_ON_NSML
+ from dataset import KinQueryDataset, preprocess
+ 
+ 
+ # DONOTCHANGE: They are reserved for nsml
+ # This is for nsml leaderboard
+ def bind_model(sess, config):
+     # 학습한 모델을 저장하는 함수입니다.
+     def save(dir_name, *args):
+         # directory
+         os.makedirs(dir_name, exist_ok=True)
+         saver = tf.train.Saver()
+         saver.save(sess, os.path.join(dir_name, 'model'))
+ 
+     # 저장한 모델을 불러올 수 있는 함수입니다.
+     def load(dir_name, *args):
+         saver = tf.train.Saver()
+         # find checkpoint
+         ckpt = tf.train.get_checkpoint_state(dir_name)
+         if ckpt and ckpt.model_checkpoint_path:
+             checkpoint = os.path.basename(ckpt.model_checkpoint_path)
+             saver.restore(sess, os.path.join(dir_name, checkpoint))
+         else:
+             raise NotImplemented('No checkpoint!')
+         print('Model loaded')
+ 
+     def infer(raw_data, **kwargs):
+         """
+         :param raw_data: raw input (여기서는 문자열)을 입력받습니다
+         :param kwargs:
+         :return:
+         """
+         # dataset.py에서 작성한 preprocess 함수를 호출하여, 문자열을 벡터로 변환합니다
+         preprocessed_data = preprocess(raw_data, config.strmaxlen)
+         # 저장한 모델에 입력값을 넣고 prediction 결과를 리턴받습니다
+         pred = sess.run(output_sigmoid, feed_dict={x: preprocessed_data})
+         clipped = np.array(pred > config.threshold, dtype=np.int)
+         # DONOTCHANGE: They are reserved for nsml
+         # 리턴 결과는 [(확률, 0 or 1)] 의 형태로 보내야만 리더보드에 올릴 수 있습니다. 리더보드 결과에 확률의 값은 영향을 미치지 않습니다
+         return list(zip(pred.flatten(), clipped.flatten()))
+ 
+     # DONOTCHANGE: They are reserved for nsml
+     # nsml에서 지정한 함수에 접근할 수 있도록 하는 함수입니다.
+     nsml.bind(save=save, load=load, infer=infer)
+ 
+ 
+ def _batch_loader(iterable, n=1):
+     """
+     데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
+     :param iterable: 데이터 list, 혹은 다른 포맷
+     :param n: 배치 사이즈
+     :return:
+     """
+     length = len(iterable)
+     for n_idx in range(0, length, n):
+         yield iterable[n_idx:min(n_idx + n, length)]
+ 
+ 
+ def weight_variable(shape):
+     initial = tf.truncated_normal(shape, stddev=0.1)
+     return tf.Variable(initial)
+ 
+ 
+ def bias_variable(shape):
+     initial = tf.constant(0.1, shape=shape)
+     return tf.Variable(initial)
+ 
+ 
+ if __name__ == '__main__':
+     args = argparse.ArgumentParser()
+     # DONOTCHANGE: They are reserved for nsml
+     args.add_argument('--mode', type=str, default='train')
+     args.add_argument('--pause', type=int, default=0)
+     args.add_argument('--iteration', type=str, default='0')
+ 
+     # User options
+     args.add_argument('--output', type=int, default=1)
+     args.add_argument('--epochs', type=int, default=10)
+     args.add_argument('--batch', type=int, default=2000)
+     args.add_argument('--strmaxlen', type=int, default=400)
+     args.add_argument('--embedding', type=int, default=8)
+     args.add_argument('--threshold', type=float, default=0.5)
+     config = args.parse_args()
+ 
+     if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
+         DATASET_PATH = '../sample_data/kin/'
+ 
+     # 모델의 specification
+     input_size = config.embedding*config.strmaxlen
+     output_size = 1
+     hidden_layer_size = 200
+     learning_rate = 0.001
+     character_size = 251
+ 
+     x = tf.placeholder(tf.int32, [None, config.strmaxlen])
+     y_ = tf.placeholder(tf.float32, [None, output_size])
+     # 임베딩
+     char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
+     embedded = tf.nn.embedding_lookup(char_embedding, x)
+ 
+     # 첫 번째 레이어
+     first_layer_weight = weight_variable([input_size, hidden_layer_size])
+     first_layer_bias = bias_variable([hidden_layer_size])
+     hidden_layer = tf.matmul(tf.reshape(embedded, (-1, input_size)),
+                              first_layer_weight) + first_layer_bias
+ 
+     # 두 번째 (아웃풋) 레이어
+     second_layer_weight = weight_variable([hidden_layer_size, output_size])
+     second_layer_bias = bias_variable([output_size])
+     output = tf.matmul(hidden_layer, second_layer_weight) + second_layer_bias
+     output_sigmoid = tf.sigmoid(output)
+ 
+     # loss와 optimizer
+     binary_cross_entropy = tf.reduce_mean(-(y_ * tf.log(output_sigmoid)) - (1-y_) * tf.log(1-output_sigmoid))
+     train_step = tf.train.AdamOptimizer(learning_rate).minimize(binary_cross_entropy)
+ 
+     sess = tf.InteractiveSession()
+     tf.global_variables_initializer().run()
+ 
+     # DONOTCHANGE: Reserved for nsml
+     bind_model(sess=sess, config=config)
+ 
+     # DONOTCHANGE: Reserved for nsml
+     if config.pause:
+         nsml.paused(scope=locals())
+ 
+     if config.mode == 'train':
+         # 데이터를 로드합니다.
+         dataset = KinQueryDataset(DATASET_PATH, config.strmaxlen)
+         dataset_len = len(dataset)
+         one_batch_size = dataset_len//config.batch
+         if dataset_len % config.batch != 0:
+             one_batch_size += 1
+         # epoch마다 학습을 수행합니다.
+         for epoch in range(config.epochs):
+             avg_loss = 0.0
+             for i, (data, labels) in enumerate(_batch_loader(dataset, config.batch)):
+                 _, loss = sess.run([train_step, binary_cross_entropy],
+                                    feed_dict={x: data, y_: labels})
+                 print('Batch : ', i + 1, '/', one_batch_size,
+                       ', BCE in this minibatch: ', float(loss))
+                 avg_loss += float(loss)
+             print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size))
+             nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
+                         train__loss=float(avg_loss/one_batch_size), step=epoch)
+             # DONOTCHANGE (You can decide how often you want to save the model)
+             nsml.save(epoch)
+ 
+     # 로컬 테스트 모드일때 사용합니다
+     # 결과가 아래와 같이 나온다면, nsml submit을 통해서 제출할 수 있습니다.
+     # [(0.3, 0), (0.7, 1), ... ]
+     elif config.mode == 'test_local':
+         with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
+             queries = f.readlines()
+         res = []
+         for batch in _batch_loader(queries, config.batch):
+             temp_res = nsml.infer(batch)
+             res += temp_res
+     print(res)
--- a/movie2/setup.py 0 → 100644
View file @305a0a0
+++ b/movie2/setup.py 0 → 100644
View file @305a0a0
+ """
+ Copyright 2018 NAVER Corp.
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+ associated documentation files (the "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ The above copyright notice and this permission notice shall be included in all copies or substantial
+ portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ """
+ 
+ from distutils.core import setup
+ # Package metadata for the nsml movie review project, plus the packages it needs.
+ # NOTE(review): install_requires is a setuptools option and setup here comes from
+ # distutils.core; presumably the list is read by the nsml tooling -- confirm.
+ setup(
+     name='nsml movie review',
+     version='1.0',
+     description='',
+     install_requires=[
+         'nltk',
+         'konlpy',
+         'twython'
+ 
+     ]
+ )
\ No newline at end of file
--- a/movie2/test.txt 0 → 100644
View file @305a0a0
+++ b/movie2/test.txt 0 → 100644
View file @305a0a0
--- a/movie2/text_helpers.py 0 → 100644
View file @305a0a0
+++ b/movie2/text_helpers.py 0 → 100644
View file @305a0a0
+ # Text Helper Functions
+ #---------------------------------------
+ #
+ # We pull out text helper functions to reduce redundant code
+ 
+ import string
+ import os
+ import urllib.request
+ import io
+ import tarfile
+ import collections
+ import numpy as np
+ 
+ # Normalize text
+ def normalize_text(texts, stops):
+     # Lower case
+     texts = [x.lower() for x in texts]
+ 
+     # Remove punctuation
+     texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
+ 
+     # Remove numbers
+     texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
+ 
+     # Remove stopwords
+     texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]
+ 
+     # Trim extra whitespace
+     texts = [' '.join(x.split()) for x in texts]
+     
+     return(texts)
+ 
+ 
+ # Build dictionary of words
+ def build_dictionary(sentences, vocabulary_size):
+     # Turn sentences (list of strings) into lists of words
+     split_sentences = [s.split() for s in sentences]
+     words = [x for sublist in split_sentences for x in sublist]
+     
+     # Initialize list of [word, word_count] for each word, starting with unknown
+     count = [['RARE', -1]]
+     
+     # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
+     count.extend(collections.Counter(words).most_common(vocabulary_size-1))
+     
+     # Now create the dictionary
+     word_dict = {}
+     # For each word, that we want in the dictionary, add it, then make it
+     # the value of the prior dictionary length
+     for word, word_count in count:
+         word_dict[word] = len(word_dict)
+     
+     return(word_dict)
+     
+ 
+ # Turn text data into lists of integers from dictionary
+ def text_to_numbers(sentences, word_dict):
+     # Initialize the returned data
+     data = []
+     for sentence in sentences:
+         sentence_data = []
+         # For each word, either use selected index or rare word index
+         for word in sentence.split():
+             if word in word_dict:
+                 word_ix = word_dict[word]
+             else:
+                 word_ix = 0
+             sentence_data.append(word_ix)
+         data.append(sentence_data)
+     return(data)
+     
+ 
+ # Generate data randomly (N words behind, target, N words ahead)
+ def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
+     # Fill up data batch
+     batch_data = []
+     label_data = []
+     while len(batch_data) < batch_size:
+         # select random sentence to start
+         rand_sentence_ix = int(np.random.choice(len(sentences), size=1))
+         rand_sentence = sentences[rand_sentence_ix]
+         # Generate consecutive windows to look at
+         window_sequences = [rand_sentence[max((ix-window_size),0):(ix+window_size+1)] for ix, x in enumerate(rand_sentence)]
+         # Denote which element of each window is the center word of interest
+         label_indices = [ix if ix<window_size else window_size for ix,x in enumerate(window_sequences)]
+         
+         # Pull out center word of interest for each window and create a tuple for each window
+         if method=='skip_gram':
+             batch_and_labels = [(x[y], x[:y] + x[(y+1):]) for x,y in zip(window_sequences, label_indices)]
+             # Make it in to a big list of tuples (target word, surrounding word)
+             tuple_data = [(x, y_) for x,y in batch_and_labels for y_ in y]
+             batch, labels = [list(x) for x in zip(*tuple_data)]
+         elif method=='cbow':
+             batch_and_labels = [(x[:y] + x[(y+1):], x[y]) for x,y in zip(window_sequences, label_indices)]
+             # Only keep windows with consistent 2*window_size
+             batch_and_labels = [(x,y) for x,y in batch_and_labels if len(x)==2*window_size]
+             batch, labels = [list(x) for x in zip(*batch_and_labels)]
+         elif method=='doc2vec':
+             # For doc2vec we keep LHS window only to predict target word
+             batch_and_labels = [(rand_sentence[i:i+window_size], rand_sentence[i+window_size]) for i in range(0, len(rand_sentence)-window_size)]
+             batch, labels = [list(x) for x in zip(*batch_and_labels)]
+             # Add document index to batch!! Remember that we must extract the last index in batch for the doc-index
+             batch = [x + [rand_sentence_ix] for x in batch]
+         else:
+             raise ValueError('Method {} not implmented yet.'.format(method))
+             
+         # extract batch and labels
+         batch_data.extend(batch[:batch_size])
+         label_data.extend(labels[:batch_size])
+     # Trim batch and label at the end
+     batch_data = batch_data[:batch_size]
+     label_data = label_data[:batch_size]
+     
+     # Convert to numpy array
+     batch_data = np.array(batch_data)
+     label_data = np.transpose(np.array([label_data]))
+     
+     return(batch_data, label_data)
\ No newline at end of file