신은섭(Shin Eun Seop)

t2

@@ -84,10 +84,10 @@ if __name__ == '__main__':
 
     # User options
     args.add_argument('--output', type=int, default=1)
-    args.add_argument('--epochs', type=int, default=100)
+    args.add_argument('--epochs', type=int, default=200)
     args.add_argument('--batch', type=int, default=3000)
     args.add_argument('--strmaxlen', type=int, default=400)
-    args.add_argument('--embedding', type=int, default=30)
+    args.add_argument('--embedding', type=int, default=50)
     args.add_argument('--threshold', type=float, default=0.5)
     config = args.parse_args()
 
@@ -97,18 +97,17 @@ if __name__ == '__main__':
     # Model specification
     input_size = config.embedding*config.strmaxlen
     output_size = 1
-    learning_rate = 0.001
+    learning_rate = 0.0003
     character_size = 251
 
     x = tf.placeholder(tf.int32, [None, config.strmaxlen])
     y_ = tf.placeholder(tf.float32, [None, output_size])
     keep_probs = tf.placeholder(tf.float32)
     # Embedding
-    with tf.name_scope('embedding'):
-        char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
-        embedded_chars_base = tf.nn.embedding_lookup(char_embedding, x)
-        embedded = tf.expand_dims(embedded_chars_base, -1)
-        print("emb", embedded.shape)
+    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
+    embedded_chars_base = tf.nn.embedding_lookup(char_embedding, x)
+    embedded = tf.expand_dims(embedded_chars_base, -1)
+    print("emb", embedded.shape)
 
     # MODEL
     l2_conv = tf.layers.conv2d(embedded, 256, [2, config.embedding], activation=tf.nn.relu)
dataset.py

# -*- coding: utf-8 -*-

"""
Copyright 2018 NAVER Corp.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

import os

import numpy as np

from kor_char_parser import decompose_str_as_one_hot


class MovieReviewDataset():
    """
    A Python object that reads the movie review data and returns it as (data, label) tuples.
    """
    def __init__(self, dataset_path: str, max_length: int):
        """
        initializer
        :param dataset_path: root path of the dataset
        :param max_length: maximum length of a string
        """
        # Paths to the data and the labels
        data_review = os.path.join(dataset_path, 'train', 'train_data')
        data_label = os.path.join(dataset_path, 'train', 'train_label')

        # Read the movie review data and run preprocessing
        with open(data_review, 'rt', encoding='utf-8') as f:
            self.reviews = preprocess(f.readlines(), max_length)
        # Read the movie review labels and run preprocessing
        with open(data_label) as f:
            self.labels = [[np.float32(x)] for x in f.readlines()]

    def __len__(self):
        """
        :return: the total number of data points
        """
        return len(self.reviews)

    def __getitem__(self, idx):
        """
        :param idx: index of the requested data point
        :return: the (data, label) pair at the given index
        """
        return self.reviews[idx], self.labels[idx]


def preprocess(data: list, max_length: int):
    """
    Converts raw input into a format the deep learning model can train on.
    The provided algorithm is char2vec, and since the default model is an MLP,
    it returns vectors whose sizes are all fixed.
    Strings longer than the fixed length are truncated; shorter ones are zero-padded.
    :param data: list of strings ([string1, string2, ...])
    :param max_length: maximum length of a string
    :return: list of vectors ([[0, 1, 5, 6], [5, 4, 10, 200], ...]) when max_length is 4
    """
    vectorized_data = [decompose_str_as_one_hot(datum, warning=False) for datum in data]
    zero_padding = np.zeros((len(data), max_length), dtype=np.int32)
    for idx, seq in enumerate(vectorized_data):
        length = len(seq)
        if length >= max_length:
            length = max_length
            zero_padding[idx, :length] = np.array(seq)[:length]
        else:
            zero_padding[idx, :length] = np.array(seq)
    return zero_padding
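
A quick way to sanity-check the preprocessing above (a minimal sketch; it assumes
kor_char_parser is importable and that the ../sample_data/movie/ layout main.py falls
back to exists locally):

    from dataset import MovieReviewDataset, preprocess

    # Two raw reviews, padded/truncated to a fixed length of 10
    vecs = preprocess(['재밌어요', '생각보다 지루하네요'], 10)
    print(vecs.shape)  # (2, 10), dtype int32; shorter strings are zero-padded

    # The dataset object supports len() and indexing, as _batch_loader expects
    dataset = MovieReviewDataset('../sample_data/movie/', 10)
    review, label = dataset[0]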
main.py

# -*- coding: utf-8 -*-

import argparse
import os

import numpy as np
import tensorflow as tf

import nsml
from nsml import DATASET_PATH, HAS_DATASET, IS_ON_NSML
from dataset import MovieReviewDataset, preprocess


# DONOTCHANGE: They are reserved for nsml
# This is for nsml leaderboard
def bind_model(sess, config):
    # Saves the trained model.
    def save(dir_name, *args):
        # directory
        os.makedirs(dir_name, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(dir_name, 'model'))

    # Loads a saved model.
    def load(dir_name, *args):
        saver = tf.train.Saver()
        # find checkpoint
        ckpt = tf.train.get_checkpoint_state(dir_name)
        if ckpt and ckpt.model_checkpoint_path:
            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(dir_name, checkpoint))
        else:
            raise NotImplementedError('No checkpoint!')
        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        :param raw_data: raw input (here, strings)
        :param kwargs:
        :return:
        """
        # Convert the strings to vectors with the preprocess function from dataset.py
        preprocessed_data = preprocess(raw_data, config.strmaxlen)
        # Feed the input to the saved model and get the predictions back
        pred = sess.run(output_sigmoid, feed_dict={x: preprocessed_data})
        clipped = np.array(pred > config.threshold, dtype=np.int)
        # DONOTCHANGE: They are reserved for nsml
        # Results must be returned as [(probability, 0 or 1)] to be posted to the
        # leaderboard; the probability value does not affect the leaderboard result
        return list(zip(pred.flatten(), clipped.flatten()))

    # DONOTCHANGE: They are reserved for nsml
    # Gives nsml access to the designated functions.
    nsml.bind(save=save, load=load, infer=infer)
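# Illustrative infer() return value for two inputs with threshold 0.5 (values hypothetical):
# [(0.73, 1), (0.21, 0)]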


def _batch_loader(iterable, n=1):
    """
    Splits the data into batch-sized chunks, playing the same role as PyTorch's DataLoader
    :param iterable: a list of data, or another format
    :param n: batch size
    :return:
    """
    length = len(iterable)
    for n_idx in range(0, length, n):
        yield iterable[n_idx:min(n_idx + n, length)]
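# Example: list(_batch_loader([1, 2, 3, 4, 5], n=2)) -> [[1, 2], [3, 4], [5]].
# MovieReviewDataset works here too: it supports len() and slice indexing, and each
# slice arrives as a (reviews, labels) pair, which the training loop below unpacks.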


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


if __name__ == '__main__':
    args = argparse.ArgumentParser()
    # DONOTCHANGE: They are reserved for nsml
    args.add_argument('--mode', type=str, default='train')
    args.add_argument('--pause', type=int, default=0)
    args.add_argument('--iteration', type=str, default='0')

    # User options
    args.add_argument('--output', type=int, default=1)
    args.add_argument('--epochs', type=int, default=10)
    args.add_argument('--batch', type=int, default=3000)
    args.add_argument('--strmaxlen', type=int, default=300)
    args.add_argument('--embedding', type=int, default=50)
    args.add_argument('--threshold', type=float, default=0.5)
    config = args.parse_args()

    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie/'

    # Model specification
    input_size = config.embedding*config.strmaxlen
    output_size = 1
    learning_rate = 0.001
    character_size = 251

    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
    y_ = tf.placeholder(tf.float32, [None, output_size])
    keep_probs = tf.placeholder(tf.float32)
    # Embedding: map character ids to dense vectors; expand_dims adds the channel axis for conv2d
    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
    embedded_chars_base = tf.nn.embedding_lookup(char_embedding, x)
    embedded = tf.expand_dims(embedded_chars_base, -1)
    print("emb", embedded.shape)

    # MODEL: parallel convolutions of width 2 and 3 over the embedded character sequence
    l2_conv = tf.layers.conv2d(embedded, 256, [2, config.embedding], activation=tf.nn.relu)
    print("l2", l2_conv.shape)
    # Max-over-time pooling: the window spans the whole convolved sequence
    l2_pool = tf.layers.max_pooling2d(l2_conv, [config.strmaxlen-2+1, 1], strides=(1, 1))
    print("l2 pool", l2_pool.shape)

    l3_conv = tf.layers.conv2d(embedded, 256, [3, config.embedding], activation=tf.nn.relu)
    print("l3", l3_conv.shape)
    l3_pool = tf.layers.max_pooling2d(l3_conv, [config.strmaxlen-3+1, 1], strides=(1, 1))
    print("l3 pool", l3_pool.shape)

    concat = tf.concat([l2_pool, l3_pool], 3)
    print('concat', concat.shape)
    flatten = tf.contrib.layers.flatten(concat)
    print('flatten', flatten.shape)

    dense = tf.layers.dense(flatten, 256, activation=tf.nn.relu)

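    # Shape walkthrough with the defaults above (strmaxlen=300, embedding=50),
    # matching the print() calls:
    #   embedded            [batch, 300, 50, 1]
    #   l2_conv / l3_conv   [batch, 299, 1, 256] / [batch, 298, 1, 256]
    #   l2_pool / l3_pool   [batch, 1, 1, 256] each after max-over-time pooling
    #   concat              [batch, 1, 1, 512]
    #   flatten -> dense    [batch, 512] -> [batch, 256]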
    # NOTE: tf.layers.dropout takes a drop *rate* (not a keep probability) and only
    # drops units when training=True, so it acts as the identity at inference time.
    drop = tf.layers.dropout(dense, rate=1 - keep_probs)
    # Sigmoid probability in (0, 1), scaled to the (0, 10) rating range for prediction
    output_prob = tf.layers.dense(drop, output_size, activation=tf.nn.sigmoid)
    output_sigmoid = 10*output_prob

    # Loss and optimizer: binary cross-entropy on the sigmoid probability
    binary_cross_entropy = -tf.reduce_sum(
        y_*tf.log(tf.clip_by_value(output_prob, 1e-10, 1.0))
        + (1 - y_)*tf.log(tf.clip_by_value(1 - output_prob, 1e-10, 1.0)))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(binary_cross_entropy)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml
    bind_model(sess=sess, config=config)

    # DONOTCHANGE: Reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
        dataset_len = len(dataset)
        one_batch_size = dataset_len//config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1
        # Train for each epoch.
        for epoch in range(config.epochs):
            avg_loss = 0.0
            for i, (data, labels) in enumerate(_batch_loader(dataset, config.batch)):
                _, loss = sess.run([train_step, binary_cross_entropy],
                                   feed_dict={x: data, y_: labels, keep_probs: 1.})
                print('Batch : ', i + 1, '/', one_batch_size,
                      ', BCE in this minibatch: ', float(loss))
                avg_loss += float(loss)
            print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size))
            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
                        train__loss=float(avg_loss/one_batch_size), step=epoch)
            # DONOTCHANGE (You can decide how often you want to save the model)
            nsml.save(epoch)

    # Used in local test mode.
    # If the results look like the following, the model can be submitted with nsml submit:
    # [(0.3, 0), (0.7, 1), ... ]
    elif config.mode == 'test_local':
        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
            queries = f.readlines()
        res = []
        for batch in _batch_loader(queries, config.batch):
            temp_res = nsml.infer(batch)
            res += temp_res
        print(res)
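
Outside NSML, a minimal local run looks like this (a sketch; it assumes the nsml
package is importable and ../sample_data/movie/ is populated as above):

    python main.py --mode train --epochs 10 --batch 3000
    python main.py --mode test_local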