이현규

Get legacy models

# env
.idea/
__pycache__/
# mediapipe
mediapipe/
\ No newline at end of file
import numpy as np
import tensorflow as tf
from tensorflow import logging
from tensorflow import gfile
import operator
import esot3ria.pb_util as pbutil
import esot3ria.video_recommender as recommender
import esot3ria.video_util as videoutil
# Define model paths.
MODEL_PATH = "./model/inference_model/segment_inference_model"
TAG_VECTOR_MODEL_PATH = "./tag_vectors.model"
VIDEO_VECTOR_MODEL_PATH = "./video_vectors.model"
# Define static file paths.
SEGMENT_LABEL_PATH = "./statics/segment_label_ids.csv"
VIDEO_TAGS_PATH = "./statics/kaggle_solution_40k.csv"
VOCAB_PATH = "./statics/vocabulary.csv"
# Define parameters.
TAG_TOP_K = 5
VIDEO_TOP_K = 10
def get_segments(batch_video_mtx, batch_num_frames, segment_size):
"""Get segment-level inputs from frame-level features."""
video_batch_size = batch_video_mtx.shape[0]
max_frame = batch_video_mtx.shape[1]
feature_dim = batch_video_mtx.shape[-1]
padded_segment_sizes = (batch_num_frames + segment_size - 1) // segment_size
padded_segment_sizes *= segment_size
segment_mask = (
0 < (padded_segment_sizes[:, np.newaxis] - np.arange(0, max_frame)))
# Segment bags.
frame_bags = batch_video_mtx.reshape((-1, feature_dim))
segment_frames = frame_bags[segment_mask.reshape(-1)].reshape(
(-1, segment_size, feature_dim))
# Segment num frames.
segment_start_times = np.arange(0, max_frame, segment_size)
num_segments = batch_num_frames[:, np.newaxis] - segment_start_times
num_segment_bags = num_segments.reshape((-1))
valid_segment_mask = num_segment_bags > 0
segment_num_frames = num_segment_bags[valid_segment_mask]
segment_num_frames[segment_num_frames > segment_size] = segment_size
max_segment_num = (max_frame + segment_size - 1) // segment_size
video_idxs = np.tile(
np.arange(0, video_batch_size)[:, np.newaxis], [1, max_segment_num])
segment_idxs = np.tile(segment_start_times, [video_batch_size, 1])
idx_bags = np.stack([video_idxs, segment_idxs], axis=-1).reshape((-1, 2))
video_segment_ids = idx_bags[valid_segment_mask]
return {
"video_batch": segment_frames,
"num_frames_batch": segment_num_frames,
"video_segment_ids": video_segment_ids
}
def format_predictions(video_ids, predictions, top_k, whitelisted_cls_mask=None):
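    """Yield "<video_id>,<class> <score> ..." lines with the top_k class/score pairs, utf-8 encoded."""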
batch_size = len(video_ids)
for video_index in range(batch_size):
video_prediction = predictions[video_index]
if whitelisted_cls_mask is not None:
# Whitelist classes.
video_prediction *= whitelisted_cls_mask
top_indices = np.argpartition(video_prediction, -top_k)[-top_k:]
line = [(class_index, predictions[video_index][class_index])
for class_index in top_indices]
line = sorted(line, key=lambda p: -p[1])
yield (video_ids[video_index] + "," +
" ".join("%i %g" % (label, score) for (label, score) in line) +
"\n").encode("utf8")
def normalize_tag(tag):
    """Normalize a vocabulary name, e.g. "Mobile phone (device)" -> "mobile-phone"."""
    if isinstance(tag, str):
        # Note: str.replace is a literal replace (not regex), so this call leaves the tag unchanged.
        new_tag = tag.lower().replace('[^a-zA-Z]', ' ')
        if new_tag.find(" (") != -1:
            new_tag = new_tag[:new_tag.find(" (")]
        new_tag = new_tag.replace(" ", "-")
        return new_tag
    else:
        return tag
def inference_pb(file_path, threshold):
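    """Run segment-level inference on a feature.pb file.

    Returns a dict with the top predicted tags ("tag_result") and
    recommended videos ("video_result").
    """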
VIDEO_TOP_K = int(threshold)
inference_result = {}
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # 0. Load the target SequenceExample from the .pb file.
target_video = pbutil.convert_pb(file_path)
# 1. Load video features from pb.
video_id_batch_val = np.array([b'video'])
n_frames = len(target_video.feature_lists.feature_list['rgb'].feature)
        # Restrict the number of frames to 300.
if n_frames > 300:
n_frames = 300
video_batch_val = np.zeros((300, 1152))
for i in range(n_frames):
video_batch_rgb_raw = target_video.feature_lists.feature_list['rgb'].feature[i].bytes_list.value[0]
video_batch_rgb = np.array(tf.cast(tf.decode_raw(video_batch_rgb_raw, tf.float32), tf.float32).eval())
video_batch_audio_raw = target_video.feature_lists.feature_list['audio'].feature[i].bytes_list.value[0]
video_batch_audio = np.array(tf.cast(tf.decode_raw(video_batch_audio_raw, tf.float32), tf.float32).eval())
video_batch_val[i] = np.concatenate([video_batch_rgb, video_batch_audio], axis=0)
video_batch_val = np.array([video_batch_val])
num_frames_batch_val = np.array([n_frames])
# Restore checkpoint and meta-graph file.
if not gfile.Exists(MODEL_PATH + ".meta"):
raise IOError("Cannot find %s. Did you run eval.py?" % MODEL_PATH)
meta_graph_location = MODEL_PATH + ".meta"
logging.info("loading meta-graph: " + meta_graph_location)
with tf.device("/cpu:0"):
saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True)
logging.info("restoring variables from " + MODEL_PATH)
saver.restore(sess, MODEL_PATH)
input_tensor = tf.get_collection("input_batch_raw")[0]
num_frames_tensor = tf.get_collection("num_frames")[0]
predictions_tensor = tf.get_collection("predictions")[0]
# Workaround for num_epochs issue.
def set_up_init_ops(variables):
init_op_list = []
for variable in list(variables):
if "train_input" in variable.name:
init_op_list.append(tf.assign(variable, 1))
variables.remove(variable)
init_op_list.append(tf.variables_initializer(variables))
return init_op_list
sess.run(
set_up_init_ops(tf.get_collection_ref(tf.GraphKeys.LOCAL_VARIABLES)))
whitelisted_cls_mask = np.zeros((predictions_tensor.get_shape()[-1],),
dtype=np.float32)
with tf.io.gfile.GFile(SEGMENT_LABEL_PATH) as fobj:
for line in fobj:
try:
cls_id = int(line)
whitelisted_cls_mask[cls_id] = 1.
except ValueError:
# Simply skip the non-integer line.
continue
# 2. Make segment features.
results = get_segments(video_batch_val, num_frames_batch_val, 5)
video_segment_ids = results["video_segment_ids"]
video_id_batch_val = video_id_batch_val[video_segment_ids[:, 0]]
video_id_batch_val = np.array([
"%s:%d" % (x.decode("utf8"), y)
for x, y in zip(video_id_batch_val, video_segment_ids[:, 1])
])
video_batch_val = results["video_batch"]
num_frames_batch_val = results["num_frames_batch"]
if input_tensor.get_shape()[1] != video_batch_val.shape[1]:
raise ValueError("max_frames mismatch. Please re-run the eval.py "
"with correct segment_labels settings.")
predictions_val, = sess.run([predictions_tensor],
feed_dict={
input_tensor: video_batch_val,
num_frames_tensor: num_frames_batch_val
})
        # 3. Build the vocabulary mapping (label index -> tag name).
        # Note: naive comma split; assumes no quoted commas before the Name column.
        voca_dict = {}
        with open(VOCAB_PATH, 'r') as vocabs:
            for line in vocabs:
                vocab_dict_item = line.split(",")
                if vocab_dict_item[0] != "Index":
                    voca_dict[vocab_dict_item[0]] = vocab_dict_item[3]
# 4. Make combined scores.
combined_scores = {}
for line in format_predictions(video_id_batch_val, predictions_val, TAG_TOP_K, whitelisted_cls_mask):
segment_id, preds = line.decode("utf8").split(",")
preds = preds.split(" ")
pred_cls_ids = [int(preds[idx]) for idx in range(0, len(preds), 2)]
pred_cls_scores = [float(preds[idx]) for idx in range(1, len(preds), 2)]
for i in range(len(pred_cls_ids)):
if pred_cls_ids[i] in combined_scores:
combined_scores[pred_cls_ids[i]] += pred_cls_scores[i]
else:
combined_scores[pred_cls_ids[i]] = pred_cls_scores[i]
combined_scores = sorted(combined_scores.items(), key=operator.itemgetter(1), reverse=True)
        denominator = float(sum(score for _, score in combined_scores[:TAG_TOP_K]))
        tag_result = []
        for item_index in range(TAG_TOP_K):
            segment_tag = str(voca_dict[str(combined_scores[item_index][0])])
            normalized_tag = normalize_tag(segment_tag)
            tag_percentage = format(combined_scores[item_index][1] / denominator, ".3f")
            tag_result.append((normalized_tag, tag_percentage))
        # 5. Create recommended-video info and combine the results.
recommend_video_ids = recommender.recommend_videos(tag_result, TAG_VECTOR_MODEL_PATH,
VIDEO_VECTOR_MODEL_PATH, VIDEO_TOP_K)
video_result = [videoutil.getVideoInfo(ids, VIDEO_TAGS_PATH, TAG_TOP_K) for ids in recommend_video_ids]
inference_result = {
"tag_result": tag_result,
"video_result": video_result
}
# 6. Dispose instances.
sess.close()
return inference_result
if __name__ == '__main__':
filepath = "./featuremaps/features.pb"
result = inference_pb(filepath, 5)
print(result)
model_checkpoint_path: "/root/volume/youtube-8m/saved_model/inference_model/segment_inference_model"
all_model_checkpoint_paths: "/root/volume/youtube-8m/saved_model/inference_model/segment_inference_model"
{"model": "FrameLevelLogisticModel", "feature_sizes": "1024,128", "feature_names": "rgb,audio", "frame_features": true, "label_loss": "CrossEntropyLoss"}
\ No newline at end of file
import tensorflow as tf
import numpy
def _make_bytes(int_array):
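    """Pack an integer array (or a numpy buffer) into a byte string, Python 2/3 compatible."""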
if bytes == str: # Python2
return ''.join(map(chr, int_array))
else:
return bytes(int_array)
def quantize(features, min_quantized_value=-2.0, max_quantized_value=2.0):
"""Quantizes float32 `features` into string."""
assert features.dtype == 'float32'
assert len(features.shape) == 1 # 1-D array
features = numpy.clip(features, min_quantized_value, max_quantized_value)
quantize_range = max_quantized_value - min_quantized_value
features = (features - min_quantized_value) * (255.0 / quantize_range)
features = [int(round(f)) for f in features]
return _make_bytes(features)
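# Worked example (Python 3): quantize(numpy.array([-2., 0., 2.], dtype='float32'))
# returns bytes([0, 128, 255]).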
# Context and sequence feature schemas for parsing feature.pb.
contexts = {
'AUDIO/feature/dimensions': tf.io.FixedLenFeature([], tf.int64),
'AUDIO/feature/rate': tf.io.FixedLenFeature([], tf.float32),
'RGB/feature/dimensions': tf.io.FixedLenFeature([], tf.int64),
'RGB/feature/rate': tf.io.FixedLenFeature([], tf.float32),
'clip/data_path': tf.io.FixedLenFeature([], tf.string),
'clip/end/timestamp': tf.io.FixedLenFeature([], tf.int64),
'clip/start/timestamp': tf.io.FixedLenFeature([], tf.int64)
}
features = {
'AUDIO/feature/floats': tf.io.VarLenFeature(dtype=tf.float32),
'AUDIO/feature/timestamp': tf.io.VarLenFeature(tf.int64),
'RGB/feature/floats': tf.io.VarLenFeature(dtype=tf.float32),
'RGB/feature/timestamp': tf.io.VarLenFeature(tf.int64)
}
def parse_exmp(serial_exmp):
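    """Parse a serialized feature.pb SequenceExample into per-frame audio/rgb byte strings."""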
_, sequence_parsed = tf.io.parse_single_sequence_example(
serialized=serial_exmp,
context_features=contexts,
sequence_features=features)
sequence_parsed = tf.contrib.learn.run_n(sequence_parsed)[0]
audio = sequence_parsed['AUDIO/feature/floats'].values
rgb = sequence_parsed['RGB/feature/floats'].values
# print(audio.values)
# print(type(audio.values))
    # Each frame (one second) has 128 audio floats and 1024 rgb floats.
audio_slices = [audio[128 * i: 128 * (i + 1)] for i in range(len(audio) // 128)]
rgb_slices = [rgb[1024 * i: 1024 * (i + 1)] for i in range(len(rgb) // 1024)]
byte_audio = []
byte_rgb = []
for seg in audio_slices:
# audio_seg = quantize(seg)
audio_seg = _make_bytes(seg)
byte_audio.append(audio_seg)
for seg in rgb_slices:
# rgb_seg = quantize(seg)
rgb_seg = _make_bytes(seg)
byte_rgb.append(rgb_seg)
return byte_audio, byte_rgb
def make_exmp(id, audio, rgb):
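    """Build a serialized YT8M-style SequenceExample with an 'id' context and 'audio'/'rgb' feature lists."""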
audio_features = []
rgb_features = []
for embedding in audio:
embedding_feature = tf.train.Feature(
bytes_list=tf.train.BytesList(value=[embedding]))
audio_features.append(embedding_feature)
for embedding in rgb:
embedding_feature = tf.train.Feature(
bytes_list=tf.train.BytesList(value=[embedding]))
rgb_features.append(embedding_feature)
    # Construct a SequenceExample in the YT8M frame-level format.
seq_exmp = tf.train.SequenceExample(
context=tf.train.Features(
feature={
'id': tf.train.Feature(bytes_list=tf.train.BytesList(
value=[id.encode('utf-8')]))
}),
feature_lists=tf.train.FeatureLists(
feature_list={
'audio': tf.train.FeatureList(
feature=audio_features
),
'rgb': tf.train.FeatureList(
feature=rgb_features
)
})
)
serialized = seq_exmp.SerializeToString()
return serialized
def convert_pb(filename):
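    """Convert a feature.pb file into a YT8M-style SequenceExample proto."""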
sequence_example = open(filename, 'rb').read()
audio, rgb = parse_exmp(sequence_example)
tmp_example = make_exmp('video', audio, rgb)
decoded = tf.train.SequenceExample.FromString(tmp_example)
return decoded
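# Sanity-check script: read one frame-level tfrecord, decode the per-frame rgb/audio
# bytes, and print how many frames the first video has.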
import tensorflow as tf
import numpy as np
frame_lvl_record = "test0000.tfrecord"
feat_rgb = []
feat_audio = []
for example in tf.python_io.tf_record_iterator(frame_lvl_record):
tf_seq_example = tf.train.SequenceExample.FromString(example)
test = tf_seq_example.SerializeToString()
n_frames = len(tf_seq_example.feature_lists.feature_list['audio'].feature)
sess = tf.InteractiveSession()
rgb_frame = []
audio_frame = []
# iterate through frames
for i in range(n_frames):
rgb_frame.append(tf.cast(tf.decode_raw(
tf_seq_example.feature_lists.feature_list['rgb']
.feature[i].bytes_list.value[0], tf.uint8)
, tf.float32).eval())
audio_frame.append(tf.cast(tf.decode_raw(
tf_seq_example.feature_lists.feature_list['audio']
.feature[i].bytes_list.value[0], tf.uint8)
, tf.float32).eval())
sess.close()
feat_audio.append(audio_frame)
feat_rgb.append(rgb_frame)
break
print('The first video has %d frames' % len(feat_rgb[0]))
\ No newline at end of file
Index
3
7
8
11
12
17
18
19
21
22
23
28
31
30
32
33
34
41
43
45
46
48
53
54
52
55
58
59
60
61
65
68
73
71
74
75
76
77
80
83
90
88
89
92
95
100
101
99
104
105
109
113
112
115
116
118
120
121
123
125
127
131
128
129
130
137
141
143
145
148
152
151
156
155
158
160
164
163
169
170
172
171
173
174
175
176
178
182
184
186
188
187
192
191
190
194
197
196
198
201
202
200
199
205
204
209
207
206
210
213
214
220
218
217
226
227
231
232
229
233
235
237
244
240
249
246
248
239
250
245
255
253
256
261
259
263
262
266
267
268
269
271
276
273
277
274
278
279
280
288
291
295
294
293
297
296
300
299
303
302
304
305
313
307
311
310
312
316
318
321
322
331
333
329
330
334
343
349
340
344
348
358
347
359
355
361
360
364
365
368
369
366
370
374
380
373
385
384
388
389
382
393
381
390
394
399
397
396
402
400
398
401
405
406
410
408
416
415
419
422
414
421
424
429
418
427
434
428
435
430
441
439
437
443
440
442
445
446
448
454
444
453
455
451
452
458
460
465
457
463
462
461
464
469
468
472
473
471
475
474
477
485
491
488
482
490
496
494
483
495
493
507
501
499
503
498
514
504
502
506
508
511
527
526
532
513
519
525
518
528
522
523
535
539
540
533
521
541
547
550
544
549
551
554
543
548
557
560
552
559
563
565
567
555
576
568
564
573
581
580
572
571
584
590
585
587
588
592
598
597
599
603
600
604
605
614
602
610
608
611
612
613
617
620
607
624
627
625
631
629
638
632
634
644
641
642
646
652
647
637
661
635
658
648
663
668
664
656
666
671
683
675
669
676
667
691
685
673
688
702
684
679
694
686
689
680
693
703
697
698
692
705
706
712
711
709
710
726
713
721
720
715
717
730
728
723
716
722
718
732
724
736
725
742
727
735
740
748
738
746
751
749
752
754
760
763
756
758
766
764
757
780
767
769
771
786
785
781
787
778
783
792
791
795
788
805
802
801
793
796
804
803
797
814
813
789
808
818
816
817
811
820
826
829
824
821
825
822
835
833
843
823
827
830
832
837
852
844
841
812
847
862
869
860
838
870
846
858
854
880
876
857
859
877
871
855
875
861
867
892
898
888
884
887
891
906
900
878
885
883
901
903
907
930
897
914
917
910
905
909
933
932
922
913
923
931
911
937
918
955
915
944
952
945
948
946
970
974
958
925
979
942
965
975
950
982
940
973
962
972
957
984
983
964
1007
971
981
954
993
991
996
1005
1015
1009
995
986
1000
985
980
1016
1011
999
1002
994
1013
1010
992
1008
1036
1025
1012
990
1037
1040
1031
1019
1052
1001
1055
1032
1069
1058
1014
1023
1030
1061
1035
1034
1053
1045
1046
1067
1060
1049
1056
1074
1066
1044
1038
1073
1077
1068
1057
1072
1104
1083
1089
1087
1099
1076
1086
1098
1094
1095
1096
1101
1107
1105
1117
1093
1106
1122
1119
1103
1128
1120
1126
1102
1115
1124
1123
1131
1136
1144
1121
1137
1132
1133
1157
1134
1143
1159
1164
1155
1142
1150
1148
1161
1165
1147
1162
1152
1174
1160
1166
1190
1175
1167
1156
1180
1171
1179
1172
1186
1188
1201
1177
1208
1183
1189
1192
1209
1214
1197
1168
1202
1205
1203
1199
1219
1217
1187
1206
1210
1241
1221
1218
1223
1236
1212
1237
1195
1216
1247
1234
1240
1257
1224
1243
1259
1242
1282
1222
1254
1227
1235
1269
1258
1290
1275
1262
1252
1248
1272
1246
1225
1245
1277
1298
1288
1271
1265
1286
1260
1266
1296
1280
1285
1293
1276
1287
1289
1261
1264
1295
1291
1283
1311
1303
1330
1315
1300
1333
1307
1325
1334
1316
1314
1317
1310
1329
1324
1339
1346
1342
1352
1321
1376
1366
1308
1345
1348
1386
1383
1372
1367
1400
1382
1375
1392
1380
1371
1393
1389
1353
1387
1374
1379
1381
1359
1360
1396
1399
1365
1424
1373
1411
1401
1397
1395
1412
1394
1368
1423
1391
1435
1409
1443
1402
1425
1415
1421
1426
1433
1420
1452
1436
1430
1408
1458
1429
1453
1454
1447
1472
1486
1468
1461
1467
1484
1457
1444
1450
1451
1459
1462
1449
1476
1470
1471
1498
1488
1442
1480
1456
1466
1505
1517
1464
1503
1490
1519
1481
1493
1463
1532
1487
1501
1500
1495
1509
1535
1506
1521
1580
1540
1502
1520
1496
1569
1515
1489
1507
1527
1545
1560
1510
1514
1526
1594
1511
1572
1548
1584
1556
1588
1628
1555
1568
1550
1622
1563
1603
1616
1576
1549
1537
1593
1618
1645
1624
1617
1634
1595
1597
1590
1632
1575
1559
1625
1615
1591
1630
1608
1621
1589
1646
1643
1652
1627
1611
1626
1613
1639
1655
1620
1602
1651
1653
1669
1638
1696
1649
1675
1660
1683
1666
1671
1703
1716
1637
1672
1676
1692
1711
1680
1641
1688
1708
1704
1690
1674
1718
1699
1723
1756
1700
1662
1715
1657
1733
1728
1670
1712
1685
1724
1735
1714
1730
1747
1656
1737
1705
1693
1713
1689
1753
1739
1721
1725
1749
1732
1743
1731
1767
1738
1831
1771
1726
1746
1776
1775
1799
1774
1780
1781
1769
1805
1788
1801
import nltk
import gensim
import pandas as pd
# Load files.
nltk.download('stopwords')
vocab = pd.read_csv('../vocabulary.csv')
# Lowercase the corpus and remove the parenthesized suffix from each name.
vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ')
for i in range(vocab['Name'].__len__()):
name = vocab['Name'][i]
if isinstance(name, str) and name.find(" (") != -1:
vocab['Name'][i] = name[:name.find(" (")]
vocab['Name'] = vocab['Name'].str.lower()
# Combine multi-word names (e.g. mobile phone -> mobile-phone).
for name in vocab['Name']:
if isinstance(name, str) and name.find(" ") != -1:
combined_name = name.replace(" ", "-")
for i in range(vocab['WikiDescription'].__len__()):
if isinstance(vocab['WikiDescription'][i], str):
vocab['WikiDescription'][i] = vocab['WikiDescription'][i].replace(name, combined_name)
# Remove stopwords from corpus.
stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
vocab['WikiDescription'] = vocab['WikiDescription'].str.replace(stop_re, '')
vocab['WikiDescription'] = vocab['WikiDescription'].str.split()
# Tokenize corpus.
tokenlist = [x for x in vocab['WikiDescription'] if str(x) != 'nan']
phrases = gensim.models.phrases.Phrases(tokenlist)
phraser = gensim.models.phrases.Phraser(phrases)
vocab_phrased = phraser[tokenlist]
# Vectorize tags.
w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, min_count=1)
w2v.save('tag_vectors.model')
# word_vectors = w2v.wv
# vocabs = word_vectors.vocab.keys()
# word_vectors_list = [word_vectors[v] for v in vocabs]
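# Usage sketch ('mobile-phone' is just an example tag):
# tag_vectors = gensim.models.word2vec.Word2Vec.load('tag_vectors.model').wv
# tag_vectors.most_similar('mobile-phone', topn=5)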
from gensim.models import Word2Vec
import numpy as np
def recommend_videos(tags, tag_model_path, video_model_path, top_k):
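    """Combine the score-weighted tag vectors and return the top_k most similar video ids."""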
tag_vectors = Word2Vec.load(tag_model_path).wv
video_vectors = Word2Vec().wv.load(video_model_path)
error_tags = []
video_vector = np.zeros(100)
for (tag, weight) in tags:
if tag in tag_vectors.index_to_key:
video_vector = video_vector + (tag_vectors[tag] * float(weight))
else:
# Pass if tag is unknown
if tag not in error_tags:
error_tags.append(tag)
similar_ids = [x[0] for x in video_vectors.similar_by_vector(video_vector, top_k)]
return similar_ids
import requests
import pandas as pd
base_URL = 'https://data.yt8m.org/2/j/i/'
youtube_url = 'https://www.youtube.com/watch?v='
def getURL(vid_id):
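    """Resolve a YT8M pseudo video id to its YouTube watch URL via data.yt8m.org."""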
URL = base_URL + vid_id[:-2] + '/' + vid_id + '.js'
response = requests.get(URL, verify = False)
if response.status_code == 200:
return youtube_url + response.text[10:-3]
def getVideoInfo(vid_id, video_tags_path, top_k):
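    """Return the YouTube URL and the top_k segment tags for a recommended video id."""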
video_url = getURL(vid_id)
entire_video_tags = pd.read_csv(video_tags_path)
video_tags_info = entire_video_tags.loc[entire_video_tags["vid_id"] == vid_id]
video_tags = []
for i in range(1, top_k + 1):
video_tag_tuple = video_tags_info["segment" + str(i)].values[0] # ex: "mobile-phone:0.361"
video_tags.append(video_tag_tuple.split(":")[0])
return {
"video_url": video_url,
"video_tags": video_tags
}
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
BATCH_SIZE = 1000
def vectorization_video():
print('[0.1 0.2]')
if __name__ == '__main__':
tag_vectors = Word2Vec.load("tag_vectors.model").wv
video_vectors = Word2Vec().wv # Empty model
# Load video recommendation tags.
video_tags = pd.read_csv('statics/kaggle_solution_40k.csv')
# Define batch variables.
batch_video_ids = []
batch_video_vectors = []
error_tags = []
for i, row in video_tags.iterrows():
video_id = row[0]
video_vector = np.zeros(100)
for segment_index in range(1, 6):
tag, weight = row[segment_index].split(":")
if tag in tag_vectors.vocab:
video_vector = video_vector + (tag_vectors[tag] * float(weight))
else:
# Pass if tag is unknown
if tag not in error_tags:
error_tags.append(tag)
batch_video_ids.append(video_id)
batch_video_vectors.append(video_vector)
# Add video vectors.
if (i+1) % BATCH_SIZE == 0:
video_vectors.add(batch_video_ids, batch_video_vectors)
batch_video_ids = []
batch_video_vectors = []
print("Video vectors created: ", i+1)
# Add rest of video vectors.
video_vectors.add(batch_video_ids, batch_video_vectors)
print("error tags: ")
print(error_tags)
video_vectors.save("video_vectors.model")
# Usage
# video_vectors = Word2Vec().wv.load("video_vectors.model")
# video_vectors.most_similar("XwFj", topn=5)