Showing 12 changed files with 1995 additions and 0 deletions.
code/yolov3/args.py
0 → 100644
+from __future__ import division, print_function
+
+import numpy as np
+import tensorflow as tf
+import random
+import math
+
+from misc_utils import parse_anchors, read_class_names
+from tfrecord_utils import TFRecordIterator
+
+### Some paths
+data_path = '../../data/'
+train_file = data_path + 'train.tfrecord'  # The path of the training TFRecord file.
+val_file = data_path + 'val.tfrecord'  # The path of the validation TFRecord file.
+restore_path = data_path + 'darknet_weights/yolov3.ckpt'  # The path of the weights to restore.
+save_dir = '../../checkpoint/'  # The directory of the weights to save.
+
+### We are not using tensorboard logs in this code
+
+log_dir = data_path + 'logs/'  # The directory to store the tensorboard log files.
+progress_log_path = data_path + 'progress.log'  # The path to record the training progress.
+
+anchor_path = data_path + 'yolo_anchors.txt'  # The path of the anchor txt file.
+class_name_path = data_path + 'classes.txt'  # The path of the class names.
+
+### Training related numbers
+batch_size = 6
+img_size = [416, 416]  # Images will be resized to `img_size` and fed to the network, size format: [width, height]
+letterbox_resize = True  # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+total_epoches = 50
+train_evaluation_step = 10  # Evaluate on the training batch after some steps.
+val_evaluation_epoch = 2  # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch.
+save_epoch = 5  # Save the model after some epochs.
+batch_norm_decay = 0.99  # Decay used in batch norm ops.
+weight_decay = 5e-4  # L2 weight decay.
+global_step = 0  # Used when resuming training.
+
+### tf.data parameters
+num_threads = 10  # Number of threads for image processing used in the tf.data pipeline.
+prefetech_buffer = 5  # Prefetch buffer size used in the tf.data pipeline.
+
+### Learning rate and optimizer
+optimizer_name = 'momentum'  # Chosen from [sgd, momentum, adam, rmsprop]
+save_optimizer = True  # Whether to save the optimizer parameters into the checkpoint file.
+learning_rate_init = 1e-4
+lr_type = 'piecewise'  # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]
+lr_decay_epoch = 5  # Epochs after which the learning rate decays. Int or float. Used with the `exponential` and `cosine_decay_restart` lr_type.
+lr_decay_factor = 0.96  # The learning rate decay factor. Used with the `exponential` lr_type.
+lr_lower_bound = 1e-6  # The minimum learning rate.
+# Only used with the piecewise lr type:
+pw_boundaries = [30, 50]  # epoch based boundaries
+pw_values = [learning_rate_init, 3e-5, 1e-5]
+
+### Load and finetune
+# Choose the parts whose weights you want to restore. List form.
+# restore_include: None, restore_exclude: None  => restore the whole model
+# restore_include: None, restore_exclude: scope => restore the whole model except `scope`
+# restore_include: scope1, restore_exclude: scope2 => if scope1 contains scope2, restore scope1 without scope2 (scope1 - scope2)
+# Choice 1: only restore the darknet body
+# restore_include = ['yolov3/darknet53_body']
+# restore_exclude = None
+# Choice 2: restore all layers except the last conv2d layer of each of the 3 scales
+restore_include = None
+restore_exclude = ['yolov3/yolov3_head/Conv_14', 'yolov3/yolov3_head/Conv_6', 'yolov3/yolov3_head/Conv_22']
+# Choose the parts you want to finetune. List form.
+# Set to None to train the whole model.
+
+update_part = ['yolov3/yolov3_head']
+
+### Other training strategies
+multi_scale_train = True  # Whether to apply the multi-scale training strategy. Image size varies from [320, 320] to [640, 640] by default.
+use_label_smooth = True  # Whether to use the class label smoothing strategy.
+use_focal_loss = True  # Whether to apply focal loss on the conf loss.
+use_mix_up = True  # Whether to use the mix-up data augmentation strategy.
+use_warm_up = True  # Whether to use the warm-up strategy to prevent gradient explosion.
+warm_up_epoch = 3  # Warm-up training epochs. Set to a larger value if the gradient explodes.
+
+### Some constants in validation
+# nms
+nms_threshold = 0.45  # IoU threshold in the nms operation.
+score_threshold = 0.01  # Threshold of the class probability in the nms operation, i.e. score = pred_confs * pred_probs. Set lower for higher recall.
+nms_topk = 150  # Keep at most nms_topk outputs after nms.
+# mAP eval
+eval_threshold = 0.5  # The IoU threshold applied in mAP evaluation.
+use_voc_07_metric = False  # Whether to use the VOC 2007 evaluation metric, i.e. the 11-point metric.
+
+### Parse some params
+anchors = parse_anchors(anchor_path)
+classes = read_class_names(class_name_path)
+class_num = len(classes)
+train_img_cnt = TFRecordIterator(train_file, 'GZIP').count()
+val_img_cnt = TFRecordIterator(val_file, 'GZIP').count()
+train_batch_num = int(math.ceil(float(train_img_cnt) / batch_size))
+
+lr_decay_freq = int(train_batch_num * lr_decay_epoch)
+# convert the epoch-based piecewise boundaries into global training steps
+pw_boundaries = [float(i) * train_batch_num + global_step for i in pw_boundaries]
code/yolov3/data_utils.py
0 → 100644
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import cv2
+import sys
+import random
+
+PY_VERSION = sys.version_info[0]
+iter_cnt = 0
+
+FEATURE_DESCRIPTION = {
+    'index': tf.FixedLenFeature([], tf.int64),
+    'image': tf.FixedLenFeature([], tf.string),
+    'width': tf.FixedLenFeature([], tf.int64),
+    'height': tf.FixedLenFeature([], tf.int64),
+    'boxes': tf.VarLenFeature(tf.int64)
+}
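+# Each record stores one image plus its annotations: `boxes` is a flat int64
+# list of [label, x_min, y_min, x_max, y_max] repeated once per object, which
+# is why the parsers below assert len(boxes) % 5 == 0.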
+
+def parse_tfrecord(data):
+    # TFRecord parser for serialized examples handed over as raw bytes
+    # (e.g. through tf.py_func in the tf.data pipeline); we parse the
+    # protobuf directly so plain Python/NumPy values come out.
+    example = tf.train.Example()
+    example.ParseFromString(bytes(data))
+    features = example.features.feature
+    index = int(features['index'].int64_list.value[0])
+    encoded_image = np.frombuffer(features['image'].bytes_list.value[0], dtype=np.uint8)
+    width = int(features['width'].int64_list.value[0])
+    height = int(features['height'].int64_list.value[0])
+    boxes = features['boxes'].int64_list.value
+
+    assert len(boxes) % 5 == 0, 'Annotation error occurred in box array.'
+    box_cnt = len(boxes) // 5
+
+    aligned_boxes = []
+    labels = []
+
+    for i in range(box_cnt):
+        # coordinates are stored as int64 in the record; cast them to float
+        # since the augmentation code below works with float boxes
+        label = int(boxes[i * 5])
+        x_min, y_min, x_max, y_max = [float(boxes[i * 5 + j]) for j in range(1, 5)]
+        aligned_boxes.append([x_min, y_min, x_max, y_max])
+        labels.append(label)
+
+    aligned_boxes = np.asarray(aligned_boxes, np.float32)
+    labels = np.asarray(labels, np.int64)
+
+    return index, encoded_image, aligned_boxes, labels, width, height
+
+def parse_record(features):
+    # TFRecord parser for TFRecordIterator (already-decoded feature dict)
+
+    index = int(features['index'])
+    encoded_image = np.frombuffer(features['image'], dtype=np.uint8)
+    width = int(features['width'])
+    height = int(features['height'])
+    boxes = features['boxes']
+
+    assert len(boxes) % 5 == 0, 'Annotation error occurred in box array.'
+    box_cnt = len(boxes) // 5
+
+    aligned_boxes = []
+    labels = []
+
+    for i in range(box_cnt):
+        label = int(boxes[i * 5])
+        x_min, y_min, x_max, y_max = [float(boxes[i * 5 + j]) for j in range(1, 5)]
+        aligned_boxes.append([x_min, y_min, x_max, y_max])
+        labels.append(label)
+
+    aligned_boxes = np.asarray(aligned_boxes, np.float32)
+    labels = np.asarray(labels, np.int64)
+
+    return index, encoded_image, aligned_boxes, labels, width, height
+
+def bbox_crop(bbox, crop_box=None, allow_outside_center=True):
+    bbox = bbox.copy()
+    if crop_box is None:
+        return bbox
+    if not len(crop_box) == 4:
+        raise ValueError(
+            "Invalid crop_box parameter, requires length 4, given {}".format(str(crop_box)))
+    if sum([int(c is None) for c in crop_box]) == 4:
+        return bbox
+
+    l, t, w, h = crop_box
+
+    left = l if l else 0
+    top = t if t else 0
+    right = left + (w if w else np.inf)
+    bottom = top + (h if h else np.inf)
+    crop_bbox = np.array((left, top, right, bottom))
+
+    if allow_outside_center:
+        mask = np.ones(bbox.shape[0], dtype=bool)
+    else:
+        centers = (bbox[:, :2] + bbox[:, 2:4]) / 2
+        mask = np.logical_and(crop_bbox[:2] <= centers, centers < crop_bbox[2:]).all(axis=1)
+
+    # transform borders: clip to the crop window, then shift to crop coordinates
+    bbox[:, :2] = np.maximum(bbox[:, :2], crop_bbox[:2])
+    bbox[:, 2:4] = np.minimum(bbox[:, 2:4], crop_bbox[2:4])
+    bbox[:, :2] -= crop_bbox[:2]
+    bbox[:, 2:4] -= crop_bbox[:2]
+
+    # drop boxes that collapsed to zero width/height (or lost their center)
+    mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:4]).all(axis=1))
+    bbox = bbox[mask]
+    return bbox
+
+def bbox_iou(bbox_a, bbox_b, offset=0):
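+    # Pairwise IoU between two corner-format box sets: bbox_a is [N, 4+],
+    # bbox_b is [M, 4+]; returns an [N, M] matrix of overlap ratios.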
+    if bbox_a.shape[1] < 4 or bbox_b.shape[1] < 4:
+        raise IndexError("Bounding boxes axis 1 must have at least length 4")
+
+    tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
+    br = np.minimum(bbox_a[:, None, 2:4], bbox_b[:, 2:4])
+
+    area_i = np.prod(br - tl + offset, axis=2) * (tl < br).all(axis=2)
+    area_a = np.prod(bbox_a[:, 2:4] - bbox_a[:, :2] + offset, axis=1)
+    area_b = np.prod(bbox_b[:, 2:4] - bbox_b[:, :2] + offset, axis=1)
+    return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1,
+                                 max_aspect_ratio=2, constraints=None,
+                                 max_trial=50):
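+    # SSD-style constrained sampling: for each (min_iou, max_iou) constraint,
+    # sample up to `max_trial` candidate crops and keep the first one whose
+    # IoU with every ground-truth box satisfies the constraint; one surviving
+    # candidate is then picked at random below.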
+    # default params in paper
+    if constraints is None:
+        constraints = (
+            (0.1, None),
+            (0.3, None),
+            (0.5, None),
+            (0.7, None),
+            (0.9, None),
+            (None, 1),
+        )
+
+    w, h = size
+
+    candidates = [(0, 0, w, h)]
+    for min_iou, max_iou in constraints:
+        min_iou = -np.inf if min_iou is None else min_iou
+        max_iou = np.inf if max_iou is None else max_iou
+
+        for _ in range(max_trial):
+            scale = random.uniform(min_scale, max_scale)
+            aspect_ratio = random.uniform(
+                max(1 / max_aspect_ratio, scale * scale),
+                min(max_aspect_ratio, 1 / (scale * scale)))
+            crop_h = int(h * scale / np.sqrt(aspect_ratio))
+            crop_w = int(w * scale * np.sqrt(aspect_ratio))
+
+            # randint (inclusive) also handles the zero-margin case where the
+            # sampled crop spans the full image; randrange(0) would raise
+            crop_t = random.randint(0, h - crop_h)
+            crop_l = random.randint(0, w - crop_w)
+            crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h))
+
+            if len(bbox) == 0:
+                top, bottom = crop_t, crop_t + crop_h
+                left, right = crop_l, crop_l + crop_w
+                return bbox, (left, top, right - left, bottom - top)
+
+            iou = bbox_iou(bbox, crop_bb[np.newaxis])
+            if min_iou <= iou.min() and iou.max() <= max_iou:
+                top, bottom = crop_t, crop_t + crop_h
+                left, right = crop_l, crop_l + crop_w
+                candidates.append((left, top, right - left, bottom - top))
+                break
+
+    # randomly select one candidate crop
+    while candidates:
+        crop = candidates.pop(np.random.randint(0, len(candidates)))
+        new_bbox = bbox_crop(bbox, crop, allow_outside_center=False)
+        if new_bbox.size < 1:
+            continue
+        new_crop = (crop[0], crop[1], crop[2], crop[3])
+        return new_bbox, new_crop
+    return bbox, (0, 0, w, h)
+
+
+def random_color_distort(img, brightness_delta=32, hue_vari=18, sat_vari=0.5, val_vari=0.5):
+    def random_hue(img_hsv, hue_vari, p=0.5):
+        if np.random.uniform(0, 1) > p:
+            hue_delta = np.random.randint(-hue_vari, hue_vari)
+            img_hsv[:, :, 0] = (img_hsv[:, :, 0] + hue_delta) % 180
+        return img_hsv
+
+    def random_saturation(img_hsv, sat_vari, p=0.5):
+        if np.random.uniform(0, 1) > p:
+            sat_mult = 1 + np.random.uniform(-sat_vari, sat_vari)
+            img_hsv[:, :, 1] *= sat_mult
+        return img_hsv
+
+    def random_value(img_hsv, val_vari, p=0.5):
+        if np.random.uniform(0, 1) > p:
+            val_mult = 1 + np.random.uniform(-val_vari, val_vari)
+            img_hsv[:, :, 2] *= val_mult
+        return img_hsv
+
+    def random_brightness(img, brightness_delta, p=0.5):
+        if np.random.uniform(0, 1) > p:
+            img = img.astype(np.float32)
+            brightness_delta = int(np.random.uniform(-brightness_delta, brightness_delta))
+            img = img + brightness_delta
+        return np.clip(img, 0, 255)
+
+    # brightness
+    img = random_brightness(img, brightness_delta)
+    img = img.astype(np.uint8)
+
+    # color jitter
+    img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float32)
+
+    if np.random.randint(0, 2):
+        img_hsv = random_value(img_hsv, val_vari)
+        img_hsv = random_saturation(img_hsv, sat_vari)
+        img_hsv = random_hue(img_hsv, hue_vari)
+    else:
+        img_hsv = random_saturation(img_hsv, sat_vari)
+        img_hsv = random_hue(img_hsv, hue_vari)
+        img_hsv = random_value(img_hsv, val_vari)
+
+    img_hsv = np.clip(img_hsv, 0, 255)
+    img = cv2.cvtColor(img_hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
+
+    return img
+
+
+def letterbox_resize(img, new_width, new_height, interp=0):
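+    # Aspect-preserving resize: scale the image to fit inside
+    # (new_width, new_height), pad the borders with gray (128), and return
+    # the ratio/offsets needed to map box coordinates into the padded image.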
+    ori_height, ori_width = img.shape[:2]
+
+    resize_ratio = min(new_width / ori_width, new_height / ori_height)
+
+    resize_w = int(resize_ratio * ori_width)
+    resize_h = int(resize_ratio * ori_height)
+
+    img = cv2.resize(img, (resize_w, resize_h), interpolation=interp)
+    image_padded = np.full((new_height, new_width, 3), 128, np.uint8)
+
+    dw = int((new_width - resize_w) / 2)
+    dh = int((new_height - resize_h) / 2)
+
+    image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img
+
+    return image_padded, resize_ratio, dw, dh
+
+
+def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False):
+    if letterbox:
+        image_padded, resize_ratio, dw, dh = letterbox_resize(img, new_width, new_height, interp)
+
+        # xmin, xmax
+        bbox[:, [0, 2]] = bbox[:, [0, 2]] * resize_ratio + dw
+        # ymin, ymax
+        bbox[:, [1, 3]] = bbox[:, [1, 3]] * resize_ratio + dh
+
+        return image_padded, bbox
+    else:
+        ori_height, ori_width = img.shape[:2]
+
+        img = cv2.resize(img, (new_width, new_height), interpolation=interp)
+
+        # xmin, xmax
+        bbox[:, [0, 2]] = bbox[:, [0, 2]] / ori_width * new_width
+        # ymin, ymax
+        bbox[:, [1, 3]] = bbox[:, [1, 3]] / ori_height * new_height
+
+        return img, bbox
+
+
+def random_flip(img, bbox, px=0, py=0):
+    height, width = img.shape[:2]
+    if np.random.uniform(0, 1) < px:
+        img = cv2.flip(img, 1)
+        xmax = width - bbox[:, 0]
+        xmin = width - bbox[:, 2]
+        bbox[:, 0] = xmin
+        bbox[:, 2] = xmax
+
+    if np.random.uniform(0, 1) < py:
+        img = cv2.flip(img, 0)
+        ymax = height - bbox[:, 1]
+        ymin = height - bbox[:, 3]
+        bbox[:, 1] = ymin
+        bbox[:, 3] = ymax
+    return img, bbox
+
+
+def random_expand(img, bbox, max_ratio=4, fill=0, keep_ratio=True):
+    h, w, c = img.shape
+    ratio_x = random.uniform(1, max_ratio)
+    if keep_ratio:
+        ratio_y = ratio_x
+    else:
+        ratio_y = random.uniform(1, max_ratio)
+
+    oh, ow = int(h * ratio_y), int(w * ratio_x)
+    off_y = random.randint(0, oh - h)
+    off_x = random.randint(0, ow - w)
+
+    dst = np.full(shape=(oh, ow, c), fill_value=fill, dtype=img.dtype)
+
+    dst[off_y:off_y + h, off_x:off_x + w, :] = img
+
+    # correct bbox
+    bbox[:, :2] += (off_x, off_y)
+    bbox[:, 2:4] += (off_x, off_y)
+
+    return dst, bbox
+
+def process_box(boxes, labels, img_size, class_num, anchors):
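+    # Build the y_true target tensors for the three detection scales: each
+    # ground-truth box is assigned to the anchor prior with the best
+    # width/height IoU, which determines the feature map (13/26/52) and the
+    # anchor slot that carry its regression and class targets.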
+    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+
+    # convert boxes form:
+    # shape: [N, 2]
+    # (x_center, y_center)
+    box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
+    # (width, height)
+    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]
+
+    # [13, 13, 3, 5 + num_class + 1]: `5` is 4 box coords plus 1 objectness conf, the extra `1` is the mix-up weight
+    y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
+    y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
+    y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)
+
+    # mix up weight default to 1.
+    y_true_13[..., -1] = 1.
+    y_true_26[..., -1] = 1.
+    y_true_52[..., -1] = 1.
+
+    y_true = [y_true_13, y_true_26, y_true_52]
+
+    # [N, 1, 2]
+    box_sizes = np.expand_dims(box_sizes, 1)
+    # broadcast tricks
+    # [N, 1, 2] & [9, 2] ==> [N, 9, 2]
+    mins = np.maximum(- box_sizes / 2, - anchors / 2)
+    maxs = np.minimum(box_sizes / 2, anchors / 2)
+    # [N, 9, 2]
+    whs = maxs - mins
+
+    # [N, 9]
+    iou = (whs[:, :, 0] * whs[:, :, 1]) / (
+        box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1]
+        - whs[:, :, 0] * whs[:, :, 1] + 1e-10)
+    # [N]
+    best_match_idx = np.argmax(iou, axis=1)
+
+    ratio_dict = {1.: 8., 2.: 16., 3.: 32.}
+    for i, idx in enumerate(best_match_idx):
+        # idx: 0,1,2 ==> 2; 3,4,5 ==> 1; 6,7,8 ==> 0
+        feature_map_group = 2 - idx // 3
+        # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32
+        ratio = ratio_dict[np.ceil((idx + 1) / 3.)]
+        x = int(np.floor(box_centers[i, 0] / ratio))
+        y = int(np.floor(box_centers[i, 1] / ratio))
+        k = anchors_mask[feature_map_group].index(idx)
+        c = labels[i]
+        # print(feature_map_group, '|', y, x, k, c)
+
+        y_true[feature_map_group][y, x, k, :2] = box_centers[i]
+        y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i]
+        y_true[feature_map_group][y, x, k, 4] = 1.
+        y_true[feature_map_group][y, x, k, 5 + c] = 1.
+        y_true[feature_map_group][y, x, k, -1] = boxes[i, -1]
+
+    return y_true_13, y_true_26, y_true_52
+
+
+def parse_data(data, class_num, img_size, anchors, is_training, letterbox_resize):
+
+    img_idx, encoded_img, boxes, labels, _, _ = parse_tfrecord(data)
+    img = cv2.imdecode(encoded_img, cv2.IMREAD_COLOR)
+    # append the mix-up weight column (defaults to 1.) to the boxes
+    boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
+
+    # NOTE: the mix-up augmentation was removed here.
+
+    if is_training:
+        # random color distortion
+        img = random_color_distort(img)
+
+        # random expansion with prob 0.5
+        if np.random.uniform(0, 1) > 0.5:
+            img, boxes = random_expand(img, boxes, 4)
+
+        # random cropping
+        h, w, _ = img.shape
+        boxes, crop = random_crop_with_constraints(boxes, (w, h))
+        x0, y0, w, h = crop
+        img = img[y0: y0 + h, x0: x0 + w]
+
+        # resize with random interpolation
+        h, w, _ = img.shape
+        interp = np.random.randint(0, 5)
+        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=interp, letterbox=letterbox_resize)
+
+        # random horizontal flip
+        h, w, _ = img.shape
+        img, boxes = random_flip(img, boxes, px=0.5)
+    else:
+        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=1, letterbox=letterbox_resize)
+
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
+
+    # the input of yolo_v3 should be in range 0~1
+    img = img / 255.
+
+    y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)
+
+    return img_idx, img, y_true_13, y_true_26, y_true_52
+
+
+def get_batch_data(records, class_num, img_size, anchors, is_training, multi_scale=False, mix_up=False, letterbox_resize=True, interval=10):
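+    # Invoked through tf.py_func: `records` is a batch of serialized
+    # tf.train.Example strings, and the outputs are plain NumPy batches.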
+    global iter_cnt
+
+    # multi_scale training
+    if multi_scale and is_training:
+        random.seed(iter_cnt // interval)
+        random_img_size = [[x * 32, x * 32] for x in range(10, 20)]
+        img_size = random.sample(random_img_size, 1)[0]
+        iter_cnt += 1
+
+    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], []
+
+    # deleted mix up strategy
+
+    for data in records:
+        img_idx, img, y_true_13, y_true_26, y_true_52 = parse_data(data, class_num, img_size, anchors, is_training, letterbox_resize)
+
+        img_idx_batch.append(img_idx)
+        img_batch.append(img)
+        y_true_13_batch.append(y_true_13)
+        y_true_26_batch.append(y_true_26)
+        y_true_52_batch.append(y_true_52)
+
+    img_idx_batch = np.asarray(img_idx_batch, np.int64)
+    img_batch = np.asarray(img_batch)
+    y_true_13_batch = np.asarray(y_true_13_batch)
+    y_true_26_batch = np.asarray(y_true_26_batch)
+    y_true_52_batch = np.asarray(y_true_52_batch)
+
+    return img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch
\ No newline at end of file
code/yolov3/eval.py
0 → 100644
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import argparse
+from tqdm import trange
+import os
+
+from data_utils import get_batch_data
+from misc_utils import parse_anchors, read_class_names, AverageMeter
+from eval_utils import evaluate_on_cpu, evaluate_on_gpu, get_preds_gpu, voc_eval, parse_gt_rec
+from nms_utils import gpu_nms
+from tfrecord_utils import TFRecordIterator
+
+from model import yolov3
+
+### ArgumentParser
+parser = argparse.ArgumentParser(description="YOLO-V3 eval procedure.")
+
+# paths
+parser.add_argument("--eval_file", type=str, default="./data/my_data/val.txt",
+                    help="The path of the validation or test TFRecord file.")
+
+parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
+                    help="The path of the weights to restore.")
+
+parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
+                    help="The path of the anchor txt file.")
+
+parser.add_argument("--class_name_path", type=str, default="./data/coco.names",
+                    help="The path of the class names.")
+
+# some numbers
+parser.add_argument("--img_size", nargs='*', type=int, default=[416, 416],
+                    help="Resize the input image to `img_size`, size format: [width, height]")
+
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to use the letterbox resize, i.e., keep the original image aspect ratio.")
+
+parser.add_argument("--num_threads", type=int, default=10,
+                    help="Number of threads for image processing used in the tf.data pipeline.")
+
+parser.add_argument("--prefetech_buffer", type=int, default=5,
+                    help="Prefetch buffer size used in the tf.data pipeline.")
+
+parser.add_argument("--nms_threshold", type=float, default=0.45,
+                    help="IoU threshold in the nms operation.")
+
+parser.add_argument("--score_threshold", type=float, default=0.01,
+                    help="Threshold of the probability of the classes in the nms operation.")
+
+parser.add_argument("--nms_topk", type=int, default=400,
+                    help="Keep at most nms_topk outputs after nms.")
+
+parser.add_argument("--use_voc_07_metric", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to use the VOC 2007 mAP metric.")
+
+args = parser.parse_args()
+
+# args params
+args.anchors = parse_anchors(args.anchor_path)
+args.classes = read_class_names(args.class_name_path)
+args.class_num = len(args.classes)
+# the eval file is a GZIP-compressed TFRecord, so count records instead of text lines
+args.img_cnt = TFRecordIterator(args.eval_file, 'GZIP').count()
+
+# setting placeholders
+is_training = tf.placeholder(dtype=tf.bool, name="phase_train")
+handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')
+pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])
+pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])
+gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)
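+# The NMS graph is built once from these placeholders and then fed
+# per-image with the raw predictions inside the evaluation loop below.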
+
+### tf.data pipeline
+val_dataset = tf.data.TFRecordDataset(filenames=args.eval_file, compression_type='GZIP')
+val_dataset = val_dataset.batch(1)
+val_dataset = val_dataset.map(
+    lambda x: tf.py_func(get_batch_data, [x, args.class_num, args.img_size, args.anchors, False, False, False, args.letterbox_resize], [tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
+    num_parallel_calls=args.num_threads
+)
+# prefetch returns a new dataset; the result must be kept
+val_dataset = val_dataset.prefetch(args.prefetech_buffer)
+iterator = val_dataset.make_one_shot_iterator()
+
+image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()
+image_ids.set_shape([None])
+y_true = [y_true_13, y_true_26, y_true_52]
+image.set_shape([None, args.img_size[1], args.img_size[0], 3])
+for y in y_true:
+    y.set_shape([None, None, None, None, None])
+
+### Model definition
+yolo_model = yolov3(args.class_num, args.anchors)
+with tf.variable_scope('yolov3'):
+    pred_feature_maps = yolo_model.forward(image, is_training=is_training)
+loss = yolo_model.compute_loss(pred_feature_maps, y_true)
+y_pred = yolo_model.predict(pred_feature_maps)
+
+saver_to_restore = tf.train.Saver()
+
+with tf.Session() as sess:
+    sess.run([tf.global_variables_initializer()])
+    # checkpoint paths are prefixes rather than plain files, so test them
+    # with tf.train.checkpoint_exists instead of os.path.exists
+    if tf.train.checkpoint_exists(args.restore_path):
+        saver_to_restore.restore(sess, args.restore_path)
+
+    print('\nStart evaluation...\n')
+
+    val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \
+        AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()
+    val_preds = []
+
+    for j in trange(args.img_cnt):
+        __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss], feed_dict={is_training: False})
+        pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)
+
+        val_preds.extend(pred_content)
+        val_loss_total.update(__loss[0])
+        val_loss_xy.update(__loss[1])
+        val_loss_wh.update(__loss[2])
+        val_loss_conf.update(__loss[3])
+        val_loss_class.update(__loss[4])
+
+    rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
+    gt_dict = parse_gt_rec(args.eval_file, 'GZIP', args.img_size, args.letterbox_resize)
+    print('mAP eval:')
+    for ii in range(args.class_num):
+        npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=0.5, use_07_metric=args.use_voc_07_metric)
+        rec_total.update(rec, npos)
+        prec_total.update(prec, nd)
+        ap_total.update(ap, 1)
+        print('Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}'.format(ii, rec, prec, ap))
+
+    mAP = ap_total.average
+    print('final mAP: {:.4f}'.format(mAP))
+    print("recall: {:.3f}, precision: {:.3f}".format(rec_total.average, prec_total.average))
+    print("total_loss: {:.3f}, loss_xy: {:.3f}, loss_wh: {:.3f}, loss_conf: {:.3f}, loss_class: {:.3f}".format(
+        val_loss_total.average, val_loss_xy.average, val_loss_wh.average, val_loss_conf.average, val_loss_class.average
+    ))
\ No newline at end of file
code/yolov3/eval_utils.py
0 → 100644
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import cv2
+from collections import Counter
+
+from data_utils import parse_record
+from nms_utils import cpu_nms, gpu_nms
+from tfrecord_utils import TFRecordIterator
+
+
+def calc_iou(pred_boxes, true_boxes):
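+    # Broadcast IoU between pred_boxes [N, 4] and true_boxes [M, 4]
+    # (corner format); returns an [N, M] overlap matrix.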
+    pred_boxes = np.expand_dims(pred_boxes, -2)
+    true_boxes = np.expand_dims(true_boxes, 0)
+
+    intersect_mins = np.maximum(pred_boxes[..., :2], true_boxes[..., :2])
+    intersect_maxs = np.minimum(pred_boxes[..., 2:], true_boxes[..., 2:])
+    intersect_wh = np.maximum(intersect_maxs - intersect_mins, 0.)
+
+    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
+    pred_box_wh = pred_boxes[..., 2:] - pred_boxes[..., :2]
+    pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]
+    true_boxes_wh = true_boxes[..., 2:] - true_boxes[..., :2]
+    true_boxes_area = true_boxes_wh[..., 0] * true_boxes_wh[..., 1]
+
+    iou = intersect_area / (pred_box_area + true_boxes_area - intersect_area + 1e-10)
+
+    return iou
+
+
+def evaluate_on_cpu(y_pred, y_true, num_classes, calc_now=True, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):
+    num_images = y_true[0].shape[0]
+    true_labels_dict = {i: 0 for i in range(num_classes)}
+    pred_labels_dict = {i: 0 for i in range(num_classes)}
+    true_positive_dict = {i: 0 for i in range(num_classes)}
+
+    for i in range(num_images):
+        true_labels_list, true_boxes_list = [], []
+        for j in range(3):
+            true_probs_temp = y_true[j][i][..., 5:-1]
+            true_boxes_temp = y_true[j][i][..., 0:4]
+
+            object_mask = true_probs_temp.sum(axis=-1) > 0
+
+            true_probs_temp = true_probs_temp[object_mask]
+            true_boxes_temp = true_boxes_temp[object_mask]
+
+            true_labels_list += np.argmax(true_probs_temp, axis=-1).tolist()
+            true_boxes_list += true_boxes_temp.tolist()
+
+        if len(true_labels_list) != 0:
+            for cls, count in Counter(true_labels_list).items():
+                true_labels_dict[cls] += count
+
+        # convert gt boxes from (center, size) to corner format
+        true_boxes = np.array(true_boxes_list)
+        box_centers, box_sizes = true_boxes[:, 0:2], true_boxes[:, 2:4]
+        true_boxes[:, 0:2] = box_centers - box_sizes / 2.
+        true_boxes[:, 2:4] = true_boxes[:, 0:2] + box_sizes
+
+        pred_boxes = y_pred[0][i:i + 1]
+        pred_confs = y_pred[1][i:i + 1]
+        pred_probs = y_pred[2][i:i + 1]
+
+        pred_boxes, pred_confs, pred_labels = cpu_nms(pred_boxes, pred_confs * pred_probs, num_classes, max_boxes=max_boxes, score_thresh=score_thresh, iou_thresh=iou_thresh)
+
+        pred_labels_list = [] if pred_labels is None else pred_labels.tolist()
+        if pred_labels_list == []:
+            continue
+
+        # calc iou
+        iou_matrix = calc_iou(pred_boxes, true_boxes)
+        max_iou_idx = np.argmax(iou_matrix, axis=-1)
+
+        correct_idx = []
+        correct_conf = []
+
+        for k in range(max_iou_idx.shape[0]):
+            pred_labels_dict[pred_labels_list[k]] += 1
+            match_idx = max_iou_idx[k]  # index of the best-matching gt box
+            if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]:
+                if match_idx not in correct_idx:
+                    correct_idx.append(match_idx)
+                    correct_conf.append(pred_confs[k])
+                else:
+                    # keep only the highest-confidence detection per gt box
+                    same_idx = correct_idx.index(match_idx)
+                    if pred_confs[k] > correct_conf[same_idx]:
+                        correct_idx.pop(same_idx)
+                        correct_conf.pop(same_idx)
+                        correct_idx.append(match_idx)
+                        correct_conf.append(pred_confs[k])
+
+        for t in correct_idx:
+            true_positive_dict[true_labels_list[t]] += 1
+
+    if calc_now:
+        # avoid division by zero
+        recall = sum(true_positive_dict.values()) / (sum(true_labels_dict.values()) + 1e-6)
+        precision = sum(true_positive_dict.values()) / (sum(pred_labels_dict.values()) + 1e-6)
+
+        return recall, precision
+    else:
+        return true_positive_dict, true_labels_dict, pred_labels_dict
+
+
+def evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, y_pred, y_true, num_classes, iou_thresh=0.5, calc_now=True):
+    num_images = y_true[0].shape[0]
+    true_labels_dict = {i: 0 for i in range(num_classes)}
+    pred_labels_dict = {i: 0 for i in range(num_classes)}
+    true_positive_dict = {i: 0 for i in range(num_classes)}
+
+    for i in range(num_images):
+        true_labels_list, true_boxes_list = [], []
+        for j in range(3):
+            true_probs_temp = y_true[j][i][..., 5:-1]
+            true_boxes_temp = y_true[j][i][..., 0:4]
+
+            object_mask = true_probs_temp.sum(axis=-1) > 0
+
+            true_probs_temp = true_probs_temp[object_mask]
+            true_boxes_temp = true_boxes_temp[object_mask]
+
+            true_labels_list += np.argmax(true_probs_temp, axis=-1).tolist()
+            true_boxes_list += true_boxes_temp.tolist()
+
+        if len(true_labels_list) != 0:
+            for cls, count in Counter(true_labels_list).items():
+                true_labels_dict[cls] += count
+
+        # convert gt boxes from (center, size) to corner format
+        true_boxes = np.array(true_boxes_list)
+        box_centers, box_sizes = true_boxes[:, 0:2], true_boxes[:, 2:4]
+        true_boxes[:, 0:2] = box_centers - box_sizes / 2.
+        true_boxes[:, 2:4] = true_boxes[:, 0:2] + box_sizes
+
+        pred_boxes = y_pred[0][i:i + 1]
+        pred_confs = y_pred[1][i:i + 1]
+        pred_probs = y_pred[2][i:i + 1]
+
+        # run the pre-built NMS graph on this image's predictions
+        pred_boxes, pred_confs, pred_labels = sess.run(gpu_nms_op, feed_dict={pred_boxes_flag: pred_boxes, pred_scores_flag: pred_confs * pred_probs})
+
+        pred_labels_list = [] if pred_labels is None else pred_labels.tolist()
+        if pred_labels_list == []:
+            continue
+
+        # calc iou
+        iou_matrix = calc_iou(pred_boxes, true_boxes)
+        max_iou_idx = np.argmax(iou_matrix, axis=-1)
+
+        correct_idx = []
+        correct_conf = []
+        for k in range(max_iou_idx.shape[0]):
+            pred_labels_dict[pred_labels_list[k]] += 1
+            match_idx = max_iou_idx[k]  # index of the best-matching gt box
+            if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]:
+                if match_idx not in correct_idx:
+                    correct_idx.append(match_idx)
+                    correct_conf.append(pred_confs[k])
+                else:
+                    # keep only the highest-confidence detection per gt box
+                    same_idx = correct_idx.index(match_idx)
+                    if pred_confs[k] > correct_conf[same_idx]:
+                        correct_idx.pop(same_idx)
+                        correct_conf.pop(same_idx)
+                        correct_idx.append(match_idx)
+                        correct_conf.append(pred_confs[k])
+
+        for t in correct_idx:
+            true_positive_dict[true_labels_list[t]] += 1
+
+    if calc_now:
+        # avoid division by zero
+        recall = sum(true_positive_dict.values()) / (sum(true_labels_dict.values()) + 1e-6)
+        precision = sum(true_positive_dict.values()) / (sum(pred_labels_dict.values()) + 1e-6)
+
+        return recall, precision
+    else:
+        return true_positive_dict, true_labels_dict, pred_labels_dict
+
+
+def get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, image_ids, y_pred):
+    # the dataset is batched with size 1, so each run holds a single image
+    image_id = image_ids[0]
+
+    pred_boxes = y_pred[0][0:1]
+    pred_confs = y_pred[1][0:1]
+    pred_probs = y_pred[2][0:1]
+
+    boxes, scores, labels = sess.run(gpu_nms_op, feed_dict={pred_boxes_flag: pred_boxes, pred_scores_flag: pred_confs * pred_probs})
+
+    pred_content = []
+    for i in range(len(labels)):
+        x_min, y_min, x_max, y_max = boxes[i]
+        score = scores[i]
+        label = labels[i]
+        pred_content.append([image_id, x_min, y_min, x_max, y_max, score, label])
+
+    return pred_content
+
+gt_dict = {}  # key: img_id, value: gt object list
+def parse_gt_rec(gt_filename, compression_type, target_img_size, letterbox_resize=True):
+    global gt_dict
+
+    # parse and cache the ground truth only once
+    if not gt_dict:
+        new_width, new_height = target_img_size
+
+        with TFRecordIterator(gt_filename, compression_type) as reader:
+            for data in reader:
+                img_id, image, boxes, labels, ori_width, ori_height = parse_record(data)
+
+                objects = []
+                for i in range(len(labels)):
+                    x_min, y_min, x_max, y_max = boxes[i]
+                    label = labels[i]
+
+                    if letterbox_resize:
+                        resize_ratio = min(new_width / ori_width, new_height / ori_height)
+
+                        resize_w = int(resize_ratio * ori_width)
+                        resize_h = int(resize_ratio * ori_height)
+
+                        dw = int((new_width - resize_w) / 2)
+                        dh = int((new_height - resize_h) / 2)
+
+                        objects.append([x_min * resize_ratio + dw,
+                                        y_min * resize_ratio + dh,
+                                        x_max * resize_ratio + dw,
+                                        y_max * resize_ratio + dh,
+                                        label])
+                    else:
+                        objects.append([x_min * new_width / ori_width,
+                                        y_min * new_height / ori_height,
+                                        x_max * new_width / ori_width,
+                                        y_max * new_height / ori_height,
+                                        label])
+                gt_dict[img_id] = objects
+    return gt_dict
+
+
+# The following two functions are modified from FAIR's Detectron repo to calculate mAP:
+# https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/voc_eval.py
+def voc_ap(rec, prec, use_07_metric=False):
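+    # VOC AP: either the 11-point interpolated metric (VOC 2007) or the area
+    # under the monotonically decreasing precision envelope (VOC 2010+).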
+    if use_07_metric:
+        ap = 0.
+        for t in np.arange(0., 1.1, 0.1):
+            if np.sum(rec >= t) == 0:
+                p = 0
+            else:
+                p = np.max(prec[rec >= t])
+            ap = ap + p / 11.
+    else:
+        mrec = np.concatenate(([0.], rec, [1.]))
+        mpre = np.concatenate(([0.], prec, [0.]))
+
+        for i in range(mpre.size - 1, 0, -1):
+            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+        i = np.where(mrec[1:] != mrec[:-1])[0]
+
+        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+    return ap
+
+
+def voc_eval(gt_dict, val_preds, classidx, iou_thres=0.5, use_07_metric=False):
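+    # Standard PASCAL VOC matching: sort detections of this class by
+    # confidence, then greedily mark each one as a true positive if it
+    # overlaps an unmatched gt box with IoU > iou_thres, else a false positive.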
+    # 1. obtain gt: extract all gt objects for this class
+    class_recs = {}
+    npos = 0
+    for img_id in gt_dict:
+        R = [obj for obj in gt_dict[img_id] if obj[-1] == classidx]
+        bbox = np.array([x[:4] for x in R])
+        det = [False] * len(R)
+        npos += len(R)
+        class_recs[img_id] = {'bbox': bbox, 'det': det}
+
+    # 2. obtain pred results
+    pred = [x for x in val_preds if x[-1] == classidx]
+    img_ids = [x[0] for x in pred]
+    confidence = np.array([x[-2] for x in pred])
+    BB = np.array([[x[1], x[2], x[3], x[4]] for x in pred])
+
+    # 3. sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    try:
+        BB = BB[sorted_ind, :]
+    except IndexError:
+        # no detection at all for this class
+        print('no box, ignore')
+        return 1e-6, 1e-6, 0, 0, 0
+    img_ids = [img_ids[x] for x in sorted_ind]
+
+    # 4. mark TPs and FPs
+    nd = len(img_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+
+    for d in range(nd):
+        R = class_recs[img_ids[d]]
+        bb = BB[d, :]
+        ovmax = -np.Inf
+        BBGT = R['bbox']
+
+        if BBGT.size > 0:
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1., 0.)
+            ih = np.maximum(iymax - iymin + 1., 0.)
+            inters = iw * ih
+
+            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.)
+                   + (BBGT[:, 2] - BBGT[:, 0] + 1.) * (BBGT[:, 3] - BBGT[:, 1] + 1.)
+                   - inters)
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > iou_thres:
+            # gt not matched yet
+            if not R['det'][jmax]:
+                tp[d] = 1.
+                R['det'][jmax] = 1
+            else:
+                fp[d] = 1.
+        else:
+            fp[d] = 1.
+
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    # returns npos, nd, final recall, final precision, ap
+    return npos, nd, tp[-1] / float(npos), tp[-1] / float(nd), ap
\ No newline at end of file
code/yolov3/misc_utils.py
0 → 100644
+import numpy as np
+import tensorflow as tf
+import random
+
+class AverageMeter(object):
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.average = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.average = self.sum / float(self.count)
+
+
+def parse_anchors(anchor_path):
+    # anchors are stored as comma-separated floats; reshape into [N, 2] (width, height) pairs
+    anchors = np.reshape(np.asarray(open(anchor_path, 'r').read().split(','), np.float32), [-1, 2])
+    return anchors
+
+
+def read_class_names(class_name_path):
+    names = {}
+    with open(class_name_path, 'r') as data:
+        for ID, name in enumerate(data):
+            names[ID] = name.strip('\n')
+    return names
+
+
+def shuffle_and_overwrite(file_name):
+    content = open(file_name, 'r').readlines()
+    random.shuffle(content)
+    with open(file_name, 'w') as f:
+        for line in content:
+            f.write(line)
+
+
+def update_dict(ori_dict, new_dict):
+    if not ori_dict:
+        return new_dict
+    for key in ori_dict:
+        ori_dict[key] += new_dict[key]
+    return ori_dict
+
+
+def list_add(ori_list, new_list):
+    for i in range(len(ori_list)):
+        ori_list[i] += new_list[i]
+    return ori_list
+
+
+def load_weights(var_list, weights_file):
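+    # Darknet .weights layout: a 5-int32 header followed by a flat float32
+    # array; conv kernels are stored as [out, in, h, w] and transposed below
+    # to TensorFlow's [h, w, in, out] order.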
+    with open(weights_file, "rb") as fp:
+        np.fromfile(fp, dtype=np.int32, count=5)  # skip the header
+        weights = np.fromfile(fp, dtype=np.float32)
+
+    ptr = 0
+    i = 0
+    assign_ops = []
+    while i < len(var_list) - 1:
+        var1 = var_list[i]
+        var2 = var_list[i + 1]
+        if 'Conv' in var1.name.split('/')[-2]:
+            if 'BatchNorm' in var2.name.split('/')[-2]:
+                gamma, beta, mean, var = var_list[i + 1:i + 5]
+                batch_norm_vars = [beta, gamma, mean, var]
+                for var in batch_norm_vars:
+                    shape = var.shape.as_list()
+                    num_params = np.prod(shape)
+                    var_weights = weights[ptr:ptr + num_params].reshape(shape)
+                    ptr += num_params
+                    assign_ops.append(tf.assign(var, var_weights, validate_shape=True))
+                i += 4
+            elif 'Conv' in var2.name.split('/')[-2]:
+                # load biases
+                bias = var2
+                bias_shape = bias.shape.as_list()
+                bias_params = np.prod(bias_shape)
+                bias_weights = weights[ptr:ptr + bias_params].reshape(bias_shape)
+                ptr += bias_params
+                assign_ops.append(tf.assign(bias, bias_weights, validate_shape=True))
+                i += 1
+
+            shape = var1.shape.as_list()
+            num_params = np.prod(shape)
+
+            var_weights = weights[ptr:ptr + num_params].reshape(
+                (shape[3], shape[2], shape[0], shape[1]))
+
+            var_weights = np.transpose(var_weights, (2, 3, 1, 0))
+            ptr += num_params
+            assign_ops.append(
+                tf.assign(var1, var_weights, validate_shape=True))
+            i += 1
+
+    return assign_ops
+
+
+def config_learning_rate(args, global_step):
+    if args.lr_type == 'exponential':
+        lr_tmp = tf.train.exponential_decay(args.learning_rate_init, global_step, args.lr_decay_freq,
+                                            args.lr_decay_factor, staircase=True, name='exponential_learning_rate')
+        return tf.maximum(lr_tmp, args.lr_lower_bound)
+    elif args.lr_type == 'cosine_decay':
+        train_steps = (args.total_epoches - float(args.use_warm_up) * args.warm_up_epoch) * args.train_batch_num
+        return args.lr_lower_bound + 0.5 * (args.learning_rate_init - args.lr_lower_bound) * \
+            (1 + tf.cos(global_step / train_steps * np.pi))
+    elif args.lr_type == 'cosine_decay_restart':
+        return tf.train.cosine_decay_restarts(args.learning_rate_init, global_step,
+                                              args.lr_decay_freq, t_mul=2.0, m_mul=1.0,
+                                              name='cosine_decay_learning_rate_restart')
+    elif args.lr_type == 'fixed':
+        return tf.convert_to_tensor(args.learning_rate_init, name='fixed_learning_rate')
+    elif args.lr_type == 'piecewise':
+        return tf.train.piecewise_constant(global_step, boundaries=args.pw_boundaries, values=args.pw_values,
+                                           name='piecewise_learning_rate')
+    else:
+        raise ValueError('Unsupported learning rate type!')
+
+
+def config_optimizer(optimizer_name, learning_rate, decay=0.9, momentum=0.9):
+    if optimizer_name == 'momentum':
+        return tf.train.MomentumOptimizer(learning_rate, momentum=momentum)
+    elif optimizer_name == 'rmsprop':
+        return tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=momentum)
+    elif optimizer_name == 'adam':
+        return tf.train.AdamOptimizer(learning_rate)
+    elif optimizer_name == 'sgd':
+        return tf.train.GradientDescentOptimizer(learning_rate)
+    else:
+        raise ValueError('Unsupported optimizer type!')
\ No newline at end of file
code/yolov3/model.py
0 → 100644
+##### layer utils
+from __future__ import division, print_function
+
+import numpy as np
+import tensorflow as tf
+slim = tf.contrib.slim
+
+def conv2d(inputs, filters, kernel_size, strides=1):
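+    # Darknet-style fixed padding: for stride > 1, pad explicitly and use
+    # 'VALID' so the amount of padding does not depend on the input size.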
+    def _fixed_padding(inputs, kernel_size):
+        pad_total = kernel_size - 1
+        pad_beg = pad_total // 2
+        pad_end = pad_total - pad_beg
+
+        padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
+                                        [pad_beg, pad_end], [0, 0]], mode='CONSTANT')
+        return padded_inputs
+    if strides > 1:
+        inputs = _fixed_padding(inputs, kernel_size)
+    inputs = slim.conv2d(inputs, filters, kernel_size, stride=strides,
+                         padding=('SAME' if strides == 1 else 'VALID'))
+    return inputs
+
+def darknet53_body(inputs):
+    def res_block(inputs, filters):
+        shortcut = inputs
+        net = conv2d(inputs, filters * 1, 1)
+        net = conv2d(net, filters * 2, 3)
+
+        net = net + shortcut
+
+        return net
+
+    # first two conv2d layers
+    net = conv2d(inputs, 32, 3, strides=1)
+    net = conv2d(net, 64, 3, strides=2)
+
+    # res_block * 1
+    net = res_block(net, 32)
+
+    net = conv2d(net, 128, 3, strides=2)
+
+    # res_block * 2
+    for i in range(2):
+        net = res_block(net, 64)
+
+    net = conv2d(net, 256, 3, strides=2)
+
+    # res_block * 8
+    for i in range(8):
+        net = res_block(net, 128)
+
+    route_1 = net
+    net = conv2d(net, 512, 3, strides=2)
+
+    # res_block * 8
+    for i in range(8):
+        net = res_block(net, 256)
+
+    route_2 = net
+    net = conv2d(net, 1024, 3, strides=2)
+
+    # res_block * 4
+    for i in range(4):
+        net = res_block(net, 512)
+    route_3 = net
+
+    return route_1, route_2, route_3
+
+
+def yolo_block(inputs, filters):
+    net = conv2d(inputs, filters * 1, 1)
+    net = conv2d(net, filters * 2, 3)
+    net = conv2d(net, filters * 1, 1)
+    net = conv2d(net, filters * 2, 3)
+    net = conv2d(net, filters * 1, 1)
+    route = net
+    net = conv2d(net, filters * 2, 3)
+    return route, net
+
+
+def upsample_layer(inputs, out_shape):
+    new_height, new_width = out_shape[1], out_shape[2]
+    # NOTE: here height comes first
+    inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width), name='upsampled')
+    return inputs
+
+class yolov3(object):
+
+    def __init__(self, class_num, anchors, use_label_smooth=False, use_focal_loss=False, batch_norm_decay=0.999, weight_decay=5e-4, use_static_shape=True):
+        self.class_num = class_num
+        self.anchors = anchors
+        self.batch_norm_decay = batch_norm_decay
+        self.use_label_smooth = use_label_smooth
+        self.use_focal_loss = use_focal_loss
+        self.weight_decay = weight_decay
+        self.use_static_shape = use_static_shape
+
+    def forward(self, inputs, is_training=False, reuse=False):
+        # the input size: [height, width] format
+        self.img_size = tf.shape(inputs)[1:3]
+        print("Img size:", self.img_size)
+
+        batch_norm_params = {
+            'decay': self.batch_norm_decay,
+            'epsilon': 1e-05,
+            'scale': True,
+            'is_training': is_training,
+            'fused': None,
+        }
+
+        with slim.arg_scope([slim.conv2d, slim.batch_norm], reuse=reuse):
+            with slim.arg_scope([slim.conv2d],
+                                normalizer_fn=slim.batch_norm,
+                                normalizer_params=batch_norm_params,
+                                biases_initializer=None,
+                                activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=0.1),
+                                weights_regularizer=slim.l2_regularizer(self.weight_decay)):
+
+                with tf.variable_scope('darknet53_body'):
+                    route_1, route_2, route_3 = darknet53_body(inputs)
+
+                with tf.variable_scope('yolov3_head'):
+                    inter1, net = yolo_block(route_3, 512)
+                    feature_map_1 = slim.conv2d(net, 3 * (5 + self.class_num), 1,
+                                                stride=1, normalizer_fn=None,
+                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
+                    feature_map_1 = tf.identity(feature_map_1, name='feature_map_1')
+
+                    inter1 = conv2d(inter1, 256, 1)
+                    inter1 = upsample_layer(inter1, route_2.get_shape().as_list() if self.use_static_shape else tf.shape(route_2))
+                    concat1 = tf.concat([inter1, route_2], axis=3)
+
+                    inter2, net = yolo_block(concat1, 256)
+                    feature_map_2 = slim.conv2d(net, 3 * (5 + self.class_num), 1,
+                                                stride=1, normalizer_fn=None,
+                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
+                    feature_map_2 = tf.identity(feature_map_2, name='feature_map_2')
+
+                    inter2 = conv2d(inter2, 128, 1)
+                    inter2 = upsample_layer(inter2, route_1.get_shape().as_list() if self.use_static_shape else tf.shape(route_1))
+                    concat2 = tf.concat([inter2, route_1], axis=3)
+
+                    _, feature_map_3 = yolo_block(concat2, 128)
+                    feature_map_3 = slim.conv2d(feature_map_3, 3 * (5 + self.class_num), 1,
+                                                stride=1, normalizer_fn=None,
+                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
+                    feature_map_3 = tf.identity(feature_map_3, name='feature_map_3')
+
+        return feature_map_1, feature_map_2, feature_map_3
+
+    def reorganize_layer(self, feature_map, anchors):
152 | + # size : [h, w] format | ||
153 | + grid_size = feature_map.get_shape().as_list()[1:3] if self.use_static_shape else tf.shape(feature_map)[1:3] # [13, 13] | ||
154 | + ratio = tf.cast(self.img_size / grid_size, tf.float32) | ||
155 | + | ||
156 | + # anchor : [w, h] format | ||
157 | + rescaled_anchors = [(anchor[0] / ratio[1], anchor[1] / ratio[0]) for anchor in anchors] | ||
158 | + | ||
159 | + feature_map = tf.reshape(feature_map, [-1, grid_size[0], grid_size[1], 3, 5 + self.class_num]) | ||
160 | + | ||
161 | + box_centers, box_sizes, conf_logits, prob_logits = tf.split(feature_map, [2, 2, 1, self.class_num], axis=-1) | ||
162 | + box_centers = tf.nn.sigmoid(box_centers) | ||
163 | + | ||
164 | + grid_x = tf.range(grid_size[1], dtype=tf.int32) | ||
165 | + grid_y = tf.range(grid_size[0], dtype=tf.int32) | ||
166 | + grid_x, grid_y = tf.meshgrid(grid_x, grid_y) | ||
167 | + x_offset = tf.reshape(grid_x, (-1, 1)) | ||
168 | + y_offset = tf.reshape(grid_y, (-1, 1)) | ||
169 | + x_y_offset = tf.concat([x_offset, y_offset], axis=-1) | ||
170 | + | ||
171 | + x_y_offset = tf.cast(tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2]), tf.float32) | ||
172 | + | ||
173 | + box_centers = box_centers + x_y_offset | ||
174 | + box_centers = box_centers * ratio[::-1] | ||
175 | + | ||
176 | + box_sizes = tf.exp(box_sizes) * rescaled_anchors | ||
177 | + box_sizes = box_sizes * ratio[::-1] | ||
178 | + | ||
179 | + boxes = tf.concat([box_centers, box_sizes], axis=-1) | ||
180 | + | ||
181 | + return x_y_offset, boxes, conf_logits, prob_logits | ||
182 | + | ||
183 | + | ||
184 | + def _reshape_logit(self, result): | ||
185 | + x_y_offset, boxes, conf_logits, prob_logits = result | ||
186 | + grid_size = x_y_offset.get_shape().as_list()[:2] if self.use_static_shape else tf.shape(x_y_offset)[:2] | ||
187 | + boxes = tf.reshape(boxes, [-1, grid_size[0] * grid_size[1] * 3, 4]) | ||
188 | + conf_logits = tf.reshape(conf_logits, [-1, grid_size[0] * grid_size[1] * 3, 1]) | ||
189 | + prob_logits = tf.reshape(prob_logits, [-1, grid_size[0] * grid_size[1] * 3, self.class_num]) | ||
190 | + return boxes, conf_logits, prob_logits | ||
191 | + | ||
192 | + def predict(self, feature_maps): | ||
193 | + feature_map_1, feature_map_2, feature_map_3 = feature_maps | ||
194 | + | ||
195 | + feature_map_anchors = [(feature_map_1, self.anchors[6:9]), | ||
196 | + (feature_map_2, self.anchors[3:6]), | ||
197 | + (feature_map_3, self.anchors[0:3])] | ||
198 | + reorg_results = [self.reorganize_layer(feature_map, anchors) for (feature_map, anchors) in feature_map_anchors] | ||
199 | + | ||
200 | + boxes_list, confs_list, probs_list = [], [], [] | ||
201 | + | ||
202 | + for result in reorg_results: | ||
203 | + boxes, conf_logits, prob_logits = self._reshape_logit(result) | ||
204 | + confs = tf.sigmoid(conf_logits) | ||
205 | + probs = tf.sigmoid(prob_logits) | ||
206 | + boxes_list.append(boxes) | ||
207 | + confs_list.append(confs) | ||
208 | + probs_list.append(probs) | ||
209 | + | ||
210 | + boxes = tf.concat(boxes_list, axis=1) | ||
211 | + confs = tf.concat(confs_list, axis=1) | ||
212 | + probs = tf.concat(probs_list, axis=1) | ||
213 | + | ||
214 | + center_x, center_y, width, height = tf.split(boxes, [1, 1, 1, 1], axis=-1) | ||
215 | + x_min = center_x - width / 2 | ||
216 | + y_min = center_y - height / 2 | ||
217 | + x_max = center_x + width / 2 | ||
218 | + y_max = center_y + height / 2 | ||
219 | + | ||
220 | + boxes = tf.concat([x_min, y_min, x_max, y_max], axis=-1) | ||
221 | + | ||
222 | + return boxes, confs, probs | ||
223 | + | ||
224 | + def loss_layer(self, feature_map_i, y_true, anchors): | ||
225 | + grid_size = tf.shape(feature_map_i)[1:3] | ||
226 | + ratio = tf.cast(self.img_size / grid_size, tf.float32) | ||
227 | + # N: batch_size | ||
228 | + N = tf.cast(tf.shape(feature_map_i)[0], tf.float32) | ||
229 | + | ||
230 | + x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorganize_layer(feature_map_i, anchors) | ||
231 | + | ||
232 | + ### mask | ||
233 | + object_mask = y_true[..., 4:5] | ||
234 | + ignore_mask = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | ||
235 | + | ||
236 | + def loop_cond(idx, ignore_mask): | ||
237 | + return tf.less(idx, tf.cast(N, tf.int32)) | ||
238 | + | ||
239 | + def loop_body(idx, ignore_mask): | ||
240 | + valid_true_boxes = tf.boolean_mask(y_true[idx, ..., 0:4], tf.cast(object_mask[idx, ..., 0], 'bool')) | ||
241 | + | ||
242 | + iou = self.box_iou(pred_boxes[idx], valid_true_boxes) | ||
243 | + best_iou = tf.reduce_max(iou, axis=-1) | ||
244 | + | ||
245 | + ignore_mask_tmp = tf.cast(best_iou < 0.5, tf.float32) | ||
246 | + | ||
247 | + ignore_mask = ignore_mask.write(idx, ignore_mask_tmp) | ||
248 | + return idx + 1, ignore_mask | ||
249 | + | ||
250 | + _, ignore_mask = tf.while_loop(cond=loop_cond, body=loop_body, loop_vars=[0, ignore_mask]) | ||
251 | + ignore_mask = ignore_mask.stack() | ||
252 | + ignore_mask = tf.expand_dims(ignore_mask, -1) | ||
253 | + | ||
254 | + pred_box_xy = pred_boxes[..., 0:2] | ||
255 | + pred_box_wh = pred_boxes[..., 2:4] | ||
256 | + | ||
257 | + true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset | ||
258 | + pred_xy = pred_box_xy / ratio[::-1] - x_y_offset | ||
259 | + | ||
260 | + true_tw_th = y_true[..., 2:4] / anchors | ||
261 | + pred_tw_th = pred_box_wh / anchors | ||
262 | + | ||
263 | + true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0), | ||
264 | + x=tf.ones_like(true_tw_th), y=true_tw_th) | ||
265 | + pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0), | ||
266 | + x=tf.ones_like(pred_tw_th), y=pred_tw_th) | ||
267 | + true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9)) | ||
268 | + pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9)) | ||
269 | + | ||
270 | + box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * (y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32)) | ||
271 | + | ||
272 | + ### loss | ||
273 | + | ||
274 | + mix_w = y_true[..., -1:] | ||
275 | + | ||
276 | + xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N | ||
277 | + wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N | ||
278 | + | ||
279 | + conf_pos_mask = object_mask | ||
280 | + conf_neg_mask = (1 - object_mask) * ignore_mask | ||
281 | + conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, logits=pred_conf_logits) | ||
282 | + conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, logits=pred_conf_logits) | ||
283 | + | ||
284 | + conf_loss = conf_loss_pos + conf_loss_neg | ||
285 | + | ||
286 | + if self.use_focal_loss: | ||
287 | + alpha = 1.0 | ||
288 | + gamma = 2.0 | ||
289 | + focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma) | ||
290 | + conf_loss *= focal_mask | ||
291 | + conf_loss = tf.reduce_sum(conf_loss * mix_w) / N | ||
292 | + | ||
293 | + if self.use_label_smooth: | ||
294 | + delta = 0.01 | ||
295 | + label_target = (1 - delta) * y_true[..., 5:-1] + delta * 1. / self.class_num | ||
296 | + else: | ||
297 | + label_target = y_true[..., 5:-1] | ||
298 | + | ||
299 | + class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target, logits=pred_prob_logits) * mix_w | ||
300 | + class_loss = tf.reduce_sum(class_loss) / N | ||
301 | + | ||
302 | + return xy_loss, wh_loss, conf_loss, class_loss | ||
303 | + | ||
304 | + | ||
305 | + def box_iou(self, pred_boxes, valid_true_boxes): | ||
306 | + pred_box_xy = pred_boxes[..., 0:2] | ||
307 | + pred_box_wh = pred_boxes[..., 2:4] | ||
308 | + | ||
309 | + pred_box_xy = tf.expand_dims(pred_box_xy, -2) | ||
310 | + pred_box_wh = tf.expand_dims(pred_box_wh, -2) | ||
311 | + | ||
312 | + true_box_xy = valid_true_boxes[:, 0:2] | ||
313 | + true_box_wh = valid_true_boxes[:, 2:4] | ||
314 | + | ||
315 | + intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2., | ||
316 | + true_box_xy - true_box_wh / 2.) | ||
317 | + intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2., | ||
318 | + true_box_xy + true_box_wh / 2.) | ||
319 | + intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.) | ||
320 | + | ||
321 | + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] | ||
322 | + pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1] | ||
323 | + true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1] | ||
324 | + true_box_area = tf.expand_dims(true_box_area, axis=0) | ||
325 | + | ||
326 | + iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10) | ||
327 | + | ||
328 | + return iou | ||
329 | + | ||
330 | + | ||
331 | + def compute_loss(self, y_pred, y_true): | ||
332 | + loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0. | ||
333 | + anchor_group = [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]] | ||
334 | + | ||
335 | + for i in range(len(y_pred)): | ||
336 | + result = self.loss_layer(y_pred[i], y_true[i], anchor_group[i]) | ||
337 | + loss_xy += result[0] | ||
338 | + loss_wh += result[1] | ||
339 | + loss_conf += result[2] | ||
340 | + loss_class += result[3] | ||
341 | + total_loss = loss_xy + loss_wh + loss_conf + loss_class | ||
342 | + return [total_loss, loss_xy, loss_wh, loss_conf, loss_class] | ||
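Note on `reorganize_layer`: the decoding above is sigmoid-plus-grid-offset arithmetic followed by scaling back to pixel units. A minimal NumPy sketch of the same math, with made-up grid size and anchor values (illustration only, not part of this diff):

    import numpy as np

    grid_h, grid_w, stride = 13, 13, 32                    # 416 / 13 = 32
    anchors = np.array([[116, 90], [156, 198], [373, 326]], np.float32)
    raw = np.random.randn(grid_h, grid_w, 3, 4).astype(np.float32)  # tx, ty, tw, th

    grid_x, grid_y = np.meshgrid(np.arange(grid_w), np.arange(grid_h))
    xy_offset = np.stack([grid_x, grid_y], axis=-1)[:, :, None, :]  # [h, w, 1, 2]

    centers = (1. / (1. + np.exp(-raw[..., 0:2])) + xy_offset) * stride  # sigmoid + cell offset, in pixels
    sizes = np.exp(raw[..., 2:4]) * anchors                              # anchors already in pixels
    print(centers.shape, sizes.shape)                                    # (13, 13, 3, 2) (13, 13, 3, 2)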
code/yolov3/nms_utils.py
0 → 100644
1 | +from __future__ import division, print_function | ||
2 | + | ||
3 | +import numpy as np | ||
4 | +import tensorflow as tf | ||
5 | + | ||
6 | +def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5): | ||
7 | + boxes_list, label_list, score_list = [], [], [] | ||
8 | + max_boxes = tf.constant(max_boxes, dtype='int32') | ||
9 | + | ||
10 | + boxes = tf.reshape(boxes, [-1, 4]) # '-1' means we don't know the exact number of boxes | ||
11 | + score = tf.reshape(scores, [-1, num_classes]) | ||
12 | + | ||
13 | + # Step 1: Create a filtering mask based on "box_class_scores" by using "threshold". | ||
14 | + mask = tf.greater_equal(score, tf.constant(score_thresh)) | ||
15 | + # Step 2: Do non_max_suppression for each class | ||
16 | + for i in range(num_classes): | ||
17 | + # Step 3: Apply the mask to scores, boxes and pick them out | ||
18 | + filter_boxes = tf.boolean_mask(boxes, mask[:,i]) | ||
19 | + filter_score = tf.boolean_mask(score[:,i], mask[:,i]) | ||
20 | + nms_indices = tf.image.non_max_suppression(boxes=filter_boxes, | ||
21 | + scores=filter_score, | ||
22 | + max_output_size=max_boxes, | ||
23 | + iou_threshold=nms_thresh, name='nms_indices') | ||
24 | + label_list.append(tf.ones_like(tf.gather(filter_score, nms_indices), 'int32')*i) | ||
25 | + boxes_list.append(tf.gather(filter_boxes, nms_indices)) | ||
26 | + score_list.append(tf.gather(filter_score, nms_indices)) | ||
27 | + | ||
28 | + boxes = tf.concat(boxes_list, axis=0) | ||
29 | + score = tf.concat(score_list, axis=0) | ||
30 | + label = tf.concat(label_list, axis=0) | ||
31 | + | ||
32 | + return boxes, score, label | ||
33 | + | ||
34 | + | ||
35 | +def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5): | ||
36 | + assert boxes.shape[1] == 4 and len(scores.shape) == 1 | ||
37 | + | ||
38 | + x1 = boxes[:, 0] | ||
39 | + y1 = boxes[:, 1] | ||
40 | + x2 = boxes[:, 2] | ||
41 | + y2 = boxes[:, 3] | ||
42 | + | ||
43 | + areas = (x2 - x1) * (y2 - y1) | ||
44 | + order = scores.argsort()[::-1] | ||
45 | + | ||
46 | + keep = [] | ||
47 | + while order.size > 0: | ||
48 | + i = order[0] | ||
49 | + keep.append(i) | ||
50 | + xx1 = np.maximum(x1[i], x1[order[1:]]) | ||
51 | + yy1 = np.maximum(y1[i], y1[order[1:]]) | ||
52 | + xx2 = np.minimum(x2[i], x2[order[1:]]) | ||
53 | + yy2 = np.minimum(y2[i], y2[order[1:]]) | ||
54 | + | ||
55 | + w = np.maximum(0.0, xx2 - xx1)  # keep the same coordinate convention as `areas` above | ||
56 | + h = np.maximum(0.0, yy2 - yy1) | ||
57 | + inter = w * h | ||
58 | + ovr = inter / (areas[i] + areas[order[1:]] - inter) | ||
59 | + | ||
60 | + inds = np.where(ovr <= iou_thresh)[0] | ||
61 | + order = order[inds + 1] | ||
62 | + | ||
63 | + return keep[:max_boxes] | ||
64 | + | ||
65 | + | ||
66 | +def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5): | ||
67 | + boxes = boxes.reshape(-1, 4) | ||
68 | + scores = scores.reshape(-1, num_classes) | ||
69 | + picked_boxes, picked_score, picked_label = [], [], [] | ||
70 | + | ||
71 | + for i in range(num_classes): | ||
72 | + indices = np.where(scores[:,i] >= score_thresh) | ||
73 | + filter_boxes = boxes[indices] | ||
74 | + filter_scores = scores[:,i][indices] | ||
75 | + if len(filter_boxes) == 0: | ||
76 | + continue | ||
77 | + | ||
78 | + indices = py_nms(filter_boxes, filter_scores, | ||
79 | + max_boxes=max_boxes, iou_thresh=iou_thresh) | ||
80 | + picked_boxes.append(filter_boxes[indices]) | ||
81 | + picked_score.append(filter_scores[indices]) | ||
82 | + picked_label.append(np.ones(len(indices), dtype='int32')*i) | ||
83 | + if len(picked_boxes) == 0: | ||
84 | + return None, None, None | ||
85 | + | ||
86 | + boxes = np.concatenate(picked_boxes, axis=0) | ||
87 | + score = np.concatenate(picked_score, axis=0) | ||
88 | + label = np.concatenate(picked_label, axis=0) | ||
89 | + | ||
90 | + return boxes, score, label | ||
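A quick smoke test for `cpu_nms` with fabricated boxes and scores (three classes; all numbers invented, and `nms_utils.py` assumed importable from the working directory):

    import numpy as np
    from nms_utils import cpu_nms

    boxes = np.array([[10, 10, 100, 100],     # overlaps heavily with the next box
                      [12, 12, 98, 102],
                      [200, 200, 300, 320]], np.float32)
    scores = np.array([[0.9, 0.1, 0.0],
                       [0.8, 0.2, 0.0],
                       [0.1, 0.7, 0.1]], np.float32)

    picked_boxes, picked_scores, picked_labels = cpu_nms(boxes, scores, num_classes=3)
    print(picked_labels)   # expect [0 1]: the second box is suppressed for class 0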
code/yolov3/plot_utils.py
0 → 100644
1 | +from __future__ import division, print_function | ||
2 | + | ||
3 | +import cv2 | ||
4 | +import random | ||
5 | + | ||
6 | + | ||
7 | +def get_color_table(class_num, seed=2): | ||
8 | + random.seed(seed) | ||
9 | + color_table = {} | ||
10 | + for i in range(class_num): | ||
11 | + color_table[i] = [random.randint(0, 255) for _ in range(3)] | ||
12 | + return color_table | ||
13 | + | ||
14 | + | ||
15 | +def plot_one_box(img, coord, label=None, color=None, line_thickness=None): | ||
16 | + tl = line_thickness or max(1, int(round(0.002 * max(img.shape[0:2]))))  # line thickness, at least 1 px for small images | ||
17 | + color = color or [random.randint(0, 255) for _ in range(3)] | ||
18 | + c1, c2 = (int(coord[0]), int(coord[1])), (int(coord[2]), int(coord[3])) | ||
19 | + cv2.rectangle(img, c1, c2, color, thickness=tl) | ||
20 | + if label: | ||
21 | + tf = max(tl - 1, 1) # font thickness | ||
22 | + t_size = cv2.getTextSize(label, 0, fontScale=float(tl) / 3, thickness=tf)[0] | ||
23 | + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 | ||
24 | + cv2.rectangle(img, c1, c2, color, -1) # filled | ||
25 | + cv2.putText(img, label, (c1[0], c1[1] - 2), 0, float(tl) / 3, [0, 0, 0], thickness=tf, lineType=cv2.LINE_AA) | ||
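Minimal usage sketch for `plot_one_box` on a blank canvas (assumes `plot_utils.py` is on the path; the label text is made up):

    import numpy as np
    from plot_utils import get_color_table, plot_one_box

    canvas = np.zeros((416, 416, 3), np.uint8)
    colors = get_color_table(2)
    plot_one_box(canvas, [50, 60, 200, 220], label='person, 0.97', color=colors[0])
    # cv2.imwrite('demo.jpg', canvas)  # uncomment (and import cv2) to inspect the result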
code/yolov3/test_single_image.py
0 → 100644
1 | +from __future__ import division, print_function | ||
2 | + | ||
3 | +import tensorflow as tf | ||
4 | +import numpy as np | ||
5 | +import argparse | ||
6 | +import cv2 | ||
7 | + | ||
8 | +from misc_utils import parse_anchors, read_class_names | ||
9 | +from nms_utils import gpu_nms | ||
10 | +from plot_utils import get_color_table, plot_one_box | ||
11 | +from data_utils import letterbox_resize | ||
12 | + | ||
13 | +from model import yolov3 | ||
14 | + | ||
15 | +parser = argparse.ArgumentParser(description="YOLO-V3 single image test procedure.") | ||
16 | +parser.add_argument("input_image", type=str, | ||
17 | + help="The path of the input image.") | ||
18 | +parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt", | ||
19 | + help="The path of the anchor txt file.") | ||
20 | +parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416], | ||
21 | + help="Resize the input image with `new_size`, size format: [width, height]") | ||
22 | +parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=True, | ||
23 | + help="Whether to use the letterbox resize.") | ||
24 | +parser.add_argument("--class_name_path", type=str, default="./data/coco.names", | ||
25 | + help="The path of the class names.") | ||
26 | +parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt", | ||
27 | + help="The path of the weights to restore.") | ||
28 | +args = parser.parse_args() | ||
29 | + | ||
30 | +args.anchors = parse_anchors(args.anchor_path) | ||
31 | +args.classes = read_class_names(args.class_name_path) | ||
32 | +args.num_class = len(args.classes) | ||
33 | + | ||
34 | +color_table = get_color_table(args.num_class) | ||
35 | + | ||
36 | +img_ori = cv2.imread(args.input_image) | ||
37 | +if args.letterbox_resize: | ||
38 | + img, resize_ratio, dw, dh = letterbox_resize(img_ori, args.new_size[0], args.new_size[1]) | ||
39 | +else: | ||
40 | + height_ori, width_ori = img_ori.shape[:2] | ||
41 | + img = cv2.resize(img_ori, tuple(args.new_size)) | ||
42 | +img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | ||
43 | +img = np.asarray(img, np.float32) | ||
44 | +img = img[np.newaxis, :] / 255. | ||
45 | + | ||
46 | +with tf.Session() as sess: | ||
47 | + input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data') | ||
48 | + yolo_model = yolov3(args.num_class, args.anchors) | ||
49 | + with tf.variable_scope('yolov3'): | ||
50 | + pred_feature_maps = yolo_model.forward(input_data, False) | ||
51 | + pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps) | ||
52 | + | ||
53 | + pred_scores = pred_confs * pred_probs | ||
54 | + | ||
55 | + boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=200, score_thresh=0.3, nms_thresh=0.45) | ||
56 | + | ||
57 | + saver = tf.train.Saver() | ||
58 | + saver.restore(sess, args.restore_path) | ||
59 | + | ||
60 | + boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img}) | ||
61 | + | ||
62 | + if args.letterbox_resize: | ||
63 | + boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio | ||
64 | + boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio | ||
65 | + else: | ||
66 | + boxes_[:, [0, 2]] *= (width_ori/float(args.new_size[0])) | ||
67 | + boxes_[:, [1, 3]] *= (height_ori/float(args.new_size[1])) | ||
68 | + | ||
69 | + print("box coords:") | ||
70 | + print(boxes_) | ||
71 | + print('*' * 30) | ||
72 | + print("scores:") | ||
73 | + print(scores_) | ||
74 | + print('*' * 30) | ||
75 | + print("labels:") | ||
76 | + print(labels_) | ||
77 | + | ||
78 | + for i in range(len(boxes_)): | ||
79 | + x0, y0, x1, y1 = boxes_[i] | ||
80 | + plot_one_box(img_ori, [x0, y0, x1, y1], label=args.classes[labels_[i]] + ', {:.2f}%'.format(scores_[i] * 100), color=color_table[labels_[i]]) | ||
81 | + cv2.imshow('Detection result', img_ori) | ||
82 | + cv2.imwrite('detection_result.jpg', img_ori) | ||
83 | + cv2.waitKey(0) | ||
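`letterbox_resize` is imported from `data_utils`, which is not in this diff. A hypothetical sketch of the contract the script relies on — the returned `resize_ratio`, `dw`, `dh` are exactly what the `(coord - dw) / resize_ratio` inverse mapping above undoes:

    import cv2
    import numpy as np

    def letterbox_resize_sketch(img, new_width, new_height):
        ori_h, ori_w = img.shape[:2]
        resize_ratio = min(new_width / float(ori_w), new_height / float(ori_h))
        resize_w, resize_h = int(ori_w * resize_ratio), int(ori_h * resize_ratio)
        resized = cv2.resize(img, (resize_w, resize_h))
        canvas = np.full((new_height, new_width, 3), 128, np.uint8)   # gray padding
        dw, dh = (new_width - resize_w) // 2, (new_height - resize_h) // 2
        canvas[dh: dh + resize_h, dw: dw + resize_w, :] = resized
        return canvas, resize_ratio, dw, dh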
code/yolov3/tfrecord_utils.py
0 → 100644
1 | +import tensorflow as tf | ||
2 | +from itertools import tee | ||
3 | + | ||
4 | +class TFRecordIterator: | ||
5 | + def __init__(self, path, compression=None): | ||
6 | + self._core = tf.python_io.tf_record_iterator(path, tf.python_io.TFRecordOptions(compression)) | ||
7 | + self._iterator = iter(self._core) | ||
8 | + self._iterator, self._iterator_temp = tee(self._iterator) | ||
9 | + self._total_cnt = sum(1 for _ in self._iterator_temp) | ||
10 | + | ||
11 | + def _read_value(self, feature): | ||
12 | + if len(feature.int64_list.value) > 0: | ||
13 | + return feature.int64_list.value | ||
14 | + | ||
15 | + if len(feature.bytes_list.value) > 0: | ||
16 | + return feature.bytes_list.value | ||
17 | + | ||
18 | + if len(feature.float_list.value) > 0: | ||
19 | + return feature.float_list.value | ||
20 | + | ||
21 | + return None | ||
22 | + | ||
23 | + def _read_features(self, features): | ||
24 | + d = dict() | ||
25 | + for data in features: | ||
26 | + d[data] = self._read_value(features[data]) | ||
27 | + return d | ||
28 | + | ||
29 | + def __enter__(self): | ||
30 | + return self | ||
31 | + | ||
32 | + def __exit__(self, exception_type, exception_value, traceback): | ||
33 | + pass | ||
34 | + | ||
35 | + def __iter__(self): | ||
36 | + return self | ||
37 | + | ||
38 | + def __next__(self): | ||
39 | + record = next(self._iterator) | ||
40 | + example = tf.train.Example() | ||
41 | + example.ParseFromString(record) | ||
42 | + return self._read_features(example.features.feature) | ||
43 | + | ||
44 | + def count(self): | ||
45 | + return self._total_cnt | ||
46 | + | ||
47 | + | ||
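Illustrative use of `TFRecordIterator` (the path and GZIP compression are assumptions matching how `train.py` reads the records):

    from tfrecord_utils import TFRecordIterator

    with TFRecordIterator('../../data/val.tfrecord', compression='GZIP') as it:
        print('records:', it.count())
        for features in it:                 # each item is a dict of feature name -> values
            print(sorted(features.keys()))
            break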
code/yolov3/train.py
0 → 100644
1 | +from __future__ import division, print_function | ||
2 | + | ||
3 | +import tensorflow as tf | ||
4 | +import numpy as np | ||
5 | +import os | ||
6 | +from tqdm import trange | ||
7 | + | ||
8 | +import args | ||
9 | + | ||
10 | +from misc_utils import shuffle_and_overwrite, config_learning_rate, config_optimizer, AverageMeter | ||
11 | +from data_utils import get_batch_data | ||
12 | +from eval_utils import evaluate_on_cpu, evaluate_on_gpu, get_preds_gpu, voc_eval, parse_gt_rec | ||
13 | +from nms_utils import gpu_nms | ||
14 | + | ||
15 | +from model import yolov3 | ||
16 | + | ||
17 | +train_dataset = tf.data.TFRecordDataset(filenames=args.train_file, compression_type='GZIP') | ||
18 | +train_dataset = train_dataset.shuffle(args.train_img_cnt) | ||
19 | +train_dataset = train_dataset.batch(args.batch_size) | ||
20 | +train_dataset = train_dataset.map( | ||
21 | + lambda x: tf.py_func(get_batch_data, | ||
22 | + inp=[x, args.class_num, args.img_size, args.anchors, True, args.multi_scale_train, args.use_mix_up, args.letterbox_resize], | ||
23 | + Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), | ||
24 | + num_parallel_calls=args.num_threads | ||
25 | +) | ||
26 | +train_dataset = train_dataset.prefetch(args.prefetech_buffer) | ||
27 | + | ||
28 | +val_dataset = tf.data.TFRecordDataset(filenames=args.val_file, compression_type='GZIP') | ||
29 | +val_dataset = val_dataset.batch(1) | ||
30 | +val_dataset = val_dataset.map( | ||
31 | + lambda x: tf.py_func(get_batch_data, | ||
32 | + inp=[x, args.class_num, args.img_size, args.anchors, False, False, False, args.letterbox_resize], | ||
33 | + Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), | ||
34 | + num_parallel_calls=args.num_threads | ||
35 | +) | ||
36 | +val_dataset = val_dataset.prefetch(args.prefetech_buffer) | ||
37 | + | ||
38 | +iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) | ||
39 | +train_init_op = iterator.make_initializer(train_dataset) | ||
40 | +val_init_op = iterator.make_initializer(val_dataset) | ||
41 | + | ||
42 | +image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next() | ||
43 | +y_true = [y_true_13, y_true_26, y_true_52] | ||
44 | + | ||
45 | +image_ids.set_shape([None]) | ||
46 | +image.set_shape([None, None, None, 3]) | ||
47 | +for y in y_true: | ||
48 | + y.set_shape([None, None, None, None, None]) | ||
49 | + | ||
50 | +is_training = tf.placeholder(tf.bool, name='phase_train')  # fed via feed_dict in the loops below | ||
51 | +### Model definition | ||
52 | +yolo_model = yolov3(args.class_num, args.anchors, args.use_label_smooth, args.use_focal_loss, args.batch_norm_decay, args.weight_decay, use_static_shape=False) | ||
53 | + | ||
54 | +with tf.variable_scope('yolov3'): | ||
55 | + pred_feature_maps = yolo_model.forward(image, is_training=is_training) | ||
56 | + | ||
57 | +loss = yolo_model.compute_loss(pred_feature_maps, y_true) | ||
58 | +y_pred = yolo_model.predict(pred_feature_maps) | ||
59 | + | ||
60 | +l2_loss = tf.losses.get_regularization_loss() | ||
61 | + | ||
62 | +saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to_restore(include=args.restore_include, exclude=args.restore_exclude)) | ||
63 | +update_vars = tf.contrib.framework.get_variables_to_restore(include=args.update_part) | ||
64 | + | ||
65 | + | ||
66 | +global_step = tf.Variable(float(args.global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) | ||
67 | +if args.use_warm_up: | ||
68 | +    learning_rate = tf.cond(tf.less(global_step, args.train_batch_num * args.warm_up_epoch), | ||
69 | +                            lambda: args.learning_rate_init * global_step / (args.train_batch_num * args.warm_up_epoch), | ||
70 | +                            lambda: config_learning_rate(global_step - args.train_batch_num * args.warm_up_epoch)) | ||
71 | +else: | ||
72 | + learning_rate = config_learning_rate(global_step) | ||
73 | + | ||
74 | +optimizer = config_optimizer(args.optimizer_name, learning_rate) | ||
75 | + | ||
76 | +update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) | ||
77 | + | ||
78 | +with tf.control_dependencies(update_ops): | ||
79 | + gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars) | ||
80 | + clip_grad_var = [gv if gv[0] is None else [ | ||
81 | + tf.clip_by_norm(gv[0], 100.), gv[1]] for gv in gvs] | ||
82 | + train_op = optimizer.apply_gradients(clip_grad_var, global_step=global_step) | ||
83 | + | ||
84 | +if args.save_optimizer: | ||
85 | + print('Saving optimizer parameters: ON') | ||
86 | + saver_to_save = tf.train.Saver() | ||
87 | + saver_best = tf.train.Saver() | ||
88 | +else: | ||
89 | + print('Saving optimizer parameters: OFF') | ||
90 | +pred_boxes_flag, pred_scores_flag = [tf.placeholder(tf.float32, [1, None, None]) for _ in range(2)]  # reconstructed: consumed by the GPU eval helpers below | ||
91 | +gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, nms_thresh=args.nms_threshold) | ||
92 | +##### Start training | ||
93 | + | ||
94 | +with tf.Session() as sess: | ||
95 | + sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) | ||
96 | + | ||
97 | + if os.path.exists(args.restore_path): | ||
98 | + saver_to_restore.restore(sess, args.restore_path) | ||
99 | + | ||
100 | + print('\nStart training...\n') | ||
101 | + | ||
102 | + best_mAP = -np.Inf | ||
103 | + | ||
104 | + for epoch in range(args.total_epoches): | ||
105 | + sess.run(train_init_op) | ||
106 | + loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter() | ||
107 | + | ||
108 | + ### train part | ||
109 | + for i in trange(args.train_batch_num): | ||
110 | + _, __y_pred, __y_true, __loss, __global_step, __lr = sess.run( | ||
111 | + [train_op, y_pred, y_true, loss, global_step, learning_rate], | ||
112 | + feed_dict={is_training: True}) | ||
113 | + | ||
114 | + loss_total.update(__loss[0], len(__y_pred[0])) | ||
115 | + loss_xy.update(__loss[1], len(__y_pred[0])) | ||
116 | + loss_wh.update(__loss[2], len(__y_pred[0])) | ||
117 | + loss_conf.update(__loss[3], len(__y_pred[0])) | ||
118 | + loss_class.update(__loss[4], len(__y_pred[0])) | ||
119 | + | ||
120 | + if __global_step % args.train_evaluation_step == 0 and __global_step > 0: | ||
121 | + recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __y_pred, __y_true, args.class_num, args.nms_threshold) | ||
122 | + | ||
123 | + info = "Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format( | ||
124 | + epoch, int(__global_step), loss_total.average, loss_xy.average, loss_wh.average, loss_conf.average, loss_class.average) | ||
125 | + info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: {:.5g}'.format(recall, precision, __lr) | ||
126 | + print(info) | ||
127 | + | ||
128 | + if np.isnan(loss_total.average): | ||
129 | + print('****' * 10) | ||
130 | + raise ArithmeticError('Gradient exploded!') | ||
131 | + | ||
132 | + ## train end (saving parameters) | ||
133 | + if args.save_optimizer and epoch % args.save_epoch == 0 and epoch > 0: | ||
134 | + if loss_total.average <= 2.: | ||
135 | + saver_to_save.save(sess, args.save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format(epoch, int(__global_step), loss_total.average, __lr)) | ||
136 | + | ||
137 | + ### validation part | ||
138 | + if epoch % args.val_evaluation_epoch == 0 and epoch >= args.warm_up_epoch: | ||
139 | + sess.run(val_init_op) | ||
140 | + | ||
141 | + val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter() | ||
142 | + | ||
143 | + val_preds = [] | ||
144 | + | ||
145 | + for j in trange(args.val_img_cnt): | ||
146 | + __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss], | ||
147 | + feed_dict={is_training: False}) | ||
148 | + pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred) | ||
149 | + val_preds.extend(pred_content) | ||
150 | + val_loss_total.update(__loss[0]) | ||
151 | + val_loss_xy.update(__loss[1]) | ||
152 | + val_loss_wh.update(__loss[2]) | ||
153 | + val_loss_conf.update(__loss[3]) | ||
154 | + val_loss_class.update(__loss[4]) | ||
155 | + | ||
156 | + # calc mAP | ||
157 | + rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter() | ||
158 | + gt_dict = parse_gt_rec(args.val_file, args.img_size, args.letterbox_resize) | ||
159 | + | ||
160 | + info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr) | ||
161 | + | ||
162 | + for ii in range(args.class_num): | ||
163 | + npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=args.eval_threshold, use_07_metric=args.use_voc_07_metric) | ||
164 | + info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(ii, rec, prec, ap) | ||
165 | + rec_total.update(rec, npos) | ||
166 | + prec_total.update(prec, nd) | ||
167 | + ap_total.update(ap, 1) | ||
168 | + | ||
169 | + mAP = ap_total.average | ||
170 | + info += 'EVAL: Recall: {:.4f}, Precision: {:.4f}, mAP: {:.4f}\n'.format(rec_total.average, prec_total.average, mAP) | ||
171 | + info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\n'.format( | ||
172 | + val_loss_total.average, val_loss_xy.average, val_loss_wh.average, val_loss_conf.average, val_loss_class.average) | ||
173 | + print(info) | ||
174 | + | ||
175 | + if args.save_optimizer and mAP > best_mAP: | ||
176 | + best_mAP = mAP | ||
177 | + saver_best.save(sess, args.save_dir + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format( | ||
178 | + epoch, int(__global_step), best_mAP, val_loss_total.average, __lr)) | ||
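The warm-up branch near the top of `train.py` ramps the learning rate linearly over the first `warm_up_epoch` epochs before handing off to `config_learning_rate`. The same schedule in plain Python, with toy numbers (100 batches per epoch, 3 warm-up epochs — not the real config):

    learning_rate_init, train_batch_num, warm_up_epoch = 1e-4, 100, 3
    for step in (1, 150, 300, 450):
        if step < train_batch_num * warm_up_epoch:
            lr = learning_rate_init * step / (train_batch_num * warm_up_epoch)
        else:
            lr = learning_rate_init   # the configured decay policy takes over here
        print(step, lr)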
code/yolov3/video_test.py
0 → 100644
1 | +from __future__ import division, print_function | ||
2 | + | ||
3 | +import tensorflow as tf | ||
4 | +import numpy as np | ||
5 | +import argparse | ||
6 | +import cv2 | ||
7 | +import time | ||
8 | + | ||
9 | +from misc_utils import parse_anchors, read_class_names | ||
10 | +from nms_utils import gpu_nms | ||
11 | +from plot_utils import get_color_table, plot_one_box | ||
12 | +from data_utils import letterbox_resize | ||
13 | + | ||
14 | +from model import yolov3 | ||
15 | + | ||
16 | +parser = argparse.ArgumentParser(description="YOLO-V3 video test procedure.") | ||
17 | +parser.add_argument("input_video", type=str, | ||
18 | + help="The path of the input video.") | ||
19 | +parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt", | ||
20 | + help="The path of the anchor txt file.") | ||
21 | +parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416], | ||
22 | + help="Resize the input image with `new_size`, size format: [width, height]") | ||
23 | +parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=True, | ||
24 | + help="Whether to use the letterbox resize.") | ||
25 | +parser.add_argument("--class_name_path", type=str, default="./data/classes.txt", | ||
26 | + help="The path of the class names.") | ||
27 | +parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt", | ||
28 | + help="The path of the weights to restore.") | ||
29 | +parser.add_argument("--save_video", type=lambda x: (str(x).lower() == 'true'), default=False, | ||
30 | + help="Whether to save the video detection results.") | ||
31 | +args = parser.parse_args() | ||
32 | + | ||
33 | +args.anchors = parse_anchors(args.anchor_path) | ||
34 | +args.classes = read_class_names(args.class_name_path) | ||
35 | +args.num_class = len(args.classes) | ||
36 | + | ||
37 | +color_table = get_color_table(args.num_class) | ||
38 | + | ||
39 | +vid = cv2.VideoCapture(args.input_video) | ||
40 | +video_frame_cnt = int(vid.get(cv2.CAP_PROP_FRAME_COUNT)) | ||
41 | +video_width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) | ||
42 | +video_height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) | ||
43 | +video_fps = int(vid.get(cv2.CAP_PROP_FPS)) | ||
44 | + | ||
45 | +if args.save_video: | ||
46 | + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') | ||
47 | + videoWriter = cv2.VideoWriter('video_result.mp4', fourcc, video_fps, (video_width, video_height)) | ||
48 | + | ||
49 | +with tf.Session() as sess: | ||
50 | + input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data') | ||
51 | + yolo_model = yolov3(args.num_class, args.anchors) | ||
52 | + with tf.variable_scope('yolov3'): | ||
53 | + pred_feature_maps = yolo_model.forward(input_data, False) | ||
54 | + pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps) | ||
55 | + | ||
56 | + pred_scores = pred_confs * pred_probs | ||
57 | + | ||
58 | + boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=200, score_thresh=0.3, nms_thresh=0.45) | ||
59 | + | ||
60 | + saver = tf.train.Saver() | ||
61 | + saver.restore(sess, args.restore_path) | ||
62 | + | ||
63 | + for i in range(video_frame_cnt): | ||
64 | + ret, img_ori = vid.read() | ||
65 | + if args.letterbox_resize: | ||
66 | + img, resize_ratio, dw, dh = letterbox_resize(img_ori, args.new_size[0], args.new_size[1]) | ||
67 | + else: | ||
68 | + height_ori, width_ori = img_ori.shape[:2] | ||
69 | + img = cv2.resize(img_ori, tuple(args.new_size)) | ||
70 | + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | ||
71 | + img = np.asarray(img, np.float32) | ||
72 | + img = img[np.newaxis, :] / 255. | ||
73 | + | ||
74 | + start_time = time.time() | ||
75 | + boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img}) | ||
76 | + end_time = time.time() | ||
77 | + | ||
78 | + # rescale the coordinates to the original image | ||
79 | + if args.letterbox_resize: | ||
80 | + boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio | ||
81 | + boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio | ||
82 | + else: | ||
83 | + boxes_[:, [0, 2]] *= (width_ori/float(args.new_size[0])) | ||
84 | + boxes_[:, [1, 3]] *= (height_ori/float(args.new_size[1])) | ||
85 | + | ||
86 | + | ||
87 | + for i in range(len(boxes_)): | ||
88 | + x0, y0, x1, y1 = boxes_[i] | ||
89 | + plot_one_box(img_ori, [x0, y0, x1, y1], label=args.classes[labels_[i]] + ', {:.2f}%'.format(scores_[i] * 100), color=color_table[labels_[i]]) | ||
90 | + cv2.putText(img_ori, '{:.2f}ms'.format((end_time - start_time) * 1000), (40, 40), 0, | ||
91 | + fontScale=1, color=(0, 255, 0), thickness=2) | ||
92 | + cv2.imshow('image', img_ori) | ||
93 | + if args.save_video: | ||
94 | + videoWriter.write(img_ori) | ||
95 | + if cv2.waitKey(1) & 0xFF == ord('q'): | ||
96 | + break | ||
97 | + | ||
98 | + vid.release() | ||
99 | + if args.save_video: | ||
100 | + videoWriter.release() | ||
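One caveat: `vid.read()` is never checked, and container metadata can over-report `CAP_PROP_FRAME_COUNT`, in which case a failed read would pass `None` into the resize step. A defensive variant of the top of the loop (sketch):

    for i in range(video_frame_cnt):
        ret, img_ori = vid.read()
        if not ret:   # stop early if the reported frame count was optimistic
            break
        # ... rest of the loop body unchanged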