import sys, io, os.path as path, functools as ft      # basic packages
import urllib.request as url_req, numpy as np, cv2    # extension packages
import torch, torch.nn as nn, torchvision.ops as ops  # pytorch packages
data_path = '.' if len(sys.argv) < 2 else sys.argv[1]
image_file_url = 'https://github.com/pjreddie/darknet/raw/master/data/dog.jpg'
image_filename = path.join(data_path, 'dog.jpg')
weight_file_url = 'https://pjreddie.com/media/files/yolov3.weights'
weight_filename = path.join(data_path, 'yolov3.weights')
num_cls = 80
conf_thres = 0.24
nms_thres = 0.45
input_size = (416, 416)
anchors = [(10,13), (16,30), (33,23),       # 0, 1, 2
           (30,61), (62,45), (59,119),      # 3, 4, 5
           (116,90), (156,198), (373,326)]  # 6, 7, 8
masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]   # yolo1, yolo2, yolo3
class YoloConv(nn.Module):
    def __init__(self, bn, c, n, size, stride, pad, leaky_relu=True):
        super().__init__()
        self.conv = nn.Conv2d(c, n, (size, size), (stride, stride),
                              (pad, pad), bias=not bn)
        if bn:
            self.bn = nn.BatchNorm2d(num_features=n, affine=True)
        if leaky_relu:
            self.active = nn.LeakyReLU(negative_slope=0.1)
    def forward(self, x):
        x = self.conv(x)
        if hasattr(self, 'bn'):
            x = self.bn(x)
        if hasattr(self, 'active'):
            x = self.active(x)
        return x
class YoloResBlock(nn.Module):
    def __init__(self, c, n, res):
        super().__init__()
        self.res = res
        self.yolo_conv_1 = YoloConv(True, c, n // 2, 1, 1, 0)
        self.yolo_conv_2 = YoloConv(True, n // 2, n, 3, 1, 1)
    def forward(self, x):
        y = self.yolo_conv_1(x)
        y = self.yolo_conv_2(y)
        if self.res:
            y = x + y
        return y
class Yolov3Backbone(nn.Module):
    def __init__(self, num_vals):
        super().__init__()
        seg1_mods = [YoloConv(True, 3, 32, 3, 1, 1),      # input: data
                     YoloConv(True, 32, 64, 3, 2, 1),
                     YoloResBlock(64, 64, True),
                     YoloConv(True, 64, 128, 3, 2, 1),
                     YoloResBlock(128, 128, True),
                     YoloResBlock(128, 128, True),
                     YoloConv(True, 128, 256, 3, 2, 1)]
        for i in range(8):
            seg1_mods.append(YoloResBlock(256, 256, True))
        seg2_mods = [YoloConv(True, 256, 512, 3, 2, 1)]   # input: seg1_out
        for i in range(8):
            seg2_mods.append(YoloResBlock(512, 512, True))
        seg3_mods = [YoloConv(True, 512, 1024, 3, 2, 1),  # input: seg2_out
                     YoloResBlock(1024, 1024, True),
                     YoloResBlock(1024, 1024, True),
                     YoloResBlock(1024, 1024, True),
                     YoloResBlock(1024, 1024, True),
                     YoloResBlock(1024, 1024, False),
                     YoloResBlock(1024, 1024, False),
                     YoloConv(True, 1024, 512, 1, 1, 0)]
        yolo1_mods = [YoloConv(True, 512, 1024, 3, 1, 1), # input: seg3_out
                      YoloConv(False, 1024, num_vals[0], 1, 1, 0, False)]
        seg4_mods = [YoloConv(True, 512, 256, 1, 1, 0),   # input: seg3_out
                     nn.Upsample(scale_factor=2)]
        seg5_mods = [YoloResBlock(768, 512, False),       # input: seg4_out, seg2_out
                     YoloResBlock(512, 512, False),
                     YoloConv(True, 512, 256, 1, 1, 0)]
        yolo2_mods = [YoloConv(True, 256, 512, 3, 1, 1),  # input: seg5_out
                      YoloConv(False, 512, num_vals[1], 1, 1, 0, False)]
        seg6_mods = [YoloConv(True, 256, 128, 1, 1, 0),   # input: seg5_out
                     nn.Upsample(scale_factor=2)]
        yolo3_mods = [YoloResBlock(384, 256, False),      # input: seg6_out, seg1_out
                      YoloResBlock(256, 256, False),
                      YoloResBlock(256, 256, False),
                      YoloConv(False, 256, num_vals[2], 1, 1, 0, False)]
        # DO NOT REORDER THE FOLLOWING SEQUENTIALS:
        # this order is consistent with the cfg file `yolov3.cfg`
        # and the weights file `yolov3.weights`
        self.seg1 = nn.Sequential(*seg1_mods)
        self.seg2 = nn.Sequential(*seg2_mods)
        self.seg3 = nn.Sequential(*seg3_mods)
        self.yolo1 = nn.Sequential(*yolo1_mods)
        self.seg4 = nn.Sequential(*seg4_mods)
        self.seg5 = nn.Sequential(*seg5_mods)
        self.yolo2 = nn.Sequential(*yolo2_mods)
        self.seg6 = nn.Sequential(*seg6_mods)
        self.yolo3 = nn.Sequential(*yolo3_mods)
    def forward(self, x):
        seg1_out = self.seg1.forward(x)
        seg2_out = self.seg2.forward(seg1_out)
        seg3_out = self.seg3.forward(seg2_out)
        yolo1_out = self.yolo1.forward(seg3_out)
        seg4_out = self.seg4.forward(seg3_out)
        cat42 = torch.cat((seg4_out, seg2_out), 1)
        seg5_out = self.seg5.forward(cat42)
        yolo2_out = self.yolo2.forward(seg5_out)
        seg6_out = self.seg6.forward(seg5_out)
        cat61 = torch.cat((seg6_out, seg1_out), 1)
        yolo3_out = self.yolo3.forward(cat61)
        return yolo1_out, yolo2_out, yolo3_out
class YoloLayer(nn.Module):
    def __init__(self, anchors, masks):
        super().__init__()
        self.anchors = anchors
        self.masks = masks
        self.register_buffer('scale_w', torch.zeros(1))
        self.register_buffer('scale_h', torch.zeros(1))
        self.register_buffer('anc_off_c', torch.zeros(1))
        self.register_buffer('anc_off_r', torch.zeros(1))
    def forward(self, input):
        # input: batch*anc*vals*h*w -> vals*batch*anc*h*w
        input = input.reshape(self.calc_shape).permute(2, 0, 1, 3, 4)
        box_cx = torch.sigmoid(input[0:1])     # sigmoid(px)
        box_cy = torch.sigmoid(input[1:2])     # sigmoid(py)
        box_w = input[2:3]
        box_h = input[3:4]
        conf_probs = torch.sigmoid(input[4:])  # objectness and probabilities
        box_ws = torch.exp(box_w) * self.scale_w  # exp(pw) * anc_w / img_w
        box_hs = torch.exp(box_h) * self.scale_h  # exp(ph) * anc_h / img_h
        box_x1 = (box_cx + self.anc_off_c) / self.anc_cols - box_ws / 2
        box_y1 = (box_cy + self.anc_off_r) / self.anc_rows - box_hs / 2
        box_x2 = box_x1 + box_ws  # x2 = x1 + w
        box_y2 = box_y1 + box_hs  # y2 = y1 + h
        output = torch.cat((box_cx, box_cy, box_w, box_h,
                            box_x1, box_y1, box_x2, box_y2, conf_probs))
        # output: vals*batch*anc*h*w -> batch*anc*h*w*vals
        return output.permute(1, 2, 3, 4, 0)
    @torch.no_grad()
    def resize(self, input, image_size):
        num_ancs = len(self.masks)
        num_vals = int(input.shape[1] // num_ancs)
        self.anc_rows = int(input.shape[2])
        self.anc_cols = int(input.shape[3])
        self.calc_shape = [-1, num_ancs, num_vals, self.anc_rows, self.anc_cols]
        scale_w = torch.tensor([self.anchors[m][0] for m in self.masks],
                               device=self.scale_w.device)
        scale_h = torch.tensor([self.anchors[m][1] for m in self.masks],
                               device=self.scale_h.device)
        self.scale_w = scale_w.reshape([1, 1, num_ancs, 1, 1]) / image_size[0]
        self.scale_h = scale_h.reshape([1, 1, num_ancs, 1, 1]) / image_size[1]
        off_c = torch.arange(self.anc_cols, dtype=torch.float32,
                             device=self.anc_off_c.device)
        off_r = torch.arange(self.anc_rows, dtype=torch.float32,
                             device=self.anc_off_r.device)
        self.anc_off_c = off_c.reshape([1, 1, 1, 1, self.anc_cols])
        self.anc_off_r = off_r.reshape([1, 1, 1, self.anc_rows, 1])
class Yolov3Model(nn.Module):
    def __init__(self, num_cls, image_size, anchors, masks):
        super().__init__()
        image_size = torch.tensor([image_size[0], image_size[1]])
        self.register_buffer('image_size', image_size)
        self.backbone = Yolov3Backbone([len(m) * (5 + num_cls) for m in masks])
        example_input = torch.zeros([1, 3, image_size[1], image_size[0]])
        example_output = self.backbone.forward(example_input)
        yolo_layers = [YoloLayer(anchors, m) for m in masks]
        for i, input in enumerate(example_output):
            yolo_layers[i].resize(input, image_size)
        self.yolo_mods = nn.ModuleList(yolo_layers)
    def forward(self, input):
        outputs = self.backbone(input)
        self.image_size[0].fill_(input.shape[3])
        self.image_size[1].fill_(input.shape[2])
        if self.training:
            self.yolo_mods[0].resize(outputs[0], self.image_size)
            self.yolo_mods[1].resize(outputs[1], self.image_size)
            self.yolo_mods[2].resize(outputs[2], self.image_size)
        output0 = self.yolo_mods[0](outputs[0])
        output1 = self.yolo_mods[1](outputs[1])
        output2 = self.yolo_mods[2](outputs[2])
        return (output0, output1, output2)
def load_param(tensor, bin_stream):
    data = bin_stream.read(tensor.numel() * 4)
    if len(data) == tensor.numel() * 4:
        float_ary = np.frombuffer(data, dtype='<f4')
        tensor.copy_(torch.tensor(float_ary.copy()).reshape_as(tensor))
@torch.no_grad()
def load_params_file(filename, module):
    bin_stream = open(filename, 'rb')
    bin_stream.read(20)  # skip the 20-byte darknet header (major, minor, revision, seen)
    for m in module.modules():
        if isinstance(m, YoloConv):
            if hasattr(m, 'bn'):
                load_param(m.bn.bias, bin_stream)
                load_param(m.bn.weight, bin_stream)
                load_param(m.bn.running_mean, bin_stream)
                load_param(m.bn.running_var, bin_stream)
            else:
                load_param(m.conv.bias, bin_stream)
            load_param(m.conv.weight, bin_stream)  # all convs have weight
def nms_comp(k):
    def op(box_a, box_b):  # sort descending by the score of class k
        if box_a[5 + k] < box_b[5 + k]: return 1
        elif box_a[5 + k] > box_b[5 + k]: return -1
        else: return 0
    return op
def nms(all_boxes):
    # greedy per-class NMS: zero the class score of lower-scored overlapping boxes
    for k in range(0, num_cls):
        all_boxes = sorted(all_boxes, key=ft.cmp_to_key(nms_comp(k)))
        all_boxes = torch.cat(all_boxes).reshape(-1, 5 + num_cls)
        for i in range(0, all_boxes.shape[0] - 1):
            if all_boxes[i][5+k] != 0:
                boxes_i = all_boxes[i:i+1,:4]
                IoUs = ops.box_iou(boxes_i, all_boxes[i+1:,:4])
                for j, iou in enumerate(IoUs[0]):
                    if iou > nms_thres:
                        all_boxes[i+1+j,5+k] = 0
    return all_boxes
# Download image and weight files if they do not exist
if not path.exists(image_filename):
    print('Downloading test image ...')
    url_req.urlretrieve(image_file_url, image_filename)
if not path.exists(weight_filename):
    print('Downloading weights file ...')
    url_req.urlretrieve(weight_file_url, weight_filename)
# Load image data and do preprocessing
img = cv2.imread(image_filename, cv2.IMREAD_COLOR)
data = cv2.cvtColor(cv2.resize(img, input_size), cv2.COLOR_BGR2RGB)  # OpenCV loads BGR, the network expects RGB
data = torch.tensor(data) / 255.0
data = data.permute(2, 0, 1).reshape(1, 3, input_size[1], input_size[0])
# Create model and do inference
model = Yolov3Model(num_cls, input_size, anchors, masks)
load_params_file(weight_filename, model)
# Testing for script and trace
model.train()
script_model = torch.jit.script(model)
model.eval()
trace_model = torch.jit.trace(model, data)
torch.jit.save(script_model, path.join(data_path, 'yolov3.pth'))
model_outputs = model.forward(data)
reshaped_outputs = []
for output in model_outputs:
    output = output.detach().reshape(1, -1, output.shape[-1])
    reshaped_outputs.append(output[:,:,output.shape[-1] - 5 - num_cls:])
predict = torch.cat(reshaped_outputs, dim=1)
predict[:,:,5:] *= predict[:,:,4].unsqueeze(2)            # prob *= conf
predict[:,:,5:] *= torch.gt(predict[:,:,5:], conf_thres)  # low-confidence scores are masked out
# NMS
best_boxes = []
for pred in predict:
    all_boxes = []
    for boxes in pred:
        conf = boxes[4]
        if conf > conf_thres:
            all_boxes.append(boxes.view(1, -1))
    if len(all_boxes) > 0:
        all_boxes = nms(torch.cat(all_boxes))
        cls_ids = torch.argmax(all_boxes[:, 5:], dim=1)
        for box_idx, best_cls_id in enumerate(cls_ids):
            if all_boxes[box_idx, 5 + best_cls_id] > conf_thres:
                box = torch.cat((all_boxes[box_idx, :4],
                                 torch.tensor([np.float32(best_cls_id)])))
                best_boxes.append(box)
img_size = (img.shape[1], img.shape[0])
for box in best_boxes:  # Draw and save prediction results
    pt1 = (int(box[0] * img_size[0]), int(box[1] * img_size[1]))
    pt2 = (int(box[2] * img_size[0]), int(box[3] * img_size[1]))
    cv2.rectangle(img, pt1, pt2, (0, 255, 0))
cv2.imwrite(path.join(data_path, 'predict.png'), img)
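The script saves a TorchScript version of the network as `yolov3.pth`. A minimal sketch of reloading it for inference (assuming the `torch.jit.script` step above succeeded with your PyTorch version and the file is in the current directory):

```python
import torch

model = torch.jit.load('yolov3.pth')  # TorchScript model saved by the script above
model.eval()

dummy = torch.zeros(1, 3, 416, 416)   # NCHW dummy input matching input_size = (416, 416)
with torch.no_grad():
    outs = model(dummy)
# Three detection heads, one per yolo layer; each is [batch, anchors, rows, cols, values]
print([tuple(o.shape) for o in outs])
```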
Notations

- `(anc_w, anc_h)`: pixel size of an anchor, provided by `yolov3.cfg`.
- `(img_w, img_h)`: the size of the input data of the network.
- `[batch, anchors, values, rows, cols]`: the shape of the feature-map output (grid size `(cols, rows)`) just before a yolo layer.
- `[px, py, pw, ph, pc, probs[0, ..., 79]]`: the 85 `values` at each pixel `(c, r)` of the output.

Convert Output to Predicted Boxes
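For cell `(c, r)` in a grid of size `(cols, rows)`, the decoding implemented in `YoloLayer.forward` can be summarized as follows (a reconstruction from the code and its comments; `σ` is the sigmoid, and all coordinates are normalized to `[0, 1]` relative to the network input):

$$
\begin{aligned}
b_w &= e^{p_w}\cdot\frac{anc_w}{img_w}, & b_h &= e^{p_h}\cdot\frac{anc_h}{img_h},\\
x_1 &= \frac{\sigma(p_x)+c}{cols}-\frac{b_w}{2}, & y_1 &= \frac{\sigma(p_y)+r}{rows}-\frac{b_h}{2},\\
x_2 &= x_1 + b_w, & y_2 &= y_1 + b_h.
\end{aligned}
$$

The objectness `pc` and the 80 class logits are passed through a sigmoid inside the layer; the main script then multiplies each class probability by the objectness and zeroes everything below `conf_thres = 0.24`.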
Then do NMS for all boxes: for each of the 80 classes, boxes are sorted by that class score, and any lower-scored box whose IoU with a kept box exceeds `nms_thres = 0.45` has its score for that class zeroed.
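For reference, a roughly equivalent per-class suppression can be expressed with `torchvision.ops.nms` (a sketch only, not the gist's `nms` function: it keeps or drops whole boxes instead of zeroing individual class scores):

```python
import torch, torchvision.ops as ops

def nms_per_class(all_boxes, num_cls=80, iou_thres=0.45):
    # all_boxes: [N, 5 + num_cls] rows of x1, y1, x2, y2, conf, probs[0..79]
    keep = torch.zeros(all_boxes.shape[0], dtype=torch.bool)
    for k in range(num_cls):
        scores = all_boxes[:, 5 + k]
        idx = (scores > 0).nonzero(as_tuple=True)[0]
        if idx.numel() > 0:
            kept = ops.nms(all_boxes[idx, :4], scores[idx], iou_thres)
            keep[idx[kept]] = True
    return all_boxes[keep]
```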