Matheus Schmitz
LinkedIn
Github Portfolio
Intro
Implementation of the YOLO v3 architecture for object detection in images.
Configuration File: https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg
Work Based On: https://blog.paperspace.com/how-to-implement-a-yolo-object-detector-in-pytorch/
Source Github: https://github.com/ayooshkathuria/YOLO_v3_tutorial_from_scratch
!pip install opencv-python==4.2.0.34
Requirement already satisfied: opencv-python==4.2.0.34 in c:\users\matheus\anaconda3\lib\site-packages (4.2.0.34)
Requirement already satisfied: numpy>=1.14.5 in c:\users\matheus\appdata\roaming\python\python37\site-packages (from opencv-python==4.2.0.34) (1.19.5)
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Ignore warnings
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter('ignore')
# Function to read the configuration file
# Returns the blocks used to build the neural network
def parse_config_file(config_file):
# Open the file for reading
arquivo = open(config_file, 'r')
# Read the lines and convert to a list
# Remove blank lines
# Remove comments
# Remove white spaces
linhas = arquivo.read().split('\n')
linhas = [x for x in linhas if len(x) > 0]
linhas = [x for x in linhas if x[0] != '#']
linhas = [x.rstrip().lstrip() for x in linhas]
# Dictionary and list of hyperparameter blocks
block = {}
blocks = []
# Loop through the lines
for linha in linhas:
# Get the type (class) of the block of hyperparameters
if linha[0] == "[":
# We are starting a new block, so first add the previous/current one to the list of blocks, then start the new one
if len(block) != 0:
blocks.append(block)
block = {}
# Name the block type accordingly (minus the [] brackets)
block["type"] = linha[1:-1].rstrip()
# If/while inside a block, get the hyperparameter and value to use
else:
key, value = linha.split("=")
block[key.rstrip()] = value.lstrip()
# Need an extra line to append the last block (because usually we append the block before starting the next one)
blocks.append(block)
return blocks
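To make the parser's output concrete, here is a minimal, hedged check (the file name sample.cfg is hypothetical): each [section] header starts a new block, and every key=value pair below it lands in that block's dictionary.
# Minimal sketch of parse_config_file on a tiny two-block configuration
sample_cfg = "[net]\nheight=416\nwidth=416\n\n[convolutional]\nfilters=32\nactivation=leaky"
with open('sample.cfg', 'w') as f:
    f.write(sample_cfg)
print(parse_config_file('sample.cfg'))
# Expected output:
# [{'type': 'net', 'height': '416', 'width': '416'},
#  {'type': 'convolutional', 'filters': '32', 'activation': 'leaky'}]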
# Function to create the network modules
# Returns PyTorch objects
def create_modules(blocks):
# Info about the neural network input parameters
net_info = blocks[0]
# Create an object to build the modules
# https://pytorch.org/docs/master/generated/torch.nn.ModuleList.html
lista_modulos = nn.ModuleList()
# Number of color channels in the images
num_filters = 3
# List used by the route layers to keep a record of each layer's output depth (number of filters)
output_filters = []
# Iterate through the blocks and create the neural network's modules (layers)
for index, x in enumerate(blocks[1:]):
# Create the current module (sequence of steps)
# https://pytorch.org/docs/master/generated/torch.nn.Sequential.html
module = nn.Sequential()
# Check the block type
if x['type'] == 'convolutional':
# Extract the hyperparameters for a convolutional layer
activation = x['activation']
filters = int(x['filters'])
padding = int(x['pad'])
kernel_size = int(x['size'])
stride = int(x['stride'])
# Add batch normalization if the layer has it
try:
batch_normalize = int(x['batch_normalize'])
bias = False
except:
batch_normalize = 0
bias = True
# Adjust the padding
if padding:
pad = (kernel_size - 1)//2
else:
pad = 0
# Create the convolutional layer
# https://pytorch.org/docs/master/generated/torch.nn.Conv2d.html
conv = nn.Conv2d(in_channels = num_filters,
out_channels = filters,
kernel_size = kernel_size,
stride = stride,
padding = pad,
bias = bias)
# Add the convolutional layer to the list of modules
module.add_module(f'conv_{index}', conv)
# Add the batch normalization layer
# https://pytorch.org/docs/master/generated/torch.nn.BatchNorm2d.html
if batch_normalize:
bn = nn.BatchNorm2d(num_features = filters)
module.add_module(f'batch_norm_{index}', bn)
# Check the activation type and add an activation layer (YOLO v3 uses LeakyReLU everywhere except the linear activations before the detection layers, which need no module)
# https://pytorch.org/docs/master/generated/torch.nn.LeakyReLU.html
if activation == 'leaky':
activn = nn.LeakyReLU(negative_slope = 0.1, inplace = True)
module.add_module(f'leaky_{index}', activn)
# Upsampling layer - increases the spatial resolution of the feature maps (by a factor of 2 here)
# https://pytorch.org/docs/stable/generated/torch.nn.Upsample.html
elif x['type'] == 'upsample':
stride = int(x['stride'])
upsample = nn.Upsample(scale_factor = 2, mode = 'bilinear')
module.add_module(f'upsample_{index}', upsample)
# Route layer - used to calculate the output depth (filters) resulting from concatenation. Similar to concat layers.
# When the attribute has a single value, the layer outputs the feature maps of the layer indexed by that value.
# E.g. if route = -4, then the route layer will have the feature maps of a layer 4 steps behind it.
# When the attribute has two values, the layer returns a concatenation of the feature maps of both layers by index number.
# E.g. if route = -1, 61, the layer will have the feature maps of the previous layer (-1) and the 61st layer, ...
# ... with both feature maps concatenated along the depth dimension.
# https://github.com/pjreddie/darknet/issues/545
# https://github.com/AlexeyAB/darknet/issues/487#issuecomment-374902735
# https://github.com/AlexeyAB/darknet/issues/279#issuecomment-397248821
# https://github.com/AlexeyAB/darknet#how-to-train-to-detect-your-custom-objects
elif x['type'] == 'route':
x['layers'] = x['layers'].split(',')
# Route "start"
start = int(x['layers'][0])
# Check if there are two attributes/values
# If yes, set the second attribute as the route "end"
try:
end = int(x['layers'][1])
# If not, set the end to zero
except:
end = 0
# Calculate relative positions
if start > 0:
start = start - index
if end > 0:
end = end - index
# Create the layer
route = EmptyLayer()
# Add the layer to the neural network module
module.add_module(f'route_{index}', route)
# Extract the filters
if end < 0:
filters = output_filters[index + start] + output_filters[index + end]
else:
filters = output_filters[index+start]
# Shortcut layer - same as a skip layer in ResNet.
# E.g. if the hyperparameter is -3, then the shortcut layer's output is obtained by merging the feature maps ...
# ... from the previous layer and the layer 3 steps behind the shortcut layer.
elif x['type'] == 'shortcut':
# Create the layer
shortcut = EmptyLayer()
# Add layer to the model
module.add_module(f'shortcut_{index}', shortcut)
# YOLO layer with anchor detection
# The YOLO layer is the detection layer. The anchors attribute lists 9 anchors in total, yet only those indexed by ...
# ... the mask attribute are used. E.g. if the mask value is 0, 1, 2, that means the first, second and third anchors ...
# ... will be used. This makes sense given that each cell in the detection layer predicts 3 boxes. In total, we have ...
# ... detection in three scales, resulting in 9 anchors.
# Anchors: Predetermined set of bounding boxes with specific height-width ratios.
# Mask: List of anchor IDs which the layer is responsible for predicting.
# Num: total number of anchors.
# YOLO v3 predicts a predetermined set of anchors, which have initial sizes (height, width), some of which (the one ...
# ... closest to the object size) will be resized to the object's size.
# Each YOLO layer must know all anchors, but is responsible for only a subset of them.
# The mask tells the layer which anchors it should use for predicting. The first YOLO layer is assigned anchors 6,7,8, ...
# ... the second gets 3,4,5, and the third gets 0,1,2.
elif x['type'] == 'yolo':
# Extract the mask values
mask = list(map(int, x['mask'].split(',')))
# Extract the anchor values
anchors = list(map(int, x['anchors'].split(',')))
anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]
# Filter the list of anchors using the mask
anchors = [anchors[i] for i in mask]
# Create the anchor detection layer
detection = DetectAnchor(anchors)
# Add layer to the model
module.add_module(f'detection_{index}', detection)
# Load the list of modules (layer groups), filters and output filters
lista_modulos.append(module)
num_filters = filters
output_filters.append(filters)
return (net_info, lista_modulos)
# Function summary:
# YOLO has 5 layer types: Convolutional, Upsample, Route, Shortcut and YOLO.
# All customization of a YOLO model is done by adjusting hyperparameter values in the configuration file.
# The configuration file describes the YOLO network layout block by block.
# The YOLO architecture is also known as Darknet.
# Function to make predictions
# Takes a detection feature map and transforms it into a 2D tensor (per image), in which each row corresponds to ...
# ... the attributes of one bounding box (anchor).
# Arguments:
# prediction (tensor): previous output
# input_dim (int): dimension of the input image
# anchors (list(tuple)): anchors used in the YOLO detection layer
# num_classes (int): total number of classes
# CUDA (bool): optional argument to define whether or not to use GPU
# Function returns:
# prediction (tensor): redimensioned (3D tensor) prediction output of the current YOLO layer.
# The three dimensions are: [batch size, number of bounding boxes, bounding box attributes]
def make_predictions(prediction, input_dim, anchors, num_classes, CUDA = True):
# Hyperparameters for the predictions
batch_size = prediction.size(0)
stride = input_dim // prediction.size(2)
grid_size = input_dim // stride
bbox_attributes = 5 + num_classes
num_anchors = len(anchors)
# Adjust the prediction object's shape
prediction = prediction.view(batch_size, bbox_attributes * num_anchors, grid_size * grid_size)
# Transpose the matrix
prediction = prediction.transpose(1, 2).contiguous()
# New shape adjust
prediction = prediction.view(batch_size, grid_size * grid_size * num_anchors, bbox_attributes)
# Proportionally resize the anchors based on stride
# [(,),(,),(,)] -> tensor([[,],[,],[,]]) size([3,2])
anchors = [(anchor[0]/stride, anchor[1]/stride) for anchor in anchors]
# Sigmoid transformation: centre_X, centre_Y, objectness score
prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])
# Add the grids for the coordinate centers
grid = np.arange(grid_size)
a, b = np.meshgrid(grid, grid)
# Adjust the shapes
x_offset = torch.FloatTensor(a).view(-1, 1)
y_offset = torch.FloatTensor(b).view(-1, 1)
# Verify whether to use GPU
if CUDA:
x_offset = x_offset.cuda()
y_offset = y_offset.cuda()
prediction = prediction.cuda()
# Concatenate x and y for the prediction
x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
prediction[:, :, :2] += x_y_offset
# Convert the object with the anchor values to a float tensor
anchors = torch.FloatTensor(anchors)
# If using GPU, send the anchors to the GPU too
if CUDA:
anchors = anchors.cuda()
# Tile the anchors across every grid cell
anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)
# Apply the log-space transform: exponentiate the predicted width/height and scale by the anchors
prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * anchors
# Sigmoid activation for the class scores
prediction[:, :, 5:5 + num_classes] = torch.sigmoid(prediction[:, :, 5:5 + num_classes])
# Rescale the bbox coordinates to the size of the input image
prediction[:, :, :4] *= stride
return prediction
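A hedged shape check with random data illustrates the transform: for COCO's 80 classes with 3 anchors per cell, a 13x13 detection head yields 13 * 13 * 3 = 507 rows of 5 + 80 = 85 attributes each. The anchor values below are the ones yolov3.cfg assigns to the coarsest scale.
# Hypothetical shape check with random data (no trained weights involved)
dummy_head = torch.randn(1, 3 * (5 + 80), 13, 13)       # raw output of a detection convolution
coarse_anchors = [(116, 90), (156, 198), (373, 326)]    # anchors masked for the 13x13 YOLO layer
out = make_predictions(dummy_head, input_dim = 416, anchors = coarse_anchors, num_classes = 80, CUDA = False)
print(out.shape)    # torch.Size([1, 507, 85])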
# Placeholder class used for the route (concat) and shortcut layers of the neural network
# The actual concatenation/addition happens in Darknet.forward, which simplifies changes to the input data
class EmptyLayer(nn.Module):
def __init__(self):
super(EmptyLayer, self).__init__()
# Anchor detection class
class DetectAnchor(nn.Module):
def __init__(self, anchors):
super(DetectAnchor, self).__init__()
self.anchors = anchors
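With the placeholder and detection classes defined, the parser and module builder can be chained; a hedged sketch (the cfg path matches the one used later in this notebook):
# Parse the configuration file and build the PyTorch module list
blocks = parse_config_file('config/yolov3.cfg')
net_info, lista_modulos = create_modules(blocks)
print(net_info['type'], net_info['height'])    # the [net] block holds the input hyperparameters
print(len(lista_modulos))                      # one nn.Sequential per layer block in the cfg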
# YOLO architecture, also referred to as Darknet in the paper's documentation
# Holds the network's layers
class Darknet(nn.Module):
# Class constructor
def __init__(self, config_file):
# Initialize the class
super(Darknet, self).__init__()
# Read the configuration file
self.blocks = parse_config_file(config_file)
# Create the neural network's modules
self.net_info, self.lista_modulos = create_modules(self.blocks)
# Forward propagation
def forward(self, x, CUDA):
# Modules (network's layers)
modulos = self.blocks[1:]
# Cache of all layer outputs, needed for the route/shortcut layers
outputs = {}
# Track if the first detection layer was found
write = 0
# Loop through the modules
for i, modulo in enumerate(modulos):
# Module type
tipo_modulo = (modulo['type'])
# Convolution and upsample layers
if tipo_modulo == 'convolutional' or tipo_modulo == 'upsample':
# Define the layer
x = self.lista_modulos[i](x)
# Route layers: concatenate two feature maps from other layers
elif tipo_modulo == 'route':
# Layers
camadas = modulo['layers']
camadas = [int(a) for a in camadas]
if camadas[0] > 0:
camadas[0] = camadas[0] - i
if len(camadas) == 1:
x = outputs[i + (camadas[0])]
else:
if camadas[1] > 0:
camadas[1] = camadas[1] - i
feature_map_1 = outputs[i + camadas[0]]
feature_map_2 = outputs[i + camadas[1]]
# Concatenate along the depth dimension
x = torch.cat((feature_map_1, feature_map_2), dim = 1)
# Shortcut layer
elif tipo_modulo == 'shortcut':
# Origin layer
from_layer = int(modulo['from'])
# Addition
x = outputs[i - 1] + outputs[i + from_layer]
# YOLO layer
elif tipo_modulo == 'yolo':
# Neural network's hyperparameters
anchors = self.lista_modulos[i][0].anchors
input_dim = int(self.net_info['height'])
num_classes = int(modulo['classes'])
# Make predictions (aka detect objects in the image)
x = make_predictions(prediction = x,
input_dim = input_dim,
anchors = anchors,
num_classes = num_classes,
CUDA = CUDA)
# If this is the first detection layer, then x represents the detections
if not write:
detections = x
write = 1
# If this is not the first detection layer, concatenate the predictions with the ones from previous layers
else:
detections = torch.cat((detections, x), 1)
outputs[i] = x
return detections
# Load the weights from the pre-trained model (inference only; no backward pass is implemented in this notebook)
def load_weights(self, weight_file):
# Open the weights file
fp = open(weight_file, 'rb')
# The first 5 values are header information
# 1. Major version number
# 2. Minor Version Number
# 3. Subversion number
# 4,5. Images seen by the network (during training)
header = np.fromfile(fp, dtype = np.int32, count=5)
self.header = torch.from_numpy(header)
self.seen = self.header[3]
# Load the weights
weights = np.fromfile(fp, dtype = np.float32)
# Control parameter to track where in the weights file we are
ptr = 0
# Loop through the module types
for i in range(len(self.lista_modulos)):
# Extract the module type
module_type = self.blocks[i+1]['type']
# Load the weights for the convolutional layers
if module_type == 'convolutional':
model = self.lista_modulos[i]
try:
batch_normalize = int(self.blocks[i+1]['batch_normalize'])
except:
batch_normalize = 0
conv = model[0]
if (batch_normalize):
# Load the weights for the batch normalization layer
bn = model[1]
num_bn_biases = bn.bias.numel()
bn_biases = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
bn_weights = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
bn_running_mean = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
bn_running_var = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
# Adjust dimensions
bn_biases = bn_biases.view_as(bn.bias.data)
bn_weights = bn_weights.view_as(bn.weight.data)
bn_running_mean = bn_running_mean.view_as(bn.running_mean)
bn_running_var = bn_running_var.view_as(bn.running_var)
# Copy the data to the model
bn.bias.data.copy_(bn_biases)
bn.weight.data.copy_(bn_weights)
bn.running_mean.copy_(bn_running_mean)
bn.running_var.copy_(bn_running_var)
else:
# Number of biases
num_biases = conv.bias.numel()
# Load the weights
conv_biases = torch.from_numpy(weights[ptr:ptr+num_biases])
ptr += num_biases
# Reshape the loaded weights according to the model weights' dimensions
conv_biases = conv_biases.view_as(conv.bias.data)
# Copy the data to the model
conv.bias.data.copy_(conv_biases)
# Load the convolutional layer's weights (same with or without batch_normalization)
num_weights = conv.weight.numel()
conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
ptr += num_weights
# Reshape the loaded weights according to the model weights' dimensions
conv_weights = conv_weights.view_as(conv.weight.data)
# Copy the data to the model
conv.weight.data.copy_(conv_weights)
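A minimal end-to-end sketch of the class, assuming the cfg and weights files sit at the paths used later in this notebook:
# Build the network, load the pretrained weights, and run a dummy forward pass on CPU
modelo = Darknet('config/yolov3.cfg')
modelo.load_weights('weights/yolov3.weights')
modelo.net_info['height'] = 416    # match the dummy input below
modelo.eval()
with torch.no_grad():
    detections = modelo(torch.randn(1, 3, 416, 416), CUDA = False)
print(detections.shape)    # torch.Size([1, 10647, 85]): (13*13 + 26*26 + 52*52) * 3 boxes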
# Py Data Stack
import numpy as np
import pandas as pd
# System manipulation
import os
import os.path as osp
import time
import argparse
import random
import pickle as pkl
# Image Processing
import cv2
import torch
import torch.nn as nn
from torch.autograd import Variable
# Package versions
import watermark
%load_ext watermark
%watermark --iversions -v
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Python implementation: CPython
Python version       : 3.7.10
IPython version      : 7.23.1

pandas   : 1.2.3
cv2      : 4.2.0
argparse : 1.1
sys      : 3.7.10 (default, Feb 26 2021, 13:06:18) [MSC v.1916 64 bit (AMD64)]
watermark: 2.1.0
torch    : 1.5.1+cu101
numpy    : 1.19.5
# Function to parse command line arguments
def arg_parse():
parser = argparse.ArgumentParser(description = 'YOLO Object Detector')
parser.add_argument("--input", dest = 'input', help = "Directory with images/videos for object detection.", default = "input", type = str)
parser.add_argument("--output", dest = 'output', help = "Directory to store images/videos with detected objects.", default = "output", type = str)
parser.add_argument("--batch", dest = "batch", help = "Batch size.", default = 1)
parser.add_argument("--confidence", dest = "confidence", help = "Confidense threshold to filter predictions.", default = 0.7)
parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS threshold.", default = 0.4)
parser.add_argument("--cfgfile", dest = 'cfgfile', help = "YOLO configuration file.", default = "config/yolov3.cfg", type = str)
parser.add_argument("--weights", dest = 'weights', help = "File with pretrained weights.", default = "pesos/yolov3.weights", type = str)
parser.add_argument("--resolution", dest = 'resolution', help = "Resolution of the input images. Increase to improve accuracy. Decrease to speed up detection.", default = "384", type = str)
return parser.parse_args()
# Function to calculate the IoU of two bounding boxes.
# The Intersection over Union (IoU) is a measure of the overlap between two bounding boxes.
# IoU is the Jaccard Similarity of the areas of two objects in a plane.
# In computer vision it is used to decide whether an object was correctly detected.
# By convention, the predicted bounding box is considered correct if the IoU is greater than 0.5.
# Increasing the threshold improves precision but worsens recall.
# If the predicted bbox and the ground-truth bbox overlapped perfectly, the IoU would be 1.
# Non-Maximum Suppression (NMS) cleans up multiple detections and keeps only one detection per object. To do so, it ...
# ... chooses the highest probability bbox and suppresses all other bboxes whose IoU with it exceeds a threshold. Therefore, in the end ...
# ... only one bbox is kept per object, likely the most precise one (and unlikely the least precise one).
def bbox_iou(box1, box2):
# Calculate the maximum and minimum intersection
inter_max_xy = torch.min(box1[:, 2:4], box2[:, 2:4])
inter_min_xy = torch.max(box1[:, 0:2], box2[:, 0:2])
# Calculate the intersection area
inter_size = torch.clamp((inter_max_xy-inter_min_xy), min = 0)
inter_area = inter_size[:, 0]*inter_size[:, 1]
# Calculate the areas
b1_area = (box1[:, 2]-box1[:, 0])*(box1[:, 3]-box1[:, 1])
b2_area = (box2[:, 2]-box2[:, 0])*(box2[:, 3]-box2[:, 1])
# Calculate IoU
iou = inter_area / (b1_area + b2_area - inter_area)
return iou
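A worked example on two hand-picked boxes in (x1, y1, x2, y2) form: they overlap in a 1x1 square, so IoU = 1 / (4 + 4 - 1) = 1/7.
# Worked IoU example
box_a = torch.tensor([[0., 0., 2., 2.]])
box_b = torch.tensor([[1., 1., 3., 3.]])
print(bbox_iou(box_a, box_b))    # tensor([0.1429])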
# Resize the image
def resize_image(source, input_dim):
# Extract the shapes
src_height, src_width = source.shape[0], source.shape[1]
# Dimensions
input_height, input_width = input_dim
# Adjust dimensions
multiple = min(input_height/src_height, input_width/src_width)
dst_height = int(src_height * multiple)
dst_width = int(src_width * multiple)
# Resize
resized_image = cv2.resize(source, (dst_width, dst_height), interpolation = cv2.INTER_CUBIC)
# Canvas for the image
canvas = np.full((input_height, input_width, 3), 128)
canvas[(input_height - dst_height) // 2:(input_height - dst_height) // 2 + dst_height,
(input_width - dst_width) // 2: (input_width - dst_width) // 2 + dst_width, :] = resized_image
return canvas
# Prepare the image for the neural network by transforming it from numpy to tensor
def prep_image(img, input_dim):
# Resize the input image
img = (resize_image(img, (input_dim, input_dim)))
# BGR -> RGB, HWC -> CHW
img = img[:, :, :: -1].transpose((2, 0, 1)).copy()
# numpy -> tensor
img = torch.from_numpy(img).float().div(255).unsqueeze(0)
return img
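A hedged check of the preprocessing chain, using a random BGR array standing in for a cv2.imread result:
# Hypothetical input: a random 300x500 BGR image
fake_bgr = np.random.randint(0, 256, (300, 500, 3), dtype = np.uint8)
tensor_img = prep_image(fake_bgr, 416)
print(tensor_img.shape)    # torch.Size([1, 3, 416, 416]): letterboxed, channel-first, scaled to [0, 1]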
# Returns the unique classes among the detections in a given image
def unique(indices):
indices_np = indices.cpu().numpy()
unique_np = np.unique(indices_np)
unique_indices = torch.from_numpy(unique_np)
indices_res = unique_indices.detach().clone()
return indices_res
# Function to make the detections
# To obtain true detections, the output is subjected to the objectness threshold and to Non-Maximum Suppression (NMS).
# Returns a tensor of shape (D x 8), in which D is the number of true detections across all images, each represented by a row.
# Each detection has the attributes: index of the image within the batch, 4 bbox coordinates, ...
# ... objectness score, score of the max confidence class, and class index.
def write_results(prediction, confidence, num_classes, nms_conf = 0.4):
# Task 1: Objectness confidence thresholding
conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
prediction *= conf_mask
# Task 2: Locate the bbox corners
box_corner = prediction.detach().clone()
box_corner[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2)
box_corner[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2)
box_corner[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2)
box_corner[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2)
prediction[:, :, :4] = box_corner[:, :, :4]
batch_size = prediction.size(0)
write = False
# Task 3: Loop through the batch's images
# Confidence threshold -> Only care about the value of the largest score class.
# Get the index of the largest score class and its score.
for ib in range(batch_size):
# Image prediction
image_prediction = prediction[ib]
# Get the indices of the max-score classes and adjust dimensions
max_conf, max_conf_indices = torch.max(image_prediction[:, 5:5+num_classes], 1)
max_conf = max_conf.float().unsqueeze(1)
max_conf_indices = max_conf_indices.float().unsqueeze(1)
image_prediction = torch.cat((image_prediction[:, :5], max_conf, max_conf_indices), 1)
# Get rid of the zero-objectness rows
non_zero_indices = torch.nonzero(image_prediction[:, 4]).squeeze()
image_prediction_ = image_prediction[non_zero_indices, :].view(-1, 7)
# If there is no prediction, go to next iteration
if image_prediction_.shape[0] == 0:
continue
# Get the classes detected in the image
img_classes = unique(image_prediction_[:, -1])
# NMS each class
for cls in img_classes:
# Get the detections attributed to the current class
cls_mask = image_prediction_ * (image_prediction_[:, -1] == cls).float().unsqueeze(1)
cls_mask_indices = torch.nonzero(cls_mask[:, -2]).squeeze()
# Get the bboxes with the same class
img_pred_classes = image_prediction_[cls_mask_indices].view(-1, 7)
# Sort the detections by objectness score, from highest to lowest
obj_conf_desc_indices = torch.sort(img_pred_classes[:, 4], descending=True)[1]
img_pred_classes = img_pred_classes[obj_conf_desc_indices]
num_detections = img_pred_classes.size(0)
# NMS
for i in range(num_detections):
# Obtain the IoU (intersection over union) between the current box and all boxes ranked below it
try:
ious = bbox_iou(img_pred_classes[i].unsqueeze(0), img_pred_classes[i+1:])
except ValueError:
break
except IndexError:
break
# Zero all detections with an IoU > threshold, i.e., too similar to the higher-scoring bbox
iou_mask = (ious < nms_conf).float().unsqueeze(1)
img_pred_classes[i+1:] *= iou_mask
# Keep the non-zero rows, i.e., the bboxes distinct from the higher-scoring one
non_zero_indices = torch.nonzero(img_pred_classes[:, 4]).squeeze()
img_pred_classes = img_pred_classes[non_zero_indices].view(-1, 7)
# Saving the predictions
# For each image with index ib and k detections, batch_indices will be a k-by-1 tensor filled with ib
batch_indices = img_pred_classes.new_full((img_pred_classes.size(0), 1), ib)
# Generate the final tuple
seq = batch_indices, img_pred_classes
# Concatenate the detections
if not write:
output = torch.cat(seq, 1)
write = True
else:
new_out = torch.cat(seq, 1)
output = torch.cat((output, new_out))
try:
return output
except:
return 0
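Tying the pieces together, a hedged sketch reusing the detections tensor from the Darknet example above: thresholding plus NMS reduces the 10647 candidate rows to a small (D x 8) tensor, or to the int 0 when nothing survives.
# Post-process the raw detections from the earlier dummy forward pass
dets = write_results(detections, confidence = 0.7, num_classes = 80, nms_conf = 0.4)
if type(dets) != int:
    print(dets.shape)    # (D, 8) rows: [batch index, x1, y1, x2, y2, objectness, class score, class index]
else:
    print('No detections above the confidence threshold')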
# Function to write the bounding box for each image
def draw_bbox(x, results, classes, colors):
# bbox coordinates
corner1 = tuple(x[1:3].int())
corner2 = tuple(x[3:5].int())
# Image
img = results[int(x[0])]
# Bbox line thickness
tl = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
# Class
cls = int(x[-1])
# Color
color = random.choice(colors)
# Label
label = f"{classes[cls]}"
# Create the rectangle (bbox) in the image using OpenCV
cv2.rectangle(img, corner1, corner2, color, thickness = tl)
# Font thickness
tf = max(tl - 1, 1)
# Font size
t_size = cv2.getTextSize(label, 0, fontScale = tl/3, thickness = tf)[0]
# Opposite corner of the label background rectangle
corner2 = corner1[0] + t_size[0]+3, corner1[1]-t_size[1]-3
# Write bbox and text
cv2.rectangle(img, corner1, corner2, color, -1)
cv2.putText(img, label, (corner1[0], corner1[1]-2), 0, tl/3, [225, 255, 255], thickness = tf, lineType = cv2.LINE_AA)
return img
def main():
# Define parameters from user inputs
args = arg_parse()
images = args.input
batch_size = int(args.batch)
confidence = float(args.confidence)
nms_thresh = float(args.nms_thresh)
start = 0
CUDA = torch.cuda.is_available()
# Load the classes
num_classes = 80
classes_arquivo = open('classes/coco.names', 'r')
classes_nomes = classes_arquivo.read().split('\n')[:-1]
classes = classes_nomes
# Load the YOLO model
print("\nLoading Model...")
modelo = Darknet(args.cfgfile)
modelo.load_weights(args.weights)
print("\nModel Loaded Successfully!")
# Define the resolution of the input images
modelo.net_info['height'] = args.resolution
# Input dimensions
input_dim = int(modelo.net_info['height'])
# Return an error if the dimensions are inadequate
assert input_dim % 32 == 0
assert input_dim > 32
# If there is a GPU, send the model to it
if CUDA:
modelo.cuda()
# Set the model to evaluation mode
modelo.eval()
# Checkpoint
read_dir = time.time()
# Check if files and directories exist
try:
# Fetch the images for detection
imglist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)]
except NotADirectoryError:
imglist = []
imglist.append(osp.join(osp.realpath('.'), images))
except FileNotFoundError:
print(f"Could not find a file named {images}")
exit()
# If the output directory does not exist, create it
if not os.path.exists(args.output):
os.makedirs(args.output)
# Checkpoint
carrega_batch = time.time()
# Load the images, reading them with opencv
imagens_carregadas = [cv2.imread(x) for x in imglist]
# Define the batches
img_batches = list(map(prep_image, imagens_carregadas, [input_dim for x in range(len(imglist))]))
# Input image dimensions
img_dim_list = [(x.shape[1], x.shape[0]) for x in imagens_carregadas]
# Convert to tensor
img_dim_list = torch.FloatTensor(img_dim_list).repeat(1, 2)
# Create the image batches and send them to the GPU, if one is available
# Each batch is a concatenation of images according to batch_size
# Account for a final, partially-filled batch
contador = 0
if(len(img_dim_list) % batch_size):
contador = 1
# Handle image batching
if batch_size != 1:
num_batches = len(imglist) // batch_size + contador
img_batches = [torch.cat((img_batches[i * batch_size:min((i+1) * batch_size, len(img_batches))]), dim = 0) for i in range(num_batches)]
if CUDA:
img_dim_list = img_dim_list.cuda()
# Control variable for the detection loop
write = 0
# Checkpoint
start_det_loop = time.time()
# Loop through the image batches and detect
for i, batch in enumerate(img_batches):
# Checkpoint
start = time.time()
# If using GPU, send the batch there
if CUDA:
batch = batch.cuda()
# Predict with the model
with torch.no_grad():
previsao = modelo(batch, CUDA)
# Store the prediction details
previsao = write_results(prediction = previsao, confidence = confidence, num_classes = num_classes, nms_conf = nms_thresh)
# Checkpoint
end = time.time()
# If type(previsao) == int, there is no prediction for this batch
if type(previsao) == int:
for img_index, image in enumerate(imglist[i*batch_size:min((i+1)*batch_size, len(imglist))]):
# Print the details about the detection attempt
print(f"{image.split('/')[-1]:20s} predicted in {(end-start)/batch_size:6.3f} seconds.")
print(f"{'Detected objects:':20s}")
print("-----------------------------------------------")
continue
# Bounding box indexes
previsao[:, 0] += i*batch_size
if not write:
output = previsao
write = 1
else:
output = torch.cat((output, previsao))
# Loop through the bbox indexes
for img_index, image in enumerate(imglist[i*batch_size:min((i+1)*batch_size, len(imglist))]):
# Generate a global index
global_img_index = i*batch_size + img_index
# Get the classes according to the indices
objs = [classes[int(x[-1])] for x in output if int(x[0]) == global_img_index]
# Print the result
print(f"{image.split('/')[-1]:20s} predicted in {(end - start)/batch_size:6.3f} seconds.")
print(f"{'Deteted objects:':20s} {' '.join(objs):s}")
# Synchronize the GPU with the CPU
if CUDA:
torch.cuda.synchronize()
# Draw the bounding boxes
try:
output
except NameError:
print("No detection was made!")
exit()
# Adjust the bbox coordinates for inserting in the image
# Filter only the images with detections
img_dim_list = torch.index_select(img_dim_list, 0, output[:, 0].long())
# Image scale factor
scaling_factors = torch.min(input_dim / img_dim_list, 1)[0].view(-1, 1)
# x coordinate of the bbox corners
output[:, [1, 3]] -= (input_dim - scaling_factors * img_dim_list[:, 0].view(-1, 1)) / 2
# y coordinate of the bbox corners
output[:, [2, 4]] -= (input_dim - scaling_factors * img_dim_list[:, 1].view(-1, 1)) / 2
# Rescale to the original size
output[:, 1:5] /= scaling_factors
# Clip the bboxes whose limits are outside the image borders
# https://pytorch.org/docs/stable/generated/torch.clamp.html
for i in range(output.shape[0]):
output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim_list[i, 0])
output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim_list[i, 1])
# Color palette
colors = pkl.load(open('palette/palette', 'rb'))
# Checkpoints
output_recast = time.time()
class_load = time.time()
draw_start = time.time()
# Iteration
list(map(lambda x: draw_bbox(x, imagens_carregadas, classes, colors), output))
# Checkpoints
draw_end = time.time()
# Paths to save the images with detected objects (os.path.basename handles Windows backslash paths and keeps a valid file extension for cv2.imwrite)
det_names = pd.Series(imglist).apply(lambda x: f"{args.output}/detected_{os.path.basename(x)}")
# Save images with detections to the paths in det_names
list(map(cv2.imwrite, det_names, imagens_carregadas))
print("\nSummary")
print("----------------------------------------------------------")
print(f"{'Task':25s}: {'Total Time (seconds)'}")
print()
print(f"{'Checking Directory':25s}: {carrega_batch - read_dir:2.3f}")
print(f"{'Loading Batch':25s}: {start_det_loop - carrega_batch:2.3f}")
print(f"{'Detecting (' + str(len(imglist)) + ' images)':25s}: {output_recast - start_det_loop:2.3f}")
print(f"{'Processing Outputs':25s}: {class_load - output_recast:2.3f}")
print(f"{'Drawing Bounding Boxes':25s}: {draw_end - draw_start:2.3f}")
print(f"{'Mean Image Loading Time':25s}: {(end - carrega_batch) / len(imglist):2.3f}")
print("----------------------------------------------------------")
# Clean the GPU cache
torch.cuda.empty_cache()
print("\nDetection Finished\n")
sys.argv = ["Object_Detection_YOLO.ipynb",
"--input", "input",
"--output", "output",
"--batch", "1",
"--confidence", "0.7",
"--nms_thresh", "0.4",
"--cfgfile", "config/yolov3.cfg",
"--weights", "weights/yolov3.weights",
"--resolution", "384"]
main()
Loading Model...

Model Loaded Successfully!
C:\Portfolio\Object_Detection_YOLO\input\airplane.jpg predicted in 0.794 seconds.
Detected objects:    aeroplane
C:\Portfolio\Object_Detection_YOLO\input\dog_bike_truck.jpg predicted in 0.855 seconds.
Detected objects:    bicycle truck dog
C:\Portfolio\Object_Detection_YOLO\input\kitten.jpg predicted in 1.294 seconds.
Detected objects:    cat
C:\Portfolio\Object_Detection_YOLO\input\pizza.jpg predicted in 1.168 seconds.
Detected objects:    pizza pizza pizza
C:\Portfolio\Object_Detection_YOLO\input\santa.jpg predicted in 0.927 seconds.
Detected objects:    person
C:\Portfolio\Object_Detection_YOLO\input\teddybear_dog.jpg predicted in 1.083 seconds.
Detected objects:    bird
C:\Portfolio\Object_Detection_YOLO\input\wine.jpeg predicted in 0.956 seconds.
Detected objects:    bottle bottle wine glass

Summary
----------------------------------------------------------
Task                     : Total Time (seconds)

Checking Directory       : 0.000
Loading Batch            : 0.423
Detecting (7 images)     : 7.080
Processing Outputs       : 0.000
Drawing Bounding Boxes   : 0.000
Mean Image Loading Time  : 1.072
----------------------------------------------------------

Detection Finished
# Imports
import glob
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline
# Inputs and outputs
inputs = sorted(glob.glob("input/*.*"))
outputs = sorted(glob.glob("output/*.*"))
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[0])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[0])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[1])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[1])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[2])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[2])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[3])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[3])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[4])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[4])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[5])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[5])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[6])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[6])
axs[1].imshow(img)
axs[1].set_axis_off()
Matheus Schmitz
LinkedIn
Github Portfolio