Matheus Schmitz
LinkedIn
Github Portfolio
Intro
Implementation of the YOLO v3 architecture for object detection in images.
Configuration File: https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg
Work Based On: https://blog.paperspace.com/how-to-implement-a-yolo-object-detector-in-pytorch/
Source Github: https://github.com/ayooshkathuria/YOLO_v3_tutorial_from_scratch
!pip install opencv-python==4.2.0.34
Requirement already satisfied: opencv-python==4.2.0.34 in c:\users\matheus\anaconda3\lib\site-packages (4.2.0.34)
Requirement already satisfied: numpy>=1.14.5 in c:\users\matheus\appdata\roaming\python\python37\site-packages (from opencv-python==4.2.0.34) (1.19.5)
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Ignore warnings
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter('ignore')
# Function to read the configuration file
# Returns the blocks used to build the neural network
def parse_config_file(config_file):
# Open the file for reading
arquivo = open(config_file, 'r')
# Read the lines and convert to a list
# Remove blank lines
# Remove comments
# Remove white spaces
linhas = arquivo.read().split('\n')
linhas = [x for x in linhas if len(x) > 0]
linhas = [x for x in linhas if x[0] != '#']
linhas = [x.rstrip().lstrip() for x in linhas]
# Dictionary and list of hyperparameter blocks
block = {}
blocks = []
# Loop through the lines
for linha in linhas:
# Get the type (class) of the block of hyperparameters
if linha[0] == "[":
# We are starting a new block, so first add the previous/current one to the list of blocks, then start the new one
if len(block) != 0:
blocks.append(block)
block = {}
# Name the block type accordingly (minus the [] brackets)
block["type"] = linha[1:-1].rstrip()
# If/while inside a block, get the hyperparameter and value to use
else:
key, value = linha.split("=")
block[key.rstrip()] = value.lstrip()
# Need an extra line to append the last block (because usually we append the block before starting the next one)
blocks.append(block)
return blocks
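To make the parser's output concrete, here is a minimal, hedged check (the file name sample.cfg is hypothetical): each [section] header starts a new block, and every key=value pair below it lands in that block's dictionary.
# Minimal sketch of parse_config_file on a tiny two-block configuration
sample_cfg = "[net]\nheight=416\nwidth=416\n\n[convolutional]\nfilters=32\nactivation=leaky"
with open('sample.cfg', 'w') as f:
    f.write(sample_cfg)
print(parse_config_file('sample.cfg'))
# Expected output:
# [{'type': 'net', 'height': '416', 'width': '416'},
#  {'type': 'convolutional', 'filters': '32', 'activation': 'leaky'}]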
# Function to create the network modules
# Returns PyTorch objects
def create_modules(blocks):
# Info about the neural network input parameters
net_info = blocks[0]
# Create an object to build the modules
# https://pytorch.org/docs/master/generated/torch.nn.ModuleList.html
lista_modulos = nn.ModuleList()
# Number of color channels in the images
num_filters = 3
# List used by the route layers to keep a record of each layer's output depth (number of filters)
output_filters = []
# Iterate through the blocks and create the neural network's modules (layers)
for index, x in enumerate(blocks[1:]):
# Create the current module (sequence of steps)
# https://pytorch.org/docs/master/generated/torch.nn.Sequential.html
module = nn.Sequential()
# Check the block type
if x['type'] == 'convolutional':
# Extract the hyperparameters for a convolutional layer
activation = x['activation']
filters = int(x['filters'])
padding = int(x['pad'])
kernel_size = int(x['size'])
stride = int(x['stride'])
# Add batch normalization if the layer has it
try:
batch_normalize = int(x['batch_normalize'])
bias = False
except:
batch_normalize = 0
bias = True
# Adjust the padding
if padding:
pad = (kernel_size - 1)//2
else:
pad = 0
# Create the convolutional layer
# https://pytorch.org/docs/master/generated/torch.nn.Conv2d.html
conv = nn.Conv2d(in_channels = num_filters,
out_channels = filters,
kernel_size = kernel_size,
stride = stride,
padding = pad,
bias = bias)
# Add the convolutional layer to the list of modules
module.add_module(f'conv_{index}', conv)
# Add the batch normalization layer
# https://pytorch.org/docs/master/generated/torch.nn.BatchNorm2d.html
if batch_normalize:
bn = nn.BatchNorm2d(num_features = filters)
module.add_module(f'batch_norm_{index}', bn)
# Check the activation type and add an activation layer (YOLO v3 uses LeakyReLU everywhere except the linear activations before the detection layers, which need no module)
# https://pytorch.org/docs/master/generated/torch.nn.LeakyReLU.html
if activation == 'leaky':
activn = nn.LeakyReLU(negative_slope = 0.1, inplace = True)
module.add_module(f'leaky_{index}', activn)
# Upsampling layer - increases the spatial resolution of the feature maps (by a factor of 2 here)
# https://pytorch.org/docs/stable/generated/torch.nn.Upsample.html
elif x['type'] == 'upsample':
stride = int(x['stride'])
upsample = nn.Upsample(scale_factor = 2, mode = 'bilinear')
module.add_module(f'upsample_{index}', upsample)
# Route layer - used to calculate the output depth (filters) resulting from concatenation. Similar to concat layers.
# When the attribute has a single value, the layer outputs the feature maps of the layer indexed by that value.
# E.g. if route = -4, then the route layer will have the feature maps of a layer 4 steps behind it.
# When the attribute has two values, the layer returns a concatenation of the feature maps of both layers by index number.
# E.g. if route = -1, 61, the layer will have the feature maps of the previous layer (-1) and the 61st layer, ...
# ... with both feature maps concatenated along the depth dimension.
# https://github.com/pjreddie/darknet/issues/545
# https://github.com/AlexeyAB/darknet/issues/487#issuecomment-374902735
# https://github.com/AlexeyAB/darknet/issues/279#issuecomment-397248821
# https://github.com/AlexeyAB/darknet#how-to-train-to-detect-your-custom-objects
elif x['type'] == 'route':
x['layers'] = x['layers'].split(',')
# Route "start"
start = int(x['layers'][0])
# Check if there are two attributes/values
# If yes, set the second attribute as the route "end"
try:
end = int(x['layers'][1])
# If not, set the end to zero
except:
end = 0
# Calculate relative positions
if start > 0:
start = start - index
if end > 0:
end = end - index
# Create the layer
route = EmptyLayer()
# Add the layer to the neural network module
module.add_module(f'route_{index}', route)
# Extract the filters
if end < 0:
filters = output_filters[index + start] + output_filters[index + end]
else:
filters = output_filters[index+start]
# Shortcut layer - same as a skip layer in ResNet.
# E.g. if the hyperparameter is -3, then the shortcut layer's output is obtained by merging the feature maps ...
# ... from the previous layer and the layer 3 steps behind the shortcut layer.
elif x['type'] == 'shortcut':
# Create the layer
shortcut = EmptyLayer()
# Add layer to the model
module.add_module(f'shortcut_{index}', shortcut)
# YOLO layer with anchor detection
# The YOLO layer is the detection layer. The anchors attribute lists 9 anchors in total, yet only those indexed by ...
# ... the mask attribute are used. E.g. if the mask value is 0, 1, 2, that means the first, second and third anchors ...
# ... will be used. This makes sense given that each cell in the detection layer predicts 3 boxes. In total, we have ...
# ... detection in three scales, resulting in 9 anchors.
# Anchors: Predetermined set of bounding boxes with specific height-width ratios.
# Mask: List of anchor IDs which the layer is responsible for predicting.
# Num: total number of anchors.
# YOLO v3 predicts a predetermined set of anchors, which have initial sizes (height, width), some of which (the one ...
# ... closest to the object size) will be resized to the object's size.
# Each YOLO layer must know all anchors, but is responsible for only a subset of them.
# The mask tells the layer which anchors it should use for predicting. The first YOLO layer is assigned anchors 6,7,8, ...
# ... the second gets 3,4,5, and the third gets 0,1,2.
elif x['type'] == 'yolo':
# Extract the mask values
mask = list(map(int, x['mask'].split(',')))
# Extract the anchor values
anchors = list(map(int, x['anchors'].split(',')))
anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]
# Filter the list of anchors using the mask
anchors = [anchors[i] for i in mask]
# Create the anchor detection layer
detection = DetectAnchor(anchors)
# Add layer to the model
module.add_module(f'detection_{index}', detection)
# Load the list of modules (layer groups), filters and output filters
lista_modulos.append(module)
num_filters = filters
output_filters.append(filters)
return (net_info, lista_modulos)
# Function summary:
# YOLO has 5 layer types: Convolutional, Upsample, Route, Shortcut and YOLO.
# All customization of a YOLO model is done by adjusting hyperparameter values in the configuration file.
# The configuration file describes the YOLO network layout block by block.
# The YOLO architecture is also known as Darknet.
# Function to make predictions
# Takes a detection feature map and transforms it into a 2D tensor (per image), in which each row corresponds to ...
# ... the attributes of one bounding box (anchor).
# Arguments:
# prediction (tensor): previous output
# input_dim (int): dimension of the input image
# anchors (list(tuple)): anchors used in the YOLO detection layer
# num_classes (int): total number of classes
# CUDA (bool): optional argument to define whether or not to use GPU
# Function returns:
# prediction (tensor): redimensioned (3D tensor) prediction output of the current YOLO layer.
# The three dimensions are: [batch size, number of bounding boxes, bounding box attributes]
def make_predictions(prediction, input_dim, anchors, num_classes, CUDA = True):
# Hyperparameters for the predictions
batch_size = prediction.size(0)
stride = input_dim // prediction.size(2)
grid_size = input_dim // stride
bbox_attributes = 5 + num_classes
num_anchors = len(anchors)
# Adjust the prediction object's shape
prediction = prediction.view(batch_size, bbox_attributes * num_anchors, grid_size * grid_size)
# Transpose the matrix
prediction = prediction.transpose(1, 2).contiguous()
# New shape adjust
prediction = prediction.view(batch_size, grid_size * grid_size * num_anchors, bbox_attributes)
# Proportionally resize the anchors based on stride
# [(,),(,),(,)] -> tensor([[,],[,],[,]]) size([3,2])
anchors = [(anchor[0]/stride, anchor[1]/stride) for anchor in anchors]
# Sigmoid transformation: centre_X, centre_Y, objectness score
prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])
# Add the grids for the coordinate centers
grid = np.arange(grid_size)
a, b = np.meshgrid(grid, grid)
# Adjust the shapes
x_offset = torch.FloatTensor(a).view(-1, 1)
y_offset = torch.FloatTensor(b).view(-1, 1)
# Verify whether to use GPU
if CUDA:
x_offset = x_offset.cuda()
y_offset = y_offset.cuda()
prediction = prediction.cuda()
# Concatenate x and y for the prediction
x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
prediction[:, :, :2] += x_y_offset
# Convert the object with the anchor values to a float tensor
anchors = torch.FloatTensor(anchors)
# If using GPU, send the anchors to the GPU too
if CUDA:
anchors = anchors.cuda()
# Tile the anchors across every grid cell
anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)
# Apply the log-space transform: exponentiate the predicted width/height and scale by the anchors
prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * anchors
# Sigmoid activation for the class scores
prediction[:, :, 5:5 + num_classes] = torch.sigmoid(prediction[:, :, 5:5 + num_classes])
# Rescale the bbox coordinates to the size of the input image
prediction[:, :, :4] *= stride
return prediction
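A hedged shape check with random data illustrates the transform: for COCO's 80 classes with 3 anchors per cell, a 13x13 detection head yields 13 * 13 * 3 = 507 rows of 5 + 80 = 85 attributes each. The anchor values below are the ones yolov3.cfg assigns to the coarsest scale.
# Hypothetical shape check with random data (no trained weights involved)
dummy_head = torch.randn(1, 3 * (5 + 80), 13, 13)       # raw output of a detection convolution
coarse_anchors = [(116, 90), (156, 198), (373, 326)]    # anchors masked for the 13x13 YOLO layer
out = make_predictions(dummy_head, input_dim = 416, anchors = coarse_anchors, num_classes = 80, CUDA = False)
print(out.shape)    # torch.Size([1, 507, 85])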
# Placeholder class used for the route (concat) and shortcut layers of the neural network
# The actual concatenation/addition happens in Darknet.forward, which simplifies changes to the input data
class EmptyLayer(nn.Module):
def __init__(self):
super(EmptyLayer, self).__init__()
# Anchor detection class
class DetectAnchor(nn.Module):
def __init__(self, anchors):
super(DetectAnchor, self).__init__()
self.anchors = anchors
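With the placeholder and detection classes defined, the parser and module builder can be chained; a hedged sketch (the cfg path matches the one used later in this notebook):
# Parse the configuration file and build the PyTorch module list
blocks = parse_config_file('config/yolov3.cfg')
net_info, lista_modulos = create_modules(blocks)
print(net_info['type'], net_info['height'])    # the [net] block holds the input hyperparameters
print(len(lista_modulos))                      # one nn.Sequential per layer block in the cfg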
# YOLO architecture, also referred to as Darknet in the paper's documentation
# Holds the network's layers
class Darknet(nn.Module):
# Class constructor
def __init__(self, config_file):
# Initialize the class
super(Darknet, self).__init__()
# Read the configuration file
self.blocks = parse_config_file(config_file)
# Create the neural network's modules
self.net_info, self.lista_modulos = create_modules(self.blocks)
# Forward propagation
def forward(self, x, CUDA):
# Modules (network's layers)
modulos = self.blocks[1:]
# Cache of all layer outputs, needed for the route/shortcut layers
outputs = {}
# Track if the first detection layer was found
write = 0
# Loop through the modules
for i, modulo in enumerate(modulos):
# Module type
tipo_modulo = (modulo['type'])
# Convolution and upsample layers
if tipo_modulo == 'convolutional' or tipo_modulo == 'upsample':
# Define the layer
x = self.lista_modulos[i](x)
# Route layers: concatenate two feature maps from other layers
elif tipo_modulo == 'route':
# Layers
camadas = modulo['layers']
camadas = [int(a) for a in camadas]
if camadas[0] > 0:
camadas[0] = camadas[0] - i
if len(camadas) == 1:
x = outputs[i + (camadas[0])]
else:
if camadas[1] > 0:
camadas[1] = camadas[1] - i
feature_map_1 = outputs[i + camadas[0]]
feature_map_2 = outputs[i + camadas[1]]
# Concatenate along the depth dimension
x = torch.cat((feature_map_1, feature_map_2), dim = 1)
# Shortcut layer
elif tipo_modulo == 'shortcut':
# Origin layer
from_layer = int(modulo['from'])
# Addition
x = outputs[i - 1] + outputs[i + from_layer]
# YOLO layer
elif tipo_modulo == 'yolo':
# Neural network's hyperparameters
anchors = self.lista_modulos[i][0].anchors
input_dim = int(self.net_info['height'])
num_classes = int(modulo['classes'])
# Make predictions (aka detect objects in the image)
x = make_predictions(prediction = x,
input_dim = input_dim,
anchors = anchors,
num_classes = num_classes,
CUDA = CUDA)
# If this is the first detection layer, then x represents the detections
if not write:
detections = x
write = 1
# If this is not the first detection layer, concatenate the predictions with the ones from previous layers
else:
detections = torch.cat((detections, x), 1)
outputs[i] = x
return detections
# Load the weights from the pre-trained model (inference only; no backward pass is implemented in this notebook)
def load_weights(self, weight_file):
# Open the weights file
fp = open(weight_file, 'rb')
# The first 5 values are header information
# 1. Major version number
# 2. Minor Version Number
# 3. Subversion number
# 4,5. Images seen by the network (during training)
header = np.fromfile(fp, dtype = np.int32, count=5)
self.header = torch.from_numpy(header)
self.seen = self.header[3]
# Load the weights
weights = np.fromfile(fp, dtype = np.float32)
# Control parameter to track where in the weights file we are
ptr = 0
# Loop through the module types
for i in range(len(self.lista_modulos)):
# Extract the module type
module_type = self.blocks[i+1]['type']
# Load the weights for the convolutional layers
if module_type == 'convolutional':
model = self.lista_modulos[i]
try:
batch_normalize = int(self.blocks[i+1]['batch_normalize'])
except:
batch_normalize = 0
conv = model[0]
if (batch_normalize):
# Load the weights for the batch normalization layer
bn = model[1]
num_bn_biases = bn.bias.numel()
bn_biases = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
bn_weights = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
bn_running_mean = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
bn_running_var = torch.from_numpy(weights[ptr:ptr+num_bn_biases])
ptr += num_bn_biases
# Adjust dimensions
bn_biases = bn_biases.view_as(bn.bias.data)
bn_weights = bn_weights.view_as(bn.weight.data)
bn_running_mean = bn_running_mean.view_as(bn.running_mean)
bn_running_var = bn_running_var.view_as(bn.running_var)
# Copy the data to the model
bn.bias.data.copy_(bn_biases)
bn.weight.data.copy_(bn_weights)
bn.running_mean.copy_(bn_running_mean)
bn.running_var.copy_(bn_running_var)
else:
# Number of biases
num_biases = conv.bias.numel()
# Load the weights
conv_biases = torch.from_numpy(weights[ptr:ptr+num_biases])
ptr += num_biases
# Reshape the loaded weights according to the model weights' dimensions
conv_biases = conv_biases.view_as(conv.bias.data)
# Copy the data to the model
conv.bias.data.copy_(conv_biases)
# Load the convolutional layer's weights (same with or without batch_normalization)
num_weights = conv.weight.numel()
conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
ptr += num_weights
# Reshape the loaded weights according to the model weights' dimensions
conv_weights = conv_weights.view_as(conv.weight.data)
# Copy the data to the model
conv.weight.data.copy_(conv_weights)
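A minimal end-to-end sketch of the class, assuming the cfg and weights files sit at the paths used later in this notebook:
# Build the network, load the pretrained weights, and run a dummy forward pass on CPU
modelo = Darknet('config/yolov3.cfg')
modelo.load_weights('weights/yolov3.weights')
modelo.net_info['height'] = 416    # match the dummy input below
modelo.eval()
with torch.no_grad():
    detections = modelo(torch.randn(1, 3, 416, 416), CUDA = False)
print(detections.shape)    # torch.Size([1, 10647, 85]): (13*13 + 26*26 + 52*52) * 3 boxes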
# Py Data Stack
import numpy as np
import pandas as pd
# System manipulation
import os
import os.path as osp
import time
import argparse
import random
import pickle as pkl
# Image Processing
import cv2
import torch
import torch.nn as nn
from torch.autograd import Variable
# Package versions
import watermark
%load_ext watermark
%watermark --iversions -v
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Python implementation: CPython
Python version       : 3.7.10
IPython version      : 7.23.1

pandas   : 1.2.3
cv2      : 4.2.0
argparse : 1.1
sys      : 3.7.10 (default, Feb 26 2021, 13:06:18) [MSC v.1916 64 bit (AMD64)]
watermark: 2.1.0
torch    : 1.5.1+cu101
numpy    : 1.19.5
# Function to parse command line arguments
def arg_parse():
parser = argparse.ArgumentParser(description = 'YOLO Object Detector')
parser.add_argument("--input", dest = 'input', help = "Directory with images/videos for object detection.", default = "input", type = str)
parser.add_argument("--output", dest = 'output', help = "Directory to store images/videos with detected objects.", default = "output", type = str)
parser.add_argument("--batch", dest = "batch", help = "Batch size.", default = 1)
parser.add_argument("--confidence", dest = "confidence", help = "Confidense threshold to filter predictions.", default = 0.7)
parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS threshold.", default = 0.4)
parser.add_argument("--cfgfile", dest = 'cfgfile', help = "YOLO configuration file.", default = "config/yolov3.cfg", type = str)
parser.add_argument("--weights", dest = 'weights', help = "File with pretrained weights.", default = "pesos/yolov3.weights", type = str)
parser.add_argument("--resolution", dest = 'resolution', help = "Resolution of the input images. Increase to improve accuracy. Decrease to speed up detection.", default = "384", type = str)
return parser.parse_args()
# Function to calculate the IoU of two bounding boxes.
# The Intersection over Union (IoU) is a measure of the overlap between two bounding boxes.
# IoU is the Jaccard Similarity of the areas of two objects in a plane.
# In computer vision it is used to decide whether an object was correctly detected.
# By convention, the predicted bounding box is considered correct if the IoU is greater than 0.5.
# Increasing the threshold improves precision but worsens recall.
# If the predicted bbox and the ground-truth bbox overlapped perfectly, the IoU would be 1.
# Non-Maximum Suppression (NMS) cleans up multiple detections and keeps only one detection per object. To do so, it ...
# ... chooses the highest probability bbox and suppresses all other bboxes whose IoU with it exceeds a threshold. Therefore, in the end ...
# ... only one bbox is kept per object, likely the most precise one (and unlikely the least precise one).
def bbox_iou(box1, box2):
# Calculate the maximum and minimum intersection
inter_max_xy = torch.min(box1[:, 2:4], box2[:, 2:4])
inter_min_xy = torch.max(box1[:, 0:2], box2[:, 0:2])
# Calculate the intersection area
inter_size = torch.clamp((inter_max_xy-inter_min_xy), min = 0)
inter_area = inter_size[:, 0]*inter_size[:, 1]
# Calculate the areas
b1_area = (box1[:, 2]-box1[:, 0])*(box1[:, 3]-box1[:, 1])
b2_area = (box2[:, 2]-box2[:, 0])*(box2[:, 3]-box2[:, 1])
# Calculate IoU
iou = inter_area / (b1_area + b2_area - inter_area)
return iou
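A worked example on two hand-picked boxes in (x1, y1, x2, y2) form: they overlap in a 1x1 square, so IoU = 1 / (4 + 4 - 1) = 1/7.
# Worked IoU example
box_a = torch.tensor([[0., 0., 2., 2.]])
box_b = torch.tensor([[1., 1., 3., 3.]])
print(bbox_iou(box_a, box_b))    # tensor([0.1429])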
# Resize the image
def resize_image(source, input_dim):
# Extract the shapes
src_height, src_width = source.shape[0], source.shape[1]
# Dimensions
input_height, input_width = input_dim
# Adjust dimensions
multiple = min(input_height/src_height, input_width/src_width)
dst_height = int(src_height * multiple)
dst_width = int(src_width * multiple)
# Resize
resized_image = cv2.resize(source, (dst_width, dst_height), interpolation = cv2.INTER_CUBIC)
# Canvas for the image
canvas = np.full((input_height, input_width, 3), 128)
canvas[(input_height - dst_height) // 2:(input_height - dst_height) // 2 + dst_height,
(input_width - dst_width) // 2: (input_width - dst_width) // 2 + dst_width, :] = resized_image
return canvas
# Prepare the image for the neural network by transforming it from numpy to tensor
def prep_image(img, input_dim):
# Resize the input image
img = (resize_image(img, (input_dim, input_dim)))
# BGR -> RGB, HWC -> CHW
img = img[:, :, :: -1].transpose((2, 0, 1)).copy()
# numpy -> tensor
img = torch.from_numpy(img).float().div(255).unsqueeze(0)
return img
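A hedged check of the preprocessing chain, using a random BGR array standing in for a cv2.imread result:
# Hypothetical input: a random 300x500 BGR image
fake_bgr = np.random.randint(0, 256, (300, 500, 3), dtype = np.uint8)
tensor_img = prep_image(fake_bgr, 416)
print(tensor_img.shape)    # torch.Size([1, 3, 416, 416]): letterboxed, channel-first, scaled to [0, 1]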
# Returns the unique classes among the detections in a given image
def unique(indices):
indices_np = indices.cpu().numpy()
unique_np = np.unique(indices_np)
unique_indices = torch.from_numpy(unique_np)
indices_res = unique_indices.detach().clone()
return indices_res
# Function to make the detections
# To obtain true detections, the output is subjected to the objectness threshold and to Non-Maximum Suppression (NMS).
# Returns a tensor of shape (D x 8), in which D is the number of true detections across all images, each represented by a row.
# Each detection has the attributes: index of the image within the batch, 4 bbox coordinates, ...
# ... objectness score, score of the max confidence class, and class index.
def write_results(prediction, confidence, num_classes, nms_conf = 0.4):
# Task 1: Objectness confidence thresholding
conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
prediction *= conf_mask
# Task 2: Locate the bbox corners
box_corner = prediction.detach().clone()
box_corner[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2)
box_corner[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2)
box_corner[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2)
box_corner[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2)
prediction[:, :, :4] = box_corner[:, :, :4]
batch_size = prediction.size(0)
write = False
# Task 3: Loop through the batch's images
# Confidence threshold -> Only care about the value of the largest score class.
# Get the index of the largest score class and its score.
for ib in range(batch_size):
# Image prediction
image_prediction = prediction[ib]
# Get the indices of the max-score classes and adjust dimensions
max_conf, max_conf_indices = torch.max(image_prediction[:, 5:5+num_classes], 1)
max_conf = max_conf.float().unsqueeze(1)
max_conf_indices = max_conf_indices.float().unsqueeze(1)
image_prediction = torch.cat((image_prediction[:, :5], max_conf, max_conf_indices), 1)
# Get rid of the zero-objectness rows
non_zero_indices = torch.nonzero(image_prediction[:, 4]).squeeze()
image_prediction_ = image_prediction[non_zero_indices, :].view(-1, 7)
# If there is no prediction, go to next iteration
if image_prediction_.shape[0] == 0:
continue
# Get the classes detected in the image
img_classes = unique(image_prediction_[:, -1])
# NMS each class
for cls in img_classes:
# Get the detections attributed to the current class
cls_mask = image_prediction_ * (image_prediction_[:, -1] == cls).float().unsqueeze(1)
cls_mask_indices = torch.nonzero(cls_mask[:, -2]).squeeze()
# Get the bboxes with the same class
img_pred_classes = image_prediction_[cls_mask_indices].view(-1, 7)
# Sort the detections by objectness score, from highest to lowest
obj_conf_desc_indices = torch.sort(img_pred_classes[:, 4], descending=True)[1]
img_pred_classes = img_pred_classes[obj_conf_desc_indices]
num_detections = img_pred_classes.size(0)
# NMS
for i in range(num_detections):
# Obtain the IoU (intersection over union) between the current box and all boxes ranked below it
try:
ious = bbox_iou(img_pred_classes[i].unsqueeze(0), img_pred_classes[i+1:])
except ValueError:
break
except IndexError:
break
# Zero all detections with an IoU > threshold, i.e., too similar to the higher-scoring bbox
iou_mask = (ious < nms_conf).float().unsqueeze(1)
img_pred_classes[i+1:] *= iou_mask
# Keep the non-zero rows, i.e., the bboxes distinct from the higher-scoring one
non_zero_indices = torch.nonzero(img_pred_classes[:, 4]).squeeze()
img_pred_classes = img_pred_classes[non_zero_indices].view(-1, 7)
# Saving the predictions
# For each image with index ib and k detections, batch_indices will be a k-by-1 tensor filled with ib
batch_indices = img_pred_classes.new_full((img_pred_classes.size(0), 1), ib)
# Generate the final tuple
seq = batch_indices, img_pred_classes
# Concatenate the detections
if not write:
output = torch.cat(seq, 1)
write = True
else:
new_out = torch.cat(seq, 1)
output = torch.cat((output, new_out))
try:
return output
except:
return 0
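Tying the pieces together, a hedged sketch reusing the detections tensor from the Darknet example above: thresholding plus NMS reduces the 10647 candidate rows to a small (D x 8) tensor, or to the int 0 when nothing survives.
# Post-process the raw detections from the earlier dummy forward pass
dets = write_results(detections, confidence = 0.7, num_classes = 80, nms_conf = 0.4)
if type(dets) != int:
    print(dets.shape)    # (D, 8) rows: [batch index, x1, y1, x2, y2, objectness, class score, class index]
else:
    print('No detections above the confidence threshold')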
# Function to write the bounding box for each image
def draw_bbox(x, results, classes, colors):
# bbox coordinates
corner1 = tuple(x[1:3].int())
corner2 = tuple(x[3:5].int())
# Image
img = results[int(x[0])]
# Bbox line thickness
tl = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
# Class
cls = int(x[-1])
# Color
color = random.choice(colors)
# Label
label = f"{classes[cls]}"
# Create the rectangle (bbox) in the image using OpenCV
cv2.rectangle(img, corner1, corner2, color, thickness = tl)
# Font thickness
tf = max(tl - 1, 1)
# Font size
t_size = cv2.getTextSize(label, 0, fontScale = tl/3, thickness = tf)[0]
# Opposite corner of the label background rectangle
corner2 = corner1[0] + t_size[0]+3, corner1[1]-t_size[1]-3
# Write bbox and text
cv2.rectangle(img, corner1, corner2, color, -1)
cv2.putText(img, label, (corner1[0], corner1[1]-2), 0, tl/3, [225, 255, 255], thickness = tf, lineType = cv2.LINE_AA)
return img
def main():
# Define parameters from user inputs
args = arg_parse()
images = args.input
batch_size = int(args.batch)
confidence = float(args.confidence)
nms_thresh = float(args.nms_thresh)
start = 0
CUDA = torch.cuda.is_available()
# Load the classes
num_classes = 80
classes_arquivo = open('classes/coco.names', 'r')
classes_nomes = classes_arquivo.read().split('\n')[:-1]
classes = classes_nomes
# Load the YOLO model
print("\nLoading Model...")
modelo = Darknet(args.cfgfile)
modelo.load_weights(args.weights)
print("\nModel Loaded Successfully!")
# Define the resolution of the input images
modelo.net_info['height'] = args.resolution
# Input dimensions
input_dim = int(modelo.net_info['height'])
# Return an error if the dimensions are inadequate
assert input_dim % 32 == 0
assert input_dim > 32
# If there is a GPU, send the model to it
if CUDA:
modelo.cuda()
# Set the model to evaluation mode
modelo.eval()
# Checkpoint
read_dir = time.time()
# Check if files and directories exist
try:
# Fetch the images for detection
imglist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)]
except NotADirectoryError:
imglist = []
imglist.append(osp.join(osp.realpath('.'), images))
except FileNotFoundError:
print(f"Could not find a file named {images}")
exit()
# If the output directory does not exist, create it
if not os.path.exists(args.output):
os.makedirs(args.output)
# Checkpoint
carrega_batch = time.time()
# Load the images, reading them with opencv
imagens_carregadas = [cv2.imread(x) for x in imglist]
# Define the batches
img_batches = list(map(prep_image, imagens_carregadas, [input_dim for x in range(len(imglist))]))
# Input image dimensions
img_dim_list = [(x.shape[1], x.shape[0]) for x in imagens_carregadas]
# Convert to tensor
img_dim_list = torch.FloatTensor(img_dim_list).repeat(1, 2)
# Create the image batches and send them to the GPU, if one is available
# Each batch is a concatenation of images according to batch_size
# Account for a final, partially-filled batch
contador = 0
if(len(img_dim_list) % batch_size):
contador = 1
# Handle image batching
if batch_size != 1:
num_batches = len(imglist) // batch_size + contador
img_batches = [torch.cat((img_batches[i * batch_size:min((i+1) * batch_size, len(img_batches))]), dim = 0) for i in range(num_batches)]
if CUDA:
img_dim_list = img_dim_list.cuda()
# Control variable for the detection loop
write = 0
# Checkpoint
start_det_loop = time.time()
# Loop through the image batches and detect
for i, batch in enumerate(img_batches):
# Checkpoint
start = time.time()
# If using GPU, send the batch there
if CUDA:
batch = batch.cuda()
# Predict with the model
with torch.no_grad():
previsao = modelo(batch, CUDA)
# Store the prediction details
previsao = write_results(prediction = previsao, confidence = confidence, num_classes = num_classes, nms_conf = nms_thresh)
# Checkpoint
end = time.time()
# If type(previsao) == int, there is no prediction for this batch
if type(previsao) == int:
for img_index, image in enumerate(imglist[i*batch_size:min((i+1)*batch_size, len(imglist))]):
# Print the details about the detection attempt
print(f"{image.split('/')[-1]:20s} predicted in {(end-start)/batch_size:6.3f} seconds.")
print(f"{'Detected objects:':20s}")
print("-----------------------------------------------")
continue
# Bounding box indexes
previsao[:, 0] += i*batch_size
if not write:
output = previsao
write = 1
else:
output = torch.cat((output, previsao))
# Loop through the bbox indexes
for img_index, image in enumerate(imglist[i*batch_size:min((i+1)*batch_size, len(imglist))]):
# Generate a global index
global_img_index = i*batch_size + img_index
# Get the classes according to the indices
objs = [classes[int(x[-1])] for x in output if int(x[0]) == global_img_index]
# Print the result
print(f"{image.split('/')[-1]:20s} predicted in {(end - start)/batch_size:6.3f} seconds.")
print(f"{'Deteted objects:':20s} {' '.join(objs):s}")
# Synchronize the GPU with the CPU
if CUDA:
torch.cuda.synchronize()
# Draw the bounding boxes
try:
output
except NameError:
print("No detection was made!")
exit()
# Adjust the bbox coordinates for inserting in the image
# Filter only the images with detections
img_dim_list = torch.index_select(img_dim_list, 0, output[:, 0].long())
# Image scale factor
scaling_factors = torch.min(input_dim / img_dim_list, 1)[0].view(-1, 1)
# x coordinate of the bbox corners
output[:, [1, 3]] -= (input_dim - scaling_factors * img_dim_list[:, 0].view(-1, 1)) / 2
# y coordinate of the bbox corners
output[:, [2, 4]] -= (input_dim - scaling_factors * img_dim_list[:, 1].view(-1, 1)) / 2
# Rescale to the original size
output[:, 1:5] /= scaling_factors
# Clip the bboxes whose limits are outside the image borders
# https://pytorch.org/docs/stable/generated/torch.clamp.html
for i in range(output.shape[0]):
output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim_list[i, 0])
output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim_list[i, 1])
# Color palette
colors = pkl.load(open('palette/palette', 'rb'))
# Checkpoints
output_recast = time.time()
class_load = time.time()
draw_start = time.time()
# Iteration
list(map(lambda x: draw_bbox(x, imagens_carregadas, classes, colors), output))
# Checkpoints
draw_end = time.time()
# Paths to save the images with detected objects (os.path.basename handles Windows backslash paths and keeps a valid file extension for cv2.imwrite)
det_names = pd.Series(imglist).apply(lambda x: f"{args.output}/detected_{os.path.basename(x)}")
# Save images with detections to the paths in det_names
list(map(cv2.imwrite, det_names, imagens_carregadas))
print("\nSummary")
print("----------------------------------------------------------")
print(f"{'Task':25s}: {'Total Time (seconds)'}")
print()
print(f"{'Checking Directory':25s}: {carrega_batch - read_dir:2.3f}")
print(f"{'Loading Batch':25s}: {start_det_loop - carrega_batch:2.3f}")
print(f"{'Detecting (' + str(len(imglist)) + ' images)':25s}: {output_recast - start_det_loop:2.3f}")
print(f"{'Processing Outputs':25s}: {class_load - output_recast:2.3f}")
print(f"{'Drawing Bounding Boxes':25s}: {draw_end - draw_start:2.3f}")
print(f"{'Mean Image Loading Time':25s}: {(end - carrega_batch) / len(imglist):2.3f}")
print("----------------------------------------------------------")
# Clean the GPU cache
torch.cuda.empty_cache()
print("\nDetection Finished\n")
sys.argv = ["Object_Detection_YOLO.ipynb",
"--input", "input",
"--output", "output",
"--batch", "1",
"--confidence", "0.7",
"--nms_thresh", "0.4",
"--cfgfile", "config/yolov3.cfg",
"--weights", "weights/yolov3.weights",
"--resolution", "384"]
main()
Loading Model...

Model Loaded Successfully!
C:\Portfolio\Object_Detection_YOLO\input\airplane.jpg predicted in 0.794 seconds.
Detected objects:    aeroplane
C:\Portfolio\Object_Detection_YOLO\input\dog_bike_truck.jpg predicted in 0.855 seconds.
Detected objects:    bicycle truck dog
C:\Portfolio\Object_Detection_YOLO\input\kitten.jpg predicted in 1.294 seconds.
Detected objects:    cat
C:\Portfolio\Object_Detection_YOLO\input\pizza.jpg predicted in 1.168 seconds.
Detected objects:    pizza pizza pizza
C:\Portfolio\Object_Detection_YOLO\input\santa.jpg predicted in 0.927 seconds.
Detected objects:    person
C:\Portfolio\Object_Detection_YOLO\input\teddybear_dog.jpg predicted in 1.083 seconds.
Detected objects:    bird
C:\Portfolio\Object_Detection_YOLO\input\wine.jpeg predicted in 0.956 seconds.
Detected objects:    bottle bottle wine glass

Summary
----------------------------------------------------------
Task                     : Total Time (seconds)

Checking Directory       : 0.000
Loading Batch            : 0.423
Detecting (7 images)     : 7.080
Processing Outputs       : 0.000
Drawing Bounding Boxes   : 0.000
Mean Image Loading Time  : 1.072
----------------------------------------------------------

Detection Finished
# Imports
import glob
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline
# Inputs and outputs
inputs = sorted(glob.glob("input/*.*"))
outputs = sorted(glob.glob("output/*.*"))
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[0])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[0])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[1])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[1])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[2])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[2])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[3])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[3])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[4])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[4])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[5])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[5])
axs[1].imshow(img)
axs[1].set_axis_off()
# Plot original and detected images
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))
# Plot original image
img = mpimg.imread(inputs[6])
axs[0].imshow(img)
axs[0].set_axis_off()
# Plot detected image
img = mpimg.imread(outputs[6])
axs[1].imshow(img)
axs[1].set_axis_off()
Matheus Schmitz
LinkedIn
Github Portfolio