Matheus Schmitz
LinkedIn
Github Portfolio
Project Goal
The idea is to take a powerful pretrained model (with a ResNet backbone) and attach new "prediction heads" to it, so that it can be fine-tuned on a labeled pedestrian dataset to perform both object detection (with Faster R-CNN) and instance segmentation (with Mask R-CNN).
Work is based on a tutorial provided by PyTorch: https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
Data Source
Penn-Fudan Database for Pedestrian Detection and Segmentation
# File manipulation imports for Google Colab
from google.colab import drive
# Mount Google Drive inside the Colab VM under /content/drive
drive.mount('/content/drive')
import os
# Work from the project folder so that the relative paths used below
# (e.g. 'PennFudanPed/...') are resolved against it
os.chdir("/content/drive/My Drive/Colab Notebooks/Instance_Segmentation_Mask_RCNN")
# Install the watermark extension quietly; it is loaded further down to report package versions
!pip install -q watermark
# Imports
import os
import numpy as np
import watermark
# Computer Vision
import torch
import torch.utils.data
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import PIL
# Stop annoying warnings
import warnings
warnings.filterwarnings("ignore")
# Track the package versions used
%reload_ext watermark
%watermark --iversions
# Configure PyTorch to use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")
# Download the model evaluation toolkit from Torch Vision
# NOTE(review): the clone is not pinned to a tag or commit, so the copied helpers follow
# whatever the repository contains at clone time - confirm they match the installed torchvision
!git clone https://github.com/pytorch/vision.git
# Import toolkit tools to be used
# The files are copied into the working directory so that `utils`, `transforms` and `engine`
# can be imported as plain modules further down
!cp vision/references/detection/utils.py .
!cp vision/references/detection/transforms.py .
!cp vision/references/detection/coco_eval.py .
!cp vision/references/detection/engine.py .
!cp vision/references/detection/coco_utils.py .
# View a sample image
PIL.Image.open('PennFudanPed/PNGImages/FudanPed00046.png')
# View a sample mask
mask = PIL.Image.open('PennFudanPed/PedMasks/FudanPed00046_mask.png')
# Define the mask's color palette
# The list is a flat sequence of RGB triples; the mask stores small integer ids
# (background first, then one id per pedestrian), which are nearly indistinguishable
# without a palette - presumably entry k colours pixel value k, confirm against PIL docs
mask.putpalette([0, 0, 0, # First color is used for the background
255, 0, 255,
255, 255, 0,
0, 255, 0,
0, 190, 255,
255, 0, 0,
])
# View
mask
# Class to prepare the dataset
class PrepareDataset(torch.utils.data.Dataset):
    """Dataset of images and instance masks yielding ``(image, target)`` pairs.

    The ``root`` folder must contain two sub-folders, ``PNGImages`` and
    ``PedMasks``. Files are paired by position after sorting both listings by
    name, so image and mask file names must sort in the same order.

    Each mask is read as an integer-coded image: the smallest value is treated
    as background and every other distinct value as one pedestrian instance.

    Args:
        root: Path of the folder holding ``PNGImages`` and ``PedMasks``.
        transforms: Optional callable invoked as ``transforms(image, target)``
            and expected to return the transformed ``(image, target)`` pair.
    """

    # Constructor
    def __init__(self, root, transforms=None):
        # Root folder with images and masks
        self.root = root
        # Image transformations to apply
        self.transforms = transforms
        # Load image and mask file names sorted by name to ensure alignment
        self.images = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    # Method to feed items to the neural network
    def __getitem__(self, idx):
        """Return the image at position ``idx`` and its detection target.

        The target is a dict with the keys ``boxes`` (float32, one
        ``[xmin, ymin, xmax, ymax]`` row per instance), ``labels`` (int64, all
        ones), ``masks`` (uint8, one binary mask per instance), ``image_id``,
        ``area`` (box area) and ``iscrowd`` (int64, all zeros).
        """
        # Paths of the image and of its mask
        image_path = os.path.join(self.root, "PNGImages", self.images[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])

        # Load the image as an RGB PIL image
        image = PIL.Image.open(image_path).convert("RGB")
        # Load the mask untouched (no colour conversion, the pixel values are
        # the instance ids) and convert it to a np.array
        mask = np.array(PIL.Image.open(mask_path))

        # Instances are codified with different values, hence each value = 1 instance
        obj_ids = np.unique(mask)
        # np.unique returns sorted values; the first (smallest) one is taken
        # to be the image's background and is removed
        obj_ids = obj_ids[1:]
        # Split the coded mask into a stack of binary masks, one per instance
        masks = mask == obj_ids[:, None, None]

        # Get the bounding box coordinates for each mask
        num_pedestrians = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([np.min(cols), np.min(rows), np.max(cols), np.max(rows)])

        # Convert the bounding boxes to a tensor. The reshape keeps the result
        # two-dimensional even when there are no instances (shape (0, 4)), so
        # the column indexing used for the area below does not fail.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # There is a single class (pedestrian), so every instance gets label 1
        labels = torch.ones((num_pedestrians,), dtype=torch.int64)
        # Convert masks to tensors
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        # Use the dataset index as the image id
        image_id = torch.tensor([idx])
        # Area of each bounding box (height * width)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # Mark every instance as a regular (non-crowd) instance
        iscrowd = torch.zeros((num_pedestrians,), dtype=torch.int64)

        # Define the target variable
        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        # Apply any specified image transformations
        if self.transforms is not None:
            image, target = self.transforms(image, target)
        return image, target

    # Method to calculate dataset size
    def __len__(self):
        """Return the number of image files found under ``PNGImages``."""
        return len(self.images)
# Preprocess images to test the class created above
# No transforms are passed here, so each item is the RGB PIL image plus its target dict
images = PrepareDataset('PennFudanPed/')
# View one item
images[46]
# Imports
import utils
import transforms as T
def transform_images(isTrain):
    """Build the transform pipeline applied to each (image, target) pair.

    The image is always converted to a tensor; when ``isTrain`` is truthy a
    random horizontal flip with probability 0.5 is appended as augmentation.
    """
    # NOTE(review): T is the transforms.py copied from the torchvision
    # references; which helpers it offers depends on the cloned revision.
    steps = [T.ToTensor()]
    if isTrain:
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)
# Build two datasets over the same files: one with the training-time
# augmentation and one without. They must be kept under different names,
# otherwise the second assignment discards the augmented dataset.
train_dataset = PrepareDataset('PennFudanPed/',
                               transform_images(isTrain=True))
# Test images are loaded without augmentation
images = PrepareDataset('PennFudanPed/',
                        transform_images(isTrain=False))

# Shuffle the indices once; both subsets are cut from this single permutation
# so the training and test samples never overlap
indexes = torch.randperm(len(images)).tolist()

# Select all but 50 images as training data (augmented dataset)
train_images = torch.utils.data.Subset(train_dataset, indexes[:-50])
# Then use the 50 leftover images for test (non-augmented dataset)
test_images = torch.utils.data.Subset(images, indexes[-50:])

# Create dataloaders for the images
train_data_loader = torch.utils.data.DataLoader(train_images,
                                                batch_size=4,
                                                shuffle=True,
                                                num_workers=0,
                                                collate_fn=utils.collate_fn)

# The evaluation loader must read the held-out subset, not the training one
test_data_loader = torch.utils.data.DataLoader(test_images,
                                               batch_size=1,
                                               shuffle=False,
                                               num_workers=0,
                                               collate_fn=utils.collate_fn)
# Imports
from engine import train_one_epoch, evaluate
# Function to build the model
def build_model(num_classes=2, hidden_layer=256):
    """Return a pretrained Mask R-CNN whose prediction heads are replaced.

    Args:
        num_classes: Number of output classes including the background.
            Defaults to 2 (pedestrian and background).
        hidden_layer: Number of hidden channels of the new mask predictor.
            Defaults to 256.

    Returns:
        The model with freshly initialised box and mask predictors; all other
        layers keep their pretrained weights.
    """
    # Load an instance segmentation model with pretrained weights.
    # NOTE(review): recent torchvision releases deprecate `pretrained=` in
    # favour of `weights=`; kept unchanged to match the installed version - confirm.
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    roi_heads = model.roi_heads

    # Replace the pretrained bounding box head with a new one that takes the
    # same number of input features but predicts `num_classes` classes
    in_features_bbox = roi_heads.box_predictor.cls_score.in_features
    roi_heads.box_predictor = FastRCNNPredictor(in_features_bbox, num_classes)

    # Replace the pretrained instance mask head in the same way
    in_features_mask = roi_heads.mask_predictor.conv5_mask.in_channels
    roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                 hidden_layer,
                                                 num_classes)
    return model
# Run the function and build the model
model = build_model()
# Then move the model to the selected device (GPU when available, otherwise CPU)
model.to(device)
# Define the optimizer
# Only parameters that require gradients are handed to the optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
momentum=0.9, weight_decay=0.0005)
# Add a learning rate scheduler
# StepLR multiplies the learning rate by `gamma` every `step_size` calls to step()
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=3,
gamma=0.1)
num_epochs = 15

# Training loop: each epoch trains, advances the LR schedule, then evaluates
for epoch in range(num_epochs):
    # One pass over the training loader, logging every 10 iterations
    train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=10)

    # Let the scheduler update the learning rate for the next epoch
    lr_scheduler.step()

    # Report metrics on the evaluation loader
    evaluate(model, test_data_loader, device=device)
# Put the model in evaluation mode before running inference
model.eval()

# Take one sample from the test subset; its target dict is not needed here
img1, _ = test_images[6]

# Original image: scale the tensor to 0-255, move the channel axis last and hand it to PIL
# (assumes img1 is a channel-first float tensor with values in [0, 1] - confirm against the transforms)
PIL.Image.fromarray(img1.mul(255).permute(1, 2, 0).byte().numpy())

# Predict on the test image; the model takes a list of images and no gradients are needed
with torch.no_grad():
    predictions = model([img1.to(device)])

# Shape of the prediction masks
# The first dimension is the number of masks (pedestrians) found
predictions[0]['masks'].shape
# Each expression below renders one predicted mask: [k, 0] selects mask k and its
# first (presumably only) channel, which is scaled to 0-255 and moved to the CPU for PIL.
# NOTE(review): the indices 0-7 are hard-coded, so this assumes the model returned at
# least eight masks for this image; with fewer, the indexing raises an IndexError.
# Prediction 1
PIL.Image.fromarray(predictions[0]['masks'][0, 0].mul(255).byte().cpu().numpy())
# Prediction 2
PIL.Image.fromarray(predictions[0]['masks'][1, 0].mul(255).byte().cpu().numpy())
# Prediction 3
PIL.Image.fromarray(predictions[0]['masks'][2, 0].mul(255).byte().cpu().numpy())
# Prediction 4
PIL.Image.fromarray(predictions[0]['masks'][3, 0].mul(255).byte().cpu().numpy())
# Prediction 5
PIL.Image.fromarray(predictions[0]['masks'][4, 0].mul(255).byte().cpu().numpy())
# Prediction 6
PIL.Image.fromarray(predictions[0]['masks'][5, 0].mul(255).byte().cpu().numpy())
# Prediction 7
PIL.Image.fromarray(predictions[0]['masks'][6, 0].mul(255).byte().cpu().numpy())
# Prediction 8
PIL.Image.fromarray(predictions[0]['masks'][7, 0].mul(255).byte().cpu().numpy())
Matheus Schmitz
LinkedIn
Github Portfolio
# Export this notebook to an HTML file
!jupyter nbconvert --to html Instance_Segmentation_Mask_RCNN.ipynb