Extract Faster R-CNN features: detect objects in images and extract their Faster R-CNN features
Code to detect objects and extract their Faster R-CNN features.
First install maskrcnn-benchmark and download the model weights, using the instructions given in the code.
Then set img_dir and output_dir in main() before running the code.
The script generates 2 files for each image:
"img_name.npy" : CNN features of the detected objects
"img_name_info.npy" : bbox, object class, etc. of the detected objects
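For reference, a minimal sketch of reading the saved files back (assuming an input image named img_name.jpg and the default output folder; the info file stores a pickled dict, hence allow_pickle=True):

import numpy as np

# Hypothetical paths for an image "img_name.jpg" processed into ./output_demo
feats = np.load("output_demo/img_name.npy")  # typically (num_features, 2048) for fc6
info = np.load("output_demo/img_name_info.npy", allow_pickle=True).item()
print(feats.shape, info["num_boxes"], info["bbox"].shape, info["objects"])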
###################################################
# Steps before running the script:
# 1. Install maskrcnn-benchmark (the FRCNN model):
#    $ git clone https://gitlab.com/meetshah1995/vqa-maskrcnn-benchmark.git
#    $ cd vqa-maskrcnn-benchmark
#    $ python setup.py build
#    $ python setup.py develop
# 2. Download the pre-trained Detectron weights:
#    $ mkdir detectron_weights
#    $ wget -O detectron_weights/detectron_model.pth https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.pth
#    $ wget -O detectron_weights/detectron_model.yaml https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.yaml
# NOTE: modify the code in /content/vqa-maskrcnn-benchmark/maskrcnn_benchmark/utils/imports.py, changing PY3 to PY37
###################################################
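# One way to apply the PY3 fix noted above (a sketch, assuming the repo is
# checked out under /content and that "PY3" occurs only in that identifier):
# $ sed -i 's/PY3/PY37/g' /content/vqa-maskrcnn-benchmark/maskrcnn_benchmark/utils/imports.py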
import argparse
import glob
import os

import cv2
import numpy as np
import torch
from PIL import Image

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.layers import nms
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.utils.model_serialization import load_state_dict
class FeatureExtractor:
    MODEL_URL = (
        "https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.pth"
    )
    CONFIG_URL = (
        "https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.yaml"
    )
    MAX_SIZE = 1333
    MIN_SIZE = 800

    def __init__(self, img_dir, output_folder):
        self.args = self.get_parser().parse_args()
        self.detection_model = self._build_detection_model()

        # Override the input/output folders parsed from the command line
        self.args.image_dir = img_dir
        self.args.output_folder = output_folder
        os.makedirs(self.args.output_folder, exist_ok=True)
    # def _try_downloading_necessities(self):
    #     if self.args.model_file is None:
    #         print("Downloading model and configuration")
    #         self.args.model_file = self.MODEL_URL.split("/")[-1]
    #         self.args.config_file = self.CONFIG_URL.split("/")[-1]
    #         download_file(self.MODEL_URL)
    #         download_file(self.CONFIG_URL)
    def get_parser(self):
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--model_file", default="detectron_weights/detectron_model.pth", type=str, help="Detectron model file"
        )
        parser.add_argument(
            "--config_file", default="detectron_weights/detectron_model.yaml", type=str, help="Detectron config file"
        )
        parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
        parser.add_argument(
            "--num_features", type=int, default=50, help="Number of features to extract."
        )
        parser.add_argument(
            "--output_folder", type=str, default="./output_demo", help="Output folder"
        )
        parser.add_argument("--image_dir", default="./demo_input", type=str, help="Image directory or file")
        parser.add_argument(
            "--feature_name", type=str, help="The name of the feature to extract",
            default="fc6",
        )
        parser.add_argument(
            "--confidence_threshold", type=float, default=0,
            help="Threshold of detection confidence above which boxes will be selected"
        )
        parser.add_argument(
            "--background", action="store_true",
            help="The model will output predictions for the background class when set"
        )
        return parser
    def _build_detection_model(self):
        cfg.merge_from_file(self.args.config_file)
        cfg.freeze()

        model = build_detection_model(cfg)
        checkpoint = torch.load(self.args.model_file, map_location=torch.device("cpu"))

        load_state_dict(model, checkpoint.pop("model"))

        model.to("cuda")  # a CUDA-capable GPU is required
        model.eval()
        return model
    def _image_transform(self, path):
        img = Image.open(path)
        im = np.array(img).astype(np.float32)
        # Temporary fix for images with 4 channels (e.g. RGBA PNGs)
        if im.shape[-1] > 3:
            im = np.array(img.convert("RGB")).astype(np.float32)
        # Grayscale images: replicate the single channel three times
        # (otherwise indexing below raises "too many indices for array")
        if len(im.shape) < 3:
            im = np.repeat(im[:, :, np.newaxis], 3, axis=2)
        im = im[:, :, ::-1]  # RGB -> BGR, as expected by the Caffe-style weights
        im -= np.array([102.9801, 115.9465, 122.7717])  # subtract BGR pixel means
        im_shape = im.shape
        im_height = im_shape[0]
        im_width = im_shape[1]
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])

        # Scale based on the minimum side
        im_scale = self.MIN_SIZE / im_size_min

        # Prevent the longest side from exceeding MAX_SIZE;
        # if it would, scale down based on the maximum side instead
        if np.round(im_scale * im_size_max) > self.MAX_SIZE:
            im_scale = self.MAX_SIZE / im_size_max

        im = cv2.resize(
            im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR
        )
        img = torch.from_numpy(im).permute(2, 0, 1)

        im_info = {
            "width": im_width,
            "height": im_height
        }

        return img, im_scale, im_info
    def _process_feature_extraction(
        self, output, im_scales, im_infos, feature_name="fc6", conf_thresh=0
    ):
        batch_size = len(output[0]["proposals"])
        n_boxes_per_image = [len(boxes) for boxes in output[0]["proposals"]]
        score_list = output[0]["scores"].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        feats = output[0][feature_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feat_list = []
        info_list = []

        for i in range(batch_size):
            dets = output[0]["proposals"][i].bbox / im_scales[i]
            scores = score_list[i]
            max_conf = torch.zeros((scores.shape[0])).to(cur_device)
            conf_thresh_tensor = torch.full_like(max_conf, conf_thresh)

            start_index = 1
            # Column 0 of the scores matrix is for the background class
            if self.args.background:
                start_index = 0
            for cls_ind in range(start_index, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.25)
                max_conf[keep] = torch.where(
                    # Keep a box's class score only if it beats both the best
                    # score seen so far for that box and the confidence threshold
                    (cls_scores[keep] > max_conf[keep]) &
                    (cls_scores[keep] > conf_thresh_tensor[keep]),
                    cls_scores[keep], max_conf[keep]
                )

            sorted_scores, sorted_indices = torch.sort(max_conf, descending=True)
            num_boxes = (sorted_scores[: self.args.num_features] != 0).sum()
            keep_boxes = sorted_indices[: self.args.num_features]
            feat_list.append(feats[i][keep_boxes])
            bbox = output[0]["proposals"][i][keep_boxes].bbox / im_scales[i]
            # Predict the class label from the per-class scores
            objects = torch.argmax(scores[keep_boxes], dim=1)

            info_list.append(
                {
                    "bbox": bbox.cpu().numpy(),
                    "num_boxes": num_boxes.item(),
                    "objects": objects.cpu().numpy(),
                    "image_width": im_infos[i]["width"],
                    "image_height": im_infos[i]["height"],
                }
            )

        return feat_list, info_list
    def get_detectron_features(self, image_paths):
        img_tensor, im_scales, im_infos = [], [], []

        for image_path in image_paths:
            im, im_scale, im_info = self._image_transform(image_path)
            img_tensor.append(im)
            im_scales.append(im_scale)
            im_infos.append(im_info)

        # Image dimensions should be divisible by 32 to allow the
        # convolutions in the detector to work
        current_img_list = to_image_list(img_tensor, size_divisible=32)
        current_img_list = current_img_list.to("cuda")

        with torch.no_grad():
            output = self.detection_model(current_img_list)

        feat_list, info_list = self._process_feature_extraction(
            output, im_scales, im_infos, self.args.feature_name,
            self.args.confidence_threshold
        )

        return feat_list, info_list
    def _chunks(self, array, chunk_size):
        for i in range(0, len(array), chunk_size):
            yield array[i : i + chunk_size]

    def _save_feature(self, file_name, feature, info):
        file_base_name = os.path.basename(file_name)
        file_base_name = file_base_name.split(".")[0]
        info_file_base_name = file_base_name + "_info.npy"
        file_base_name = file_base_name + ".npy"
        np.save(
            os.path.join(self.args.output_folder, file_base_name), feature.cpu().numpy()
        )
        np.save(os.path.join(self.args.output_folder, info_file_base_name), info)
    def extract_features(self):
        image_dir = self.args.image_dir

        if os.path.isfile(image_dir):
            features, infos = self.get_detectron_features([image_dir])
            self._save_feature(image_dir, features[0], infos[0])
        else:
            files = glob.glob(os.path.join(image_dir, "*.*"))
            for chunk in self._chunks(files, self.args.batch_size):
                features, infos = self.get_detectron_features(chunk)
                for idx, file_name in enumerate(chunk):
                    self._save_feature(file_name, features[idx], infos[idx])
if __name__ == "__main__":
    # Running on train-set images
    input_dir = "directory containing images"
    output_dir = "directory to write the FRCNN features of the images"

    feature_extractor = FeatureExtractor(input_dir, output_dir)
    feature_extractor.extract_features()
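For a quick single-image test, extract_features also handles the case where image_dir points to a file rather than a directory (see the os.path.isfile branch above). A minimal sketch, with placeholder paths:

# Hypothetical single-image run; both paths are placeholders.
feature_extractor = FeatureExtractor("./demo_input/test.jpg", "./output_demo")
feature_extractor.extract_features()  # writes test.npy and test_info.npy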
Will this work on any image, if I wanted to test it out? And would I be able to feed this into the M4C Captioner?
Sorry for the late response.
Yes, it will work on any image. You would be able to feed this into the M4C Captioner; however, you will need to figure out how/where to change the configs in M4C.