@ji1kang
Created March 2, 2023 05:44
HuggingFace VisualBERT Demo with DataLoader
import os

import numpy as np
import pandas as pd

# torch
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

# ours
from helper import label_cols  # preprocessing labels

# visualbert - you need to download the huggingface visualbert example code from
# https://github.com/huggingface/transformers/tree/main/examples/research_projects/visual_bert
from utils import Config
from modeling_frcnn import GeneralizedRCNN
from processing_image import Preprocess

class ImageProcessor:
    def __init__(self, device='cuda'):
        frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
        frcnn_cfg.MODEL.DEVICE = device
        self.device = device
        self.frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
        self.frcnn_cfg = frcnn_cfg
        self.image_preprocess = Preprocess(frcnn_cfg)

    def get_visual_embeddings(self, image_path):
        # run frcnn
        images, sizes, scales_yx = self.image_preprocess(image_path)
        output_dict = self.frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=self.frcnn_cfg.max_detections,
            return_tensors="pt",
        )
        features = output_dict.get("roi_features").detach().cpu()
        return features

class TrainDataset(Dataset):
    def __init__(self, base_path, data_path, max_len, word2idx=None, device='cuda:1'):
        """Dataset for training steps

        Args:
            base_path: the directory for data
            data_path: a .csv file with columns as follows:
                ['path', 'preprocessed_title', 'preprocessed_text', 'majority_vote']
            max_len: maximum length of texts
            word2idx: a dictionary mapping a word to its index in a word embedding
                (only needed if you call preprocess_text)
        """
        self.data_path = data_path
        self.data = pd.read_csv(data_path).values
        self.base_path = base_path
        self.max_len = max_len
        self.word2idx = word2idx
        # feature extractors
        self.tokenizer = BertTokenizer.from_pretrained(
            "bert-base-uncased", cache_dir="cache")
        self.visual_extractor = ImageProcessor(device=device)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Args:
            idx: the index of the sample
        Return:
            training data
        """
        # load meta data (customize this code for your dataset)
        file_path, title_text, ocr_text, label = self.data[idx]

        # load image
        image_path = os.path.join(self.base_path, file_path)
        visual_embeds = self.visual_extractor.get_visual_embeddings(image_path)
        # drop the batch dimension added by the frcnn so the DataLoader can add its own
        visual_embeds = visual_embeds.squeeze(0)
        visual_token_type_ids = torch.ones(
            visual_embeds.shape[:-1], dtype=torch.long)
        visual_attention_mask = torch.ones(
            visual_embeds.shape[:-1], dtype=torch.float)

        # load text (customize this code for your dataset)
        # NOTE: the BERT tokenizer works on the raw strings directly;
        # preprocess_text below is only needed for a word-embedding pipeline.
        text = f'title: {title_text} body: {ocr_text}'
        # pad/truncate to a fixed length so the default collate function can
        # stack samples into a batch
        inputs = self.tokenizer(
            text, padding="max_length", max_length=self.max_len,
            truncation=True, return_tensors="pt")
        input_ids = inputs["input_ids"].squeeze(0)
        token_type_ids = inputs["token_type_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # load label (customize this code for your dataset)
        label_index = label_cols.index(label)
        label_index = torch.tensor(label_index).long()

        return (input_ids, token_type_ids, attention_mask,
                visual_embeds, visual_token_type_ids, visual_attention_mask,
                label_index)

    def preprocess_text(self, text):
        """Map words to word-embedding indices (customize this function for your dataset)

        Args:
            text: a string of text
        Return:
            (indices, length): a fixed-size array of word indices and the
            number of words found in word2idx
        """
        words = [x.lower() for x in str(text).split()]
        idx = np.zeros(self.max_len)
        start = 0
        for word in words:
            try:
                idx[start] = self.word2idx[word]
                start += 1
                if start == self.max_len:
                    break
            except (KeyError, TypeError):
                continue
        return idx, start

# run visualbert with the train dataloader
seed = 2022
max_len = 100
image_path = 'your-path'
train_path = 'your-path'
train_path = train_path + 'your-path.csv'

train_dataset = TrainDataset(
    base_path=image_path,
    data_path=train_path,
    max_len=max_len)

batch_size = 32
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)

for data in train_dataloader:
    (input_ids, token_type_ids, attention_mask,
     visual_embeds, visual_token_type_ids, visual_attention_mask,
     train_labels) = data
    # `model` is a VisualBERT model (see the sketch below)
    outputs = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        visual_embeds=visual_embeds,
        visual_token_type_ids=visual_token_type_ids,
        visual_attention_mask=visual_attention_mask,
    )
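
The snippet above leaves `model` undefined. A minimal sketch of one way to instantiate it, assuming the `uclanlp/visualbert-vqa-coco-pre` checkpoint and a separate classification head (both assumptions, not part of the original gist):

from transformers import VisualBertModel

# assumed checkpoint; swap in whichever VisualBERT weights fit your task
model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
# a task head, e.g. torch.nn.Linear(model.config.hidden_size, len(label_cols))
# applied to outputs.pooler_output, would turn this into a classifier over label_cols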
@guanhdrmq
Hi Jiwon, how are you? I am trying to reproduce the HuggingFace VisualBERT VQA demo, but I get lower validation accuracy. Here is my code:
import os

from attacks import fgsm

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["CURL_CA_BUNDLE"] = ""

import torch

torch.cuda.empty_cache()
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

print("Use GPU", torch.cuda.is_available())  # check whether a GPU is available
print("GPU:", torch.cuda.device_count())  # number of GPUs
torch.cuda.current_device()  # index of the current GPU, 0
torch.cuda.get_device_name(0)  # print the GPU name
torch.cuda.memory_summary(device=None, abbreviated=False)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# https://huggingface.co/models?other=visual_bert


# ====================== VQA: read data ======================

# Read questions
import json

# Check the path and open the JSON file
f_read_questions = open(
    "./multimodal_data/vqa2/v2_OpenEnded_mscoco_val2014_questions.json"
)

# Return the JSON object as a dictionary
data_questions = json.load(f_read_questions)
print(data_questions.keys())

questions = data_questions["questions"]
print("Number of questions:", len(questions))

from os import listdir
from os.path import isfile, join

# root directory where all images are stored
root = "./multimodal_data/vqa2/val2014"
file_names = [f for f in listdir(root) if isfile(join(root, f))]

import re
from typing import Optional

filename_re = re.compile(r".*(\d{12}).((jpg)|(png))")

# source: https://github.com/allenai/allennlp-models/blob/a36aed540e605c4293c25f73d6674071ca9edfc3/allennlp_models/vision/dataset_readers/vqav2.py#L141
def id_from_filename(filename: str) -> Optional[int]:
    match = filename_re.fullmatch(filename)
    if match is None:
        return None
    return int(match.group(1))

filename_to_id = {root + "/" + file: id_from_filename(file) for file in file_names}
id_to_filename = {v: k for k, v in filename_to_id.items()}


# Read annotations
f_read_annotations = open("./multimodal_data/vqa2/v2_mscoco_val2014_annotations.json")

# Return the JSON object as a dictionary
data_annotations = json.load(f_read_annotations)
print(data_annotations.keys())

# Show answers
annotations = data_annotations["annotations"]
print("Number of annotations:", len(annotations))

from transformers import VisualBertConfig

config = VisualBertConfig.from_pretrained(
    "./pretrained/visualBERT/uclanlp-visualbert-vqa"
)

from tqdm.notebook import tqdm

def get_score(count: int) -> float:
    return min(1.0, count / 3)

# turn each annotation's answers into soft labels over config.label2id
for annotation in tqdm(annotations):
    answers = annotation["answers"]
    answer_count = {}
    for answer in answers:
        answer_ = answer["answer"]
        answer_count[answer_] = answer_count.get(answer_, 0) + 1
    labels = []
    scores = []
    for answer in answer_count:
        if answer not in config.label2id:
            continue
        labels.append(config.label2id[answer])
        score = get_score(answer_count[answer])
        scores.append(score)
    annotation["labels"] = labels
    annotation["scores"] = scores


# ===================================================================

class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""

    def __init__(
        self, questions, annotations, tokenizer, image_preprocess, frcnn, frcnn_cfg
    ):
        self.questions = questions
        self.annotations = annotations
        self.tokenizer = tokenizer
        self.image_preprocess = image_preprocess
        self.frcnn = frcnn
        self.frcnn_cfg = frcnn_cfg

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # answer
        annotation = self.annotations[idx]
        # question
        questions = self.questions[idx]
        image_path = id_to_filename[annotation["image_id"]]
        image_path = image_path.replace("./multimodal_data/vqa2/val2014/.", "", 1)
        text = questions["question"]

        print("question", text)

        images, sizes, scales_yx = self.image_preprocess(image_path)
        output_dict = self.frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=self.frcnn_cfg.max_detections,
            return_tensors="pt",
        )

        # Very important that the boxes are normalized
        feature = output_dict.get("roi_features")
        normalized_boxes = output_dict.get("normalized_boxes")

        inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=25,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        inputs.update(
            {
                "visual_embeds": feature,
                "visual_attention_mask": torch.ones(
                    feature.shape[:-1], dtype=torch.float
                ),
                "visual_token_type_ids": torch.ones(
                    feature.shape[:-1], dtype=torch.long
                ),
                # "output_attentions": False
            }
        )

        # remove batch dimension
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.squeeze()

        # add labels
        labels = annotation["labels"]
        # print("label candidate:", labels)
        # scores = torch.tensor(annotation['scores'])
        scores = annotation["scores"]

        targets = torch.zeros(len(config.id2label), dtype=torch.float)
        for label, score in zip(labels, scores):
            # print(f"Setting target at index {label} to {score}")
            targets[label] = score
        inputs["labels"] = targets

        inputs["text"] = text
        return inputs

if __name__ == "__main__":
    # image input
    from visualbert.processing_image import Preprocess
    from visualbert.visualizing_image import SingleImageViz
    from visualbert.modeling_frcnn import GeneralizedRCNN
    from visualbert.utils import Config

    frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    frcnn = GeneralizedRCNN.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg
    )
    image_preprocess = Preprocess(frcnn_cfg)

    # text input
    from transformers import (
        VisualBertForQuestionAnswering,
        AutoTokenizer,
        BertTokenizerFast,
    )

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    model = VisualBertForQuestionAnswering.from_pretrained(
        "uclanlp/visualbert-vqa",
        num_labels=len(config.id2label),
        id2label=config.id2label,
        label2id=config.label2id,
        output_hidden_states=True,
    )

    # if cfg.use_multi_gpu:
    # model = nn.DataParallel(model)
    # model = model.to(device=device)
    model.to(device)
    model.eval()

    dataset = VQADataset(
        questions=questions[:100],
        annotations=annotations[:100],
        tokenizer=tokenizer,
        image_preprocess=image_preprocess,
        frcnn=frcnn,
        frcnn_cfg=frcnn_cfg,
    )

    test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    correct = 0.0
    # counter
    total = 0

    # loss_function = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

    # from visualbert import utils
    # VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
    # vqa_answers = utils.get_data(VQA_URL)

    for batch in tqdm(test_dataloader):
        batch_text = batch.copy()
        if "text" in batch:
            del batch["text"]

        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        logits = outputs.logits  # [batch_size, 3129]
        _, pre = torch.max(logits, 1)
        _, target = torch.max(batch["labels"], 1)

        print("prediction:", pre)
        print("target:", target)

        # print("prediction from VisualBert VQA:", vqa_answers[pre])
        # print("Predicted answer:", model.config.id2label[pre.item()])
        # TODO label not right
        # print("Target answer:", model.config.id2label[target.item()])

        correct += (pre == target).sum()
        total = total + 1
        print(total)
        print("==============================================================")

    final_acc = correct / float(len(test_dataloader.dataset))
    print("Accuracy of test: %f %%" % (100 * float(final_acc)))

The only thing I changed is label2id and id2label in config.json, which I copied from the LXMERT demo. Do you have any opinion about this problem? Much appreciated.
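
One thing worth checking (a sketch, not a verified fix): the logits of the uclanlp/visualbert-vqa head only line up with your targets if label2id/id2label follow the same answer ordering the checkpoint was trained with, so a mapping copied from the LXMERT demo may not match. Building the mapping from the VQA answer vocabulary already referenced in the commented-out code above is one way to test that assumption:

# Assumption: the ordering of answers_vqa.txt matches the checkpoint's output head.
# If accuracy changes with this mapping, the copied config was the problem.
from visualbert import utils

VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
vqa_answers = utils.get_data(VQA_URL)  # list of answer strings
id2label = {i: ans for i, ans in enumerate(vqa_answers)}
label2id = {ans: i for i, ans in enumerate(vqa_answers)}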
