
@tezansahu
Created February 12, 2022 19:27
from transformers import AutoFeatureExtractor, AutoTokenizer

# `MultimodalCollator`, `MultimodalVQAModel`, and `device` are defined in the
# accompanying snippets of this series.

def createMultimodalVQACollatorAndModel(text='bert-base-uncased', image='google/vit-base-patch16-224-in21k'):
    # Initialize the correct text tokenizer and image feature extractor, and use them to create the collator
    tokenizer = AutoTokenizer.from_pretrained(text)
    preprocessor = AutoFeatureExtractor.from_pretrained(image)
    multimodal_collator = MultimodalCollator(tokenizer=tokenizer, preprocessor=preprocessor)

    # Initialize the multimodal model with the appropriate weights from the pretrained text and image models
    multimodal_model = MultimodalVQAModel(pretrained_text_name=text, pretrained_image_name=image).to(device)

    return multimodal_collator, multimodal_model
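The factory above pairs a text tokenizer with an image feature extractor inside a single collator that produces one batch dict for the model. A minimal, self-contained sketch of that collator pattern (the class name and the stand-in tokenizer/preprocessor callables here are hypothetical; the real `MultimodalCollator` is defined elsewhere in the post and wraps Hugging Face components):

```python
class SimpleMultimodalCollator:
    """Merges tokenized questions and extracted image features into one batch dict."""

    def __init__(self, tokenizer, preprocessor):
        self.tokenizer = tokenizer
        self.preprocessor = preprocessor

    def __call__(self, batch):
        # Split the raw examples into their text and image parts
        questions = [item["question"] for item in batch]
        images = [item["image"] for item in batch]
        # Run each modality through its own preprocessor...
        encoded_text = self.tokenizer(questions)      # e.g. {"input_ids": ...}
        encoded_imgs = self.preprocessor(images)      # e.g. {"pixel_values": ...}
        # ...and merge the results into a single dict of model inputs
        return {**encoded_text, **encoded_imgs}


# Stand-in tokenizer/preprocessor for illustration only (hypothetical):
fake_tokenizer = lambda texts: {"input_ids": [[len(t)] for t in texts]}
fake_preprocessor = lambda imgs: {"pixel_values": imgs}

collator = SimpleMultimodalCollator(fake_tokenizer, fake_preprocessor)
batch = collator([
    {"question": "what is this?", "image": [0.1, 0.2]},
    {"question": "how many?", "image": [0.3, 0.4]},
])
```

The upshot of this design is that the `Trainer` (or any `DataLoader`) only ever sees one callable, so swapping the text or image backbone only changes the names passed to the factory, not the training loop.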