Skip to content

Instantly share code, notes, and snippets.

@JoshuaPurtell
Last active February 29, 2024 00:31
Show Gist options
  • Save JoshuaPurtell/c4a280ded7214456b5d61e1ef202add3 to your computer and use it in GitHub Desktop.
Save JoshuaPurtell/c4a280ded7214456b5d61e1ef202add3 to your computer and use it in GitHub Desktop.
research doc titles
from pdf2image import convert_from_bytes
from transformers import NougatImageProcessor, NougatTokenizerFast
from transformers.models.vision_encoder_decoder import VisionEncoderDecoderModel
# Checkpoint identifier handed to every `from_pretrained` call in
# `first_page_to_title_and_authors` (tokenizer, image processor and model).
MODEL_ID = "facebook/nougat-small"
def first_page_to_title_and_authors(file):
    """Suggest a PDF filename from the first line Nougat transcribes on page one.

    The first page of the PDF is rasterised, transcribed with the Nougat
    checkpoint named by ``MODEL_ID``, and the first line of the transcription
    is turned into a filename: spaces are removed, commas become underscores
    and ``.pdf`` is appended.

    Args:
        file: Binary file-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The derived filename as a string ending in ``.pdf``.

    Raises:
        IndexError: If pdf2image yields no page image for the input.
    """
    # NOTE: the tokenizer, processor and model are loaded on every call;
    # hoist these out if this function is ever called in a loop.
    tokenizer = NougatTokenizerFast.from_pretrained(MODEL_ID)
    processor = NougatImageProcessor.from_pretrained(MODEL_ID)
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)

    # Only page one is used, so ask pdf2image to render just that page
    # instead of rasterising the whole document and discarding the rest.
    first_page = convert_from_bytes(file.read(), first_page=1, last_page=1)[0]

    pixel_values = processor(first_page, return_tensors="pt").pixel_values
    outputs = model.generate(
        pixel_values,
        min_length=1,
        max_new_tokens=3000,
        # Forbid the unknown token so it never ends up in the filename.
        bad_words_ids=[[tokenizer.unk_token_id]],
    )

    # Decode the generated sequence as ONE string. Handing batch_decode a
    # flat list of token ids makes it treat every id as its own sequence, so
    # the post-processing step would run on isolated fragments rather than
    # on the page text.
    sequence = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    transcription = tokenizer.post_process_generation(sequence, fix_markdown=False)

    # The first non-blank line is taken as the title/authors line.
    first_line = transcription.lstrip("\n").split("\n")[0]
    return first_line.replace(" ", "").replace(",", "_") + ".pdf"
if __name__ == "__main__":
    # Script entry point: derive a filename for the local "paper.pdf" and
    # print it to stdout.
    with open("paper.pdf", "rb") as pdf_handle:
        suggested_name = first_page_to_title_and_authors(pdf_handle)
        print(suggested_name.strip("\n"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment