# danielhavir/mathpix2gpt.py

## mathpix2gpt.py
import json
import os
import re
import sys
import time

import openai
import requests
import tiktoken
from termcolor import colored

# The OpenAI API key is read from ~/.openai. A context manager is used so the
# file handle is closed right away instead of being left to the garbage collector.
with open(os.path.expanduser('~/.openai')) as key_file:
    openai.api_key = key_file.read().strip()

# Model selection: set USE_GPT_4_32K to False to use the gpt-3.5 model instead.
USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
# Maximum number of document tokens placed in the first chat message.
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
# When True, the Mathpix Markdown is used as-is (LaTeX markup is kept).
IS_SCIENTIFIC = True

# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
# the second line containing your app_key
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()


def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload a PDF to the Mathpix ``v3/pdf`` endpoint for conversion.

    Args:
        file_path: Path of the PDF file to upload.
        output_format: Conversion format to request; it is sent as the single
            enabled key of ``conversion_formats`` inside ``options_json``.

    Returns:
        The ``pdf_id`` value from the Mathpix response, or ``None`` when the
        response is not JSON or does not contain a ``pdf_id``.
    """
    url = 'https://api.mathpix.com/v3/pdf'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    # Serialise with json.dumps so the format name is always quoted and
    # escaped correctly instead of being spliced into a hand-written string.
    options = {
        'options_json': json.dumps({'conversion_formats': {output_format: True}})
    }

    with open(file_path, 'rb') as pdf_file:
        print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
        response = requests.post(url, headers=headers,
                                 files={'file': pdf_file}, data=options)

    try:
        response_data = response.json()
    except ValueError:
        # The body was not JSON (for example an HTML error page).
        print("Error: Unable to send PDF to Mathpix")
        print(f"Mathpix returned HTTP {response.status_code} with a non-JSON body")
        return None

    if isinstance(response_data, dict) and 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id

    print("Error: Unable to send PDF to Mathpix")
    # Show what the API answered so the failure can be diagnosed.
    print(f"Mathpix response: {response_data}")
    return None


def wait_for_processing(pdf_id, poll_interval=5, timeout=None):
    """Poll Mathpix until the PDF identified by ``pdf_id`` has been processed.

    Args:
        pdf_id: Identifier returned by the upload request.
        poll_interval: Seconds to sleep between two status requests.
        timeout: Maximum number of seconds to wait, or ``None`` (the default)
            to wait indefinitely.

    Returns:
        ``True`` once the reported status is ``'completed'``. ``False`` when
        the status is ``'error'``, when the status request itself fails
        (HTTP error or non-JSON body), or when ``timeout`` elapses.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        response = requests.get(url, headers=headers)
        if not response.ok:
            # A failed status request carries no usable status; polling it
            # again would just loop forever.
            print("Error: Unable to process PDF")
            print(f"Mathpix returned HTTP {response.status_code}")
            return False
        try:
            response_data = response.json()
        except ValueError:
            print("Error: Unable to process PDF")
            print("Mathpix returned a non-JSON status response")
            return False

        status = response_data.get('status', None)
        if status == 'completed':
            print("Processing complete")
            return True
        if status == 'error':
            print("Error: Unable to process PDF")
            return False

        if deadline is not None and time.monotonic() >= deadline:
            print("Error: Timed out waiting for Mathpix to process PDF")
            return False
        print(f"Status: {status}, waiting for processing to complete")
        time.sleep(poll_interval)


def download_processed_file(pdf_id, file_format, output_path):
    """Download a converted document from Mathpix and write it to disk.

    Args:
        pdf_id: Identifier of the processed PDF.
        file_format: Extension appended to the URL (for example ``'mmd'``).
        output_path: Destination file; it is written in binary mode.

    Raises:
        requests.HTTPError: If Mathpix answers with a 4xx/5xx status. Nothing
            is written to ``output_path`` in that case.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }

    response = requests.get(url, headers=headers)
    # Check the status before touching the filesystem; otherwise the body of
    # an error response would be saved as if it were the converted document.
    response.raise_for_status()
    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")


def clear_terminal():
    """Clear the terminal by running ``cls`` on Windows and ``clear`` elsewhere."""
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)


def print_messages(messages):
    """Print each chat message as ``Role: content``, coloured by role.

    Assistant messages are printed in blue and every other role in white.

    Args:
        messages: Iterable of dicts that have ``'role'`` and ``'content'`` keys.
    """
    # Iterate directly: the position of a message is never used.
    for message in messages:
        role = message['role']
        color = 'blue' if role == 'assistant' else 'white'
        print(colored(f"{role.capitalize()}: {message['content']}", color))


def chat_gpt(messages):
    """Send the conversation to the chat model and record its reply.

    Args:
        messages: List of chat message dicts; it is modified in place.

    Returns:
        The same ``messages`` list, with the assistant's reply appended.
    """
    completion = openai.ChatCompletion.create(model=DEFAULT_MODEL,
                                              messages=messages)
    reply = {
        "role": "assistant",
        "content": completion.choices[0].message.content,
    }
    messages.append(reply)
    return messages


def start_question_answering(input_path):
    """Run an interactive terminal Q&A session about a Markdown document.

    The file is read, truncated to ``TOKEN_LIMIT`` tokens, and sent to the
    model as the first user message. The loop then clears the terminal,
    reprints the conversation, reads a question and appends the model's
    answer. It ends when the user types ``exit``, closes stdin, or presses
    Ctrl-C.

    Args:
        input_path: Path of the Markdown file to discuss.
    """
    print("Using model: %s" % DEFAULT_MODEL)

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(input_path, encoding='utf-8') as fh:
        text = fh.read().strip()

    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    tokens = tokenizer.encode(text)
    if len(tokens) > TOKEN_LIMIT:
        # Tell the user that the model will only see part of the document.
        print(f"Document has {len(tokens)} tokens, truncating to {TOKEN_LIMIT}")
        text = tokenizer.decode(tokens[:TOKEN_LIMIT])

    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)
    while True:
        clear_terminal()
        print_messages(messages)

        try:
            question = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session like an explicit "exit".
            print()
            break
        if question.lower() == 'exit':
            break
        if not question:
            # Nothing was asked; prompt again instead of sending an empty message.
            continue

        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)


def _simplify_mmd(mmd):
    """Return ``mmd`` with image lines, section macros and escapes simplified."""
    # Drop lines that start with an image placeholder.
    kept_lines = [line for line in mmd.split('\n') if not line.startswith('![]')]
    mmd = '\n'.join(kept_lines)
    # Replace \section{Title} with "# Title". Only the brace that closes the
    # section is consumed (one level of nested braces is allowed in the
    # title), so braces elsewhere in the document are left alone.
    mmd = re.sub(r'\\section\{((?:[^{}]|\{[^{}]*\})*)\}', r'# \1', mmd)
    # Remove the backslash in front of $, %, ( and ).
    for escaped in ('\\$', '\\%', '\\(', '\\)'):
        mmd = mmd.replace(escaped, escaped[1:])
    return mmd


def main():
    """Convert the PDF named on the command line and start a chat about it.

    The Mathpix output is stored next to the input as ``<name>.md`` and the
    text given to the model as ``<name>.simple.md``. Either file is reused
    when it already exists, so a document is only converted once.
    """
    if len(sys.argv) < 2:
        print("Usage: python mathpix2gpt.py <input_pdf_path>")
        return

    input_pdf_path = sys.argv[1]
    # Swap only the real extension. str.replace would rewrite every '.pdf' in
    # the path and would return the input path itself when there is none.
    base_path, _ = os.path.splitext(input_pdf_path)
    output_mmd_path = base_path + '.md'
    output_simplemd_path = base_path + '.simple.md'

    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        if not (pdf_id and wait_for_processing(pdf_id)):
            # Nothing was produced, so there is no Markdown file to open.
            print("Error: Mathpix conversion failed, cannot start the chat")
            return
        download_processed_file(pdf_id, 'mmd', output_mmd_path)

    if not os.path.exists(output_simplemd_path):
        with open(output_mmd_path, 'r', encoding='utf-8') as mmd_file:
            mmd = mmd_file.read()
        if not IS_SCIENTIFIC:
            # There's too much LaTeX style escaping for most PDFs, so remove
            # some of it. It is kept when the document is a scientific paper.
            mmd = _simplify_mmd(mmd)
        with open(output_simplemd_path, 'w', encoding='utf-8') as simplemd_file:
            simplemd_file.write(mmd)

    start_question_answering(output_simplemd_path)


if __name__ == '__main__':
    main()
	import requests
	import time
	import os
	import sys
	import openai
	import tiktoken
	from termcolor import colored

	# The OpenAI API key is read from ~/.openai. A context manager is used so the
	# file handle is closed right away instead of being left to the garbage collector.
	with open(os.path.expanduser('~/.openai')) as key_file:
	    openai.api_key = key_file.read().strip()

	# Model selection: set USE_GPT_4_32K to False to use the gpt-3.5 model instead.
	USE_GPT_4_32K = True
	DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
	# Maximum number of document tokens placed in the first chat message.
	TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
	# When True, the Mathpix Markdown is used as-is (LaTeX markup is kept).
	IS_SCIENTIFIC = True

	# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
	# the second line containing your app_key.
	# The body of this "with" statement is indented again so that it parses.
	with open(os.path.expanduser('~/.mathpix')) as f:
	    APP_ID = f.readline().strip()
	    APP_KEY = f.readline().strip()


	def send_pdf_to_mathpix(file_path, output_format='mmd'):
	    """Upload a PDF to the Mathpix ``v3/pdf`` endpoint for conversion.

	    Args:
	        file_path: Path of the PDF file to upload.
	        output_format: Conversion format to request; it is sent as the
	            single enabled key of ``conversion_formats`` in ``options_json``.

	    Returns:
	        The ``pdf_id`` value from the Mathpix response, or ``None`` when the
	        response is not JSON or does not contain a ``pdf_id``.
	    """
	    url = 'https://api.mathpix.com/v3/pdf'
	    headers = {
	        'app_id': APP_ID,
	        'app_key': APP_KEY
	    }
	    # Serialise with json.dumps so the format name is always quoted and
	    # escaped correctly instead of being spliced into a hand-written string.
	    options = {
	        'options_json': json.dumps({'conversion_formats': {output_format: True}})
	    }

	    with open(file_path, 'rb') as pdf_file:
	        print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
	        response = requests.post(url, headers=headers,
	                                 files={'file': pdf_file}, data=options)

	    try:
	        response_data = response.json()
	    except ValueError:
	        # The body was not JSON (for example an HTML error page).
	        print("Error: Unable to send PDF to Mathpix")
	        print(f"Mathpix returned HTTP {response.status_code} with a non-JSON body")
	        return None

	    if isinstance(response_data, dict) and 'pdf_id' in response_data:
	        pdf_id = response_data['pdf_id']
	        print(f"PDF ID: {pdf_id}")
	        return pdf_id

	    print("Error: Unable to send PDF to Mathpix")
	    # Show what the API answered so the failure can be diagnosed.
	    print(f"Mathpix response: {response_data}")
	    return None


	def wait_for_processing(pdf_id, poll_interval=5, timeout=None):
	    """Poll Mathpix until the PDF identified by ``pdf_id`` has been processed.

	    Args:
	        pdf_id: Identifier returned by the upload request.
	        poll_interval: Seconds to sleep between two status requests.
	        timeout: Maximum number of seconds to wait, or ``None`` (the default)
	            to wait indefinitely.

	    Returns:
	        ``True`` once the reported status is ``'completed'``. ``False`` when
	        the status is ``'error'``, when the status request itself fails
	        (HTTP error or non-JSON body), or when ``timeout`` elapses.
	    """
	    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
	    headers = {
	        'app_id': APP_ID,
	        'app_key': APP_KEY
	    }
	    deadline = None if timeout is None else time.monotonic() + timeout

	    while True:
	        response = requests.get(url, headers=headers)
	        if not response.ok:
	            # A failed status request carries no usable status; polling it
	            # again would just loop forever.
	            print("Error: Unable to process PDF")
	            print(f"Mathpix returned HTTP {response.status_code}")
	            return False
	        try:
	            response_data = response.json()
	        except ValueError:
	            print("Error: Unable to process PDF")
	            print("Mathpix returned a non-JSON status response")
	            return False

	        status = response_data.get('status', None)
	        if status == 'completed':
	            print("Processing complete")
	            return True
	        if status == 'error':
	            print("Error: Unable to process PDF")
	            return False

	        if deadline is not None and time.monotonic() >= deadline:
	            print("Error: Timed out waiting for Mathpix to process PDF")
	            return False
	        print(f"Status: {status}, waiting for processing to complete")
	        time.sleep(poll_interval)


	def download_processed_file(pdf_id, file_format, output_path):
	    """Download a converted document from Mathpix and write it to disk.

	    Args:
	        pdf_id: Identifier of the processed PDF.
	        file_format: Extension appended to the URL (for example ``'mmd'``).
	        output_path: Destination file; it is written in binary mode.

	    Raises:
	        requests.HTTPError: If Mathpix answers with a 4xx/5xx status.
	            Nothing is written to ``output_path`` in that case.
	    """
	    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
	    headers = {
	        'app_id': APP_ID,
	        'app_key': APP_KEY
	    }

	    response = requests.get(url, headers=headers)
	    # Check the status before touching the filesystem; otherwise the body of
	    # an error response would be saved as if it were the converted document.
	    response.raise_for_status()
	    with open(output_path, 'wb') as output_file:
	        output_file.write(response.content)
	    print(f"File downloaded to {output_path}")


	def clear_terminal():
	    """Clear the terminal by running ``cls`` on Windows and ``clear`` elsewhere."""
	    # The body is indented under the def again so that the function parses.
	    if os.name == 'nt':
	        command = 'cls'
	    else:
	        command = 'clear'
	    os.system(command)


	def print_messages(messages):
	    """Print each chat message as ``Role: content``, coloured by role.

	    Assistant messages are printed in blue and every other role in white.

	    Args:
	        messages: Iterable of dicts that have ``'role'`` and ``'content'`` keys.
	    """
	    # Iterate directly: the position of a message is never used.
	    for message in messages:
	        role = message['role']
	        color = 'blue' if role == 'assistant' else 'white'
	        print(colored(f"{role.capitalize()}: {message['content']}", color))


	def chat_gpt(messages):
	    """Send the conversation to the chat model and record its reply.

	    Args:
	        messages: List of chat message dicts; it is modified in place.

	    Returns:
	        The same ``messages`` list, with the assistant's reply appended.
	    """
	    # The body is indented under the def again so that the function parses.
	    completion = openai.ChatCompletion.create(model=DEFAULT_MODEL,
	                                              messages=messages)
	    reply = {
	        "role": "assistant",
	        "content": completion.choices[0].message.content,
	    }
	    messages.append(reply)
	    return messages


	def start_question_answering(input_path):
	    """Run an interactive terminal Q&A session about a Markdown document.

	    The file is read, truncated to ``TOKEN_LIMIT`` tokens, and sent to the
	    model as the first user message. The loop then clears the terminal,
	    reprints the conversation, reads a question and appends the model's
	    answer. It ends when the user types ``exit``, closes stdin, or presses
	    Ctrl-C.

	    Args:
	        input_path: Path of the Markdown file to discuss.
	    """
	    print("Using model: %s" % DEFAULT_MODEL)

	    # Read as UTF-8 explicitly rather than relying on the platform default.
	    with open(input_path, encoding='utf-8') as fh:
	        text = fh.read().strip()

	    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
	    tokens = tokenizer.encode(text)
	    if len(tokens) > TOKEN_LIMIT:
	        # Tell the user that the model will only see part of the document.
	        print(f"Document has {len(tokens)} tokens, truncating to {TOKEN_LIMIT}")
	        text = tokenizer.decode(tokens[:TOKEN_LIMIT])

	    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
	    messages = [
	        {"role": "system", "content": "You are a helpful assistant."},
	        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
	    ]
	    messages = chat_gpt(messages)
	    while True:
	        clear_terminal()
	        print_messages(messages)

	        try:
	            question = input("User: ").strip()
	        except (EOFError, KeyboardInterrupt):
	            # Closed stdin or Ctrl-C ends the session like an explicit "exit".
	            print()
	            break
	        if question.lower() == 'exit':
	            break
	        if not question:
	            # Nothing was asked; prompt again instead of sending an empty message.
	            continue

	        messages.append({"role": "user", "content": question})
	        messages = chat_gpt(messages)


	def _simplify_mmd(mmd):
	    """Return ``mmd`` with image lines, section macros and escapes simplified."""
	    # Drop lines that start with an image placeholder.
	    kept_lines = [line for line in mmd.split('\n') if not line.startswith('![]')]
	    mmd = '\n'.join(kept_lines)
	    # Replace \section{Title} with "# Title". Only the brace that closes the
	    # section is consumed (one level of nested braces is allowed in the
	    # title), so braces elsewhere in the document are left alone.
	    mmd = re.sub(r'\\section\{((?:[^{}]|\{[^{}]*\})*)\}', r'# \1', mmd)
	    # Remove the backslash in front of $, %, ( and ).
	    for escaped in ('\\$', '\\%', '\\(', '\\)'):
	        mmd = mmd.replace(escaped, escaped[1:])
	    return mmd


	def main():
	    """Convert the PDF named on the command line and start a chat about it.

	    The Mathpix output is stored next to the input as ``<name>.md`` and the
	    text given to the model as ``<name>.simple.md``. Either file is reused
	    when it already exists, so a document is only converted once.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python mathpix2gpt.py <input_pdf_path>")
	        return

	    input_pdf_path = sys.argv[1]
	    # Swap only the real extension. str.replace would rewrite every '.pdf'
	    # in the path and would return the input path itself when there is none.
	    base_path, _ = os.path.splitext(input_pdf_path)
	    output_mmd_path = base_path + '.md'
	    output_simplemd_path = base_path + '.simple.md'

	    if not os.path.exists(output_mmd_path):
	        pdf_id = send_pdf_to_mathpix(input_pdf_path)
	        if not (pdf_id and wait_for_processing(pdf_id)):
	            # Nothing was produced, so there is no Markdown file to open.
	            print("Error: Mathpix conversion failed, cannot start the chat")
	            return
	        download_processed_file(pdf_id, 'mmd', output_mmd_path)

	    if not os.path.exists(output_simplemd_path):
	        with open(output_mmd_path, 'r', encoding='utf-8') as mmd_file:
	            mmd = mmd_file.read()
	        if not IS_SCIENTIFIC:
	            # There's too much LaTeX style escaping for most PDFs, so remove
	            # some of it. It is kept when the document is a scientific paper.
	            mmd = _simplify_mmd(mmd)
	        with open(output_simplemd_path, 'w', encoding='utf-8') as simplemd_file:
	            simplemd_file.write(mmd)

	    start_question_answering(output_simplemd_path)


	if __name__ == '__main__':
	    main()