Skip to content

Instantly share code, notes, and snippets.

Last active April 22, 2024 05:19
Show Gist options
  • Star 89 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save danielgross/3ab4104e14faccc12b49200843adab21 to your computer and use it in GitHub Desktop.
Save danielgross/3ab4104e14faccc12b49200843adab21 to your computer and use it in GitHub Desktop.
import requests
import time
import os
import sys
import openai
import tiktoken
from termcolor import colored
# Assumes you have a file called ~/.openai containing your OpenAI API key.
# Read it through a context manager so the file handle is closed promptly.
with open(os.path.expanduser('~/.openai')) as key_file:
    openai.api_key = key_file.read().strip()

USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.

# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
# the second line containing your app_key
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()
def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload a PDF to Mathpix for conversion and return its ``pdf_id``.

    Args:
        file_path: Path of the PDF file to upload.
        output_format: Format name requested through the ``conversion_formats``
            option of the upload.

    Returns:
        The ``pdf_id`` string from the JSON response, or ``None`` when the
        response contains no ``pdf_id``.
    """
    import json  # local import: only needed to build the options payload

    # NOTE(review): the endpoint was missing from this copy of the script;
    # restored as the Mathpix v3 PDF endpoint -- confirm against the API docs.
    url = 'https://api.mathpix.com/v3/pdf'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    # json.dumps produces the same payload as a hand-written template for
    # simple names, and stays valid JSON for any format name.
    options = {
        'options_json': json.dumps({'conversion_formats': {output_format: True}})
    }
    print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
    with open(file_path, 'rb') as file:
        files = {'file': file}
        response = requests.post(url, headers=headers,
                                 files=files, data=options)
    response_data = response.json()
    if 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id
    print("Error: Unable to send PDF to Mathpix")
    return None
def wait_for_processing(pdf_id, poll_interval=5):
    """Poll Mathpix until the conversion of ``pdf_id`` finishes.

    Args:
        pdf_id: Identifier returned when the PDF was uploaded.
        poll_interval: Seconds to sleep between status requests.

    Returns:
        ``True`` when the reported status is ``'completed'``, ``False`` when it
        is ``'error'``. Any other status keeps the loop polling.
    """
    # NOTE(review): the URL prefix was missing from this copy of the script;
    # restored as the Mathpix v3 PDF status endpoint -- confirm.
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    while True:
        response = requests.get(url, headers=headers)
        response_data = response.json()
        status = response_data.get('status', None)
        if status == 'completed':
            print("Processing complete")
            return True
        elif status == 'error':
            print("Error: Unable to process PDF")
            return False
        print(f"Status: {status}, waiting for processing to complete")
        # Pause between polls so the loop does not hammer the API.
        time.sleep(poll_interval)
def download_processed_file(pdf_id, file_format, output_path):
    """Download a converted document from Mathpix and save it to disk.

    Args:
        pdf_id: Identifier returned when the PDF was uploaded.
        file_format: Extension of the converted output to fetch (e.g. ``'mmd'``).
        output_path: Destination path; the response body is written as bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status. Failing
            here avoids saving an error body as if it were the document.
    """
    # NOTE(review): the URL prefix was missing from this copy of the script;
    # restored as the Mathpix v3 PDF download endpoint -- confirm.
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")
def clear_terminal():
    """Clear the terminal using ``cls`` on Windows and ``clear`` elsewhere."""
    os.system('cls' if os.name == 'nt' else 'clear')
def print_messages(messages):
    """Redraw the whole conversation, one coloured line per message.

    Args:
        messages: List of dicts with ``'role'`` and ``'content'`` keys.
            Assistant messages are printed in blue, all others in white.
    """
    # NOTE(review): clearing the screen before redrawing is assumed from the
    # otherwise unused clear_terminal() helper -- confirm.
    clear_terminal()
    for message in messages:
        color = 'blue' if message['role'] == 'assistant' else 'white'
        print(
            colored(f"{message['role'].capitalize()}: {message['content']}", color))
def chat_gpt(messages):
    """Send the conversation to the chat model and append its reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts. The list is
            modified in place: the assistant's answer is appended to it.

    Returns:
        The same ``messages`` list, now ending with the assistant's reply.
    """
    # NOTE(review): the call arguments were missing from this copy of the
    # script; model and messages are the required ones -- confirm no others
    # (e.g. temperature) were intended.
    result = openai.ChatCompletion.create(
        model=DEFAULT_MODEL,
        messages=messages,
    )
    answer = result.choices[0].message.content
    messages.append({"role": "assistant", "content": answer})
    return messages
def start_question_answering(input_path):
    """Run an interactive question/answer loop over a Markdown document.

    The document is read from ``input_path``, truncated to ``TOKEN_LIMIT``
    tokens, and sent to the model as the first user message. The user is then
    prompted for questions until they type ``exit`` or close the input.

    Args:
        input_path: Path of the Markdown text to discuss.
    """
    print("Using model: %s" % DEFAULT_MODEL)
    with open(input_path) as fh:
        data = fh.read()
    text = data.strip()
    # Truncate on token boundaries so the prompt fits the model's context.
    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)
    # NOTE(review): where the conversation is displayed was not visible in this
    # copy of the script; it is redrawn here after every model reply -- confirm.
    print_messages(messages)
    while True:
        try:
            question = input("User: ")
        except (EOFError, KeyboardInterrupt):
            # Treat Ctrl-D / Ctrl-C at the prompt like typing "exit".
            break
        if question.strip().lower() == 'exit':
            break
        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)
        print_messages(messages)
def _simplify_mmd(mmd):
    """Return Mathpix Markdown with image lines and some LaTeX escaping removed."""
    import re  # local import: only this helper needs it

    # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
    # Keep it if the paper is a scientific paper.
    mmd = '\n'.join(
        line for line in mmd.split('\n') if not line.startswith('![]'))
    # Replace \section{Title} with "# Title". Only the section's own braces are
    # removed; stripping every "}" would corrupt other braced expressions.
    mmd = re.sub(r'\\section\{([^}]*)\}', r'# \1', mmd)
    # Remove the backslash that Mathpix adds to escape $, %, (, etc.
    for escaped, plain in (('\\$', '$'), ('\\%', '%'), ('\\(', '('), ('\\)', ')')):
        mmd = mmd.replace(escaped, plain)
    return mmd


def main():
    """Convert the PDF named on the command line and start a chat about it.

    The Mathpix output and its simplified form are cached next to the PDF, so
    an existing file skips the corresponding step on later runs.
    """
    if len(sys.argv) < 2:
        print(f"Usage: python {sys.argv[0]} <input_pdf_path>")
        sys.exit(1)
    input_pdf_path = sys.argv[1]
    # Swap only the real extension; str.replace would also rewrite ".pdf"
    # occurring elsewhere in the path.
    base_path, _ = os.path.splitext(input_pdf_path)
    output_mmd_path = base_path + '.md'
    # NOTE(review): the suffix was truncated in this copy of the script;
    # '.simple.md' is assumed -- confirm.
    output_simplemd_path = base_path + '.simple.md'
    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        if not (pdf_id and wait_for_processing(pdf_id)):
            # Without a converted file there is nothing to read below.
            print("Error: Mathpix conversion failed")
            sys.exit(1)
        download_processed_file(pdf_id, 'mmd', output_mmd_path)
    if not os.path.exists(output_simplemd_path):
        with open(output_mmd_path, 'r') as mmd_file:
            mmd = mmd_file.read()
        with open(output_simplemd_path, 'w') as simplemd_file:
            simplemd_file.write(_simplify_mmd(mmd))
    # NOTE(review): the final step was missing from this copy of the script;
    # starting the chat on the simplified file is assumed -- confirm.
    start_question_answering(output_simplemd_path)
# Run the tool only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Copy link

The Mathpix part should be extracted into a separate script, because it is very useful for many other things. I wrote a Mathpix API client like this in Bash, but the Python version is much more readable.

Copy link

Great script. It's very helpful for me.
However, the output_format setting is not working correctly.
I believe we need to modify it for a specific file conversion process.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment