Skip to content

Instantly share code, notes, and snippets.

@Erol444
Created June 13, 2024 09:09
Show Gist options
  • Save Erol444/9fa445a98853c5606a296611596fffe3 to your computer and use it in GitHub Desktop.
Save Erol444/9fa445a98853c5606a296611596fffe3 to your computer and use it in GitHub Desktop.
openai_parse_pdf_output_json.py
from openai import OpenAI
from openai.types.beta.threads.message_create_params import Attachment, AttachmentToolFileSearch
import os
from dotenv import load_dotenv
load_dotenv()
import json
# Build the API client. The key is read from the OPENAI_KEY environment
# variable (load_dotenv() above makes values from a local .env file visible).
client = OpenAI(api_key=os.getenv("OPENAI_KEY"))
# Upload the PDF to the OpenAI API so it can be attached to a message.
# The context manager guarantees the local file handle is closed once the
# upload call returns (or raises), instead of leaking it.
with open('my_inference.pdf', 'rb') as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose='assistants',
    )
# Start a fresh conversation thread.
thread = client.beta.threads.create()
# Post the user's question to the thread, attaching the uploaded PDF and
# enabling the file_search tool for it.
prompt = "Tell me 5 interesting things that you find in this PDF file."
pdf_attachment = Attachment(
    file_id=file.id,
    tools=[AttachmentToolFileSearch(type='file_search')],
)
client.beta.threads.messages.create(
    thread_id=thread.id,
    role='user',
    content=prompt,
    attachments=[pdf_attachment],
)
# Reuse the assistant with this name if it already exists; otherwise create it.
ASSISTANT_NAME = 'My Assistant Name'
assistants = client.beta.assistants.list()
myAssistant = next(
    (candidate for candidate in assistants if candidate.name == ASSISTANT_NAME),
    None,
)
if myAssistant is None:
    # No assistant with that name yet, so create one.
    # NOTE: response_format={"type": "json_object"} was tried by the original
    # author and isn't possible here, so JSON output is only requested through
    # the instructions text.
    myAssistant = client.beta.assistants.create(
        model='gpt-4o',
        description='You are a PDF retrieval assistant.',
        instructions="You are a helpful assistant designed to output only JSON. Find information from the text and files provided.",
        tools=[{"type": "file_search"}],
        name=ASSISTANT_NAME,
    )
# Run the thread with the assistant and block until the run reaches a
# terminal state.
# NOTE(review): per the Assistants API docs, run-level `instructions` replaces
# the assistant's own instructions, whereas `additional_instructions` appends
# to them. Confirm which one is intended here.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=myAssistant.id,
    instructions="Please output in JSON format",
    timeout=300,  # 5 minutes
    # response_format={"type": "json_object"} isn't possible here either.
)
# Anything other than "completed" (e.g. an issue on the OpenAI server) means
# there is no usable reply. Fail with a readable message that includes the
# API-reported error details.
if run.status != "completed":
    raise RuntimeError(
        f"Run failed: status={run.status!r}, last_error={run.last_error!r}"
    )
# Fetch the thread's messages and take the first one listed, which this script
# treats as the assistant's reply (presumably newest-first ordering - confirm
# against the API default). Only the first item is pulled from the cursor, so
# later pages are never requested.
messages_cursor = client.beta.threads.messages.list(thread_id=thread.id)
message = next(iter(messages_cursor), None)
if message is None or not message.content:
    raise RuntimeError('Thread contains no message content to parse')
# Validate explicitly rather than with `assert`, which is stripped under -O.
reply_part = message.content[0]
if reply_part.type != "text":
    raise RuntimeError(f'Expected a text reply, got {reply_part.type!r}')
# Output of the Assistant
res_txt = reply_part.text.value
# Because the assistant can't be put into JSON mode while "file_search" is in
# use, its reply is free text that merely contains a JSON object, e.g.
# "Of course, here's the parsed text: {useful_JSON_here}" or a ```json fenced
# block. Keep only the span from the first '{' to the last '}'. That also
# drops any Markdown fence, surrounding whitespace or chatter, so no separate
# prefix/suffix trimming is needed.
json_start = res_txt.find('{')
json_end = res_txt.rfind('}')
if json_start == -1 or json_end < json_start:
    # Fail with the offending text instead of an opaque decode error.
    raise ValueError(f'No JSON object found in assistant reply: {res_txt!r}')
res_txt = res_txt[json_start:json_end + 1]
# Parse the JSON output (raises json.JSONDecodeError if the span is not valid JSON)
data = json.loads(res_txt)
print(data)
# Remove the uploaded file now that it is no longer needed, to preserve
# storage space (max 100gb/company).
delete_ok = client.files.delete(file_id=file.id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment