Skip to content

Instantly share code, notes, and snippets.

@jeffehobbs
Created June 19, 2024 02:04
Show Gist options
  • Save jeffehobbs/db292a3dc5a5577469b0cd18ff300825 to your computer and use it in GitHub Desktop.
Save jeffehobbs/db292a3dc5a5577469b0cd18ff300825 to your computer and use it in GitHub Desktop.
image upload and detection flask app
# owl.py | jhobbs@advance.net
#
# GOAL: upload a PDF or image, convert it to a JPEG where needed, and run an OpenAI vision request on it
#
# TO DO:
# 1. [X] upload PDF or image
# 2. [X] change PDF to image
# 3. [X] upload default prompt
# 4. [X] perform vision API call
# 5. [X] splat response back to client
# 6. [X] fix HEIC file handling
#
# note to self, install this layer when moving to AWS:
# https://github.com/jeylabs/aws-lambda-poppler-layer/releases
import os, configparser, base64, requests
from flask import Flask, Response, request, json, render_template, jsonify
from pdf2image import convert_from_path
from PIL import Image
from pillow_heif import register_heif_opener
# globals
# Directory where uploads (and the JPEGs converted from them) are written.
UPLOAD_PATH = '/tmp/'
# Directory containing this script; secrets.txt is looked up relative to it.
SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
# secrets.txt is parsed as an INI file. The OpenAI key is the "apikey" option
# of its [openai] section; config.get raises at import time if it is absent.
config = configparser.ConfigParser()
config.read(SCRIPT_PATH +'/secrets.txt')
OPENAI_API_KEY = config.get('openai', 'apikey')
# Flask application object the route decorators below register against.
app = Flask(__name__)
@app.route('/')
def main():
    """Render the ``index.html`` template for the site root."""
    page = render_template("index.html")
    return page
@app.route('/description', methods=['POST'])
def success():
    """Save an uploaded file, convert it if needed, and render the model's reply.

    Reads the uploaded ``file`` plus the ``prompt`` and ``max_tokens`` form
    fields from the current request. PDF and HEIC uploads are converted to
    JPEG first; only the first resulting image is sent to ``describe_image``.

    Returns:
        The rendered ``index.html`` when no usable filename was submitted.
        Otherwise the rendered ``response.html``: with the model's text on
        success, with an explanatory message and status 400 when the upload
        produced no image, or with the API's error message and status 502
        when the API reply carries no completion.
    """
    # The route only accepts POST, so no request.method check is needed.
    f = request.files['file']
    prompt = request.form['prompt']
    max_tokens = int(request.form['max_tokens'])

    # f.filename is chosen by the client. Keep only its final path component
    # so a name such as "../../home/user/.bashrc" cannot escape UPLOAD_PATH.
    filename = os.path.basename(f.filename or '')
    if filename in ('', '.', '..'):
        return render_template("index.html")

    # Store the file with a lower-cased extension: the converters derive
    # their output names from a lower-case ".pdf" / "heic" suffix, and the
    # dispatch below should treat "SCAN.PDF" the same as "scan.pdf".
    stem, ext = os.path.splitext(filename)
    ext = ext.lower()
    image_path = os.path.join(UPLOAD_PATH, stem + ext)
    f.save(image_path)

    # Dispatch on the real extension, not on a substring of the whole path
    # (a substring test would send "pdf_cover.png" to the PDF converter).
    if ext == '.pdf':
        image_set = convert_pdf(image_path)
    elif ext == '.heic':
        image_set = convert_heic(image_path)
    else:
        image_set = [image_path]

    if not image_set:
        # e.g. a PDF from which no page could be rendered.
        message = "No image could be extracted from the uploaded file."
        return render_template("response.html", response=message), 400

    description = describe_image(image_set[0], prompt, max_tokens)
    try:
        response = description['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # Failed API calls come back as {"error": {"message": ...}} with no
        # "choices" key; show that message instead of crashing with a 500.
        error = description.get('error') if isinstance(description, dict) else None
        message = error.get('message') if isinstance(error, dict) else None
        if not message:
            message = "The vision API returned an unexpected response."
        return render_template("response.html", response=message), 502

    return render_template("response.html", response=response)
def convert_pdf(pdf_path):
    """Render each page of a PDF to its own JPEG file.

    Args:
        pdf_path: Path of the PDF on disk.

    Returns:
        List of the JPEG paths written, in the order ``convert_from_path``
        returned the pages. Each is named ``<pdf path without extension>_<i>.jpg``
        with ``i`` starting at 0.
    """
    # Build output names from the path stem. str.replace('.pdf', ...) is
    # case-sensitive and rewrites every occurrence, so "scan.PDF" would map
    # every page back onto the source PDF itself.
    stem, _ext = os.path.splitext(pdf_path)
    image_set = []
    for i, page in enumerate(convert_from_path(pdf_path)):
        image_path = f"{stem}_{i}.jpg"
        page.save(image_path, 'JPEG')
        image_set.append(image_path)
    return image_set
def convert_heic(heic_path):
    """Convert a HEIC image to an RGB JPEG stored under the same stem.

    Args:
        heic_path: Path of the HEIC file on disk.

    Returns:
        A one-element list holding the path of the JPEG that was written
        (a list so the result has the same shape as ``convert_pdf``'s).
    """
    # Lets PIL's Image.open read HEIF/HEIC files.
    register_heif_opener()
    # Swap only the extension. str.replace('heic', 'jpg') would also rewrite
    # "heic" inside directory or file names, and would leave "IMG.HEIC"
    # unchanged so the output would overwrite the input.
    stem, _ext = os.path.splitext(heic_path)
    filename = stem + '.jpg'
    # Context manager closes the source file once the JPEG is written.
    with Image.open(heic_path) as image:
        image.convert('RGB').save(filename, 'JPEG')
    return [filename]
def return_apikey(section, key):
    """Return the value of ``key`` in ``section`` of the ``secrets.txt`` beside this script.

    The file is re-read on every call. ``configparser`` raises
    ``NoSectionError`` / ``NoOptionError`` when the section or key is absent.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    parser = configparser.ConfigParser()
    parser.read(script_dir + '/secrets.txt')
    return parser.get(section, key)
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes base64-encoded as text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def describe_image(image_path, prompt, max_tokens, timeout=120):
    """Send one image and a text prompt to the OpenAI chat completions API.

    Args:
        image_path: Path of the image file to send.
        prompt: Text placed alongside the image in the user message.
        max_tokens: Value forwarded as the request's ``max_tokens``.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. Without it a stalled connection would
            block the caller indefinitely.

    Returns:
        The decoded JSON body of the API response. This is returned as-is
        even when it describes an error, so callers must check for
        ``choices`` themselves.
    """
    # Local import: mimetypes is not among the file-level imports.
    import mimetypes

    # TODO: only a single image is sent; extend to a whole image set.
    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual type (e.g. image/png) and
    # fall back to JPEG when the extension is unknown or not an image.
    mime_type, _encoding = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "auto"
                        }
                    }
                ]
            }
        ],
        "max_tokens": max_tokens
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Decode the body once and reuse it for both the debug dump and the return.
    result = response.json()
    print(json.dumps(result, indent=4))
    return result
# main function
if __name__ == '__main__':
    # debug=True turns on Werkzeug's interactive debugger, which lets anyone
    # who can reach the port run arbitrary Python. The server listens on
    # every interface (0.0.0.0), so debug stays off unless it is requested
    # explicitly, e.g. FLASK_DEBUG=1 python owl.py
    debug_enabled = os.environ.get('FLASK_DEBUG', '0').lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
#fin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment