Skip to content

Instantly share code, notes, and snippets.

@pirate
Created January 12, 2024 14:43
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pirate/080c65d6eb2464341be728ea12967e59 to your computer and use it in GitHub Desktop.
Save pirate/080c65d6eb2464341be728ea12967e59 to your computer and use it in GitHub Desktop.
Automatically detect song/video title/artist/album/metadata captured in screenshots using GPT-4-vision via the OpenAI API.
#!/usr/bin/env python3
# Script to extract song/video title, artist, album, etc. metadata from screenshots w/ GPT-4.
### Example Usage: ###############################################################################
#
# ➜ ~/Desktop # python3 music_in_screnshots.py --prompt=prompt.txt --attach=spotify_screenshot.PNG
# {
# "found_prominent_media": true,
# "all_strings": [
# "1:21",
# "open.spotify.com",
# "Spotify",
# "OPEN APP",
# "Deuter",
# "Heartnotes 2.0 432 Hz - 24-bit digital",
# "Steven Halpern, David Darling",
# "Call Within (Instrumental Meditation ...",
# "Manose",
# "417 Hz - Undoing Emotional Patterns",
# "Kev Thompson",
# "Oceanic World of Atlas 3D",
# "Dreamflute Doroth\u00e9e Fr\u00f6ller",
# "Pure Tranquility (Theta Binaural)",
# "NREM",
# "Binaural Alpha Sinus 110Hz - 118Hz",
# "Binaural Shapers",
# "432Hz Miracle Tone: Expanding Wind...",
# "PowerThoughts Meditation Club",
# "111hz Michael: Victory from Fear",
# "Ted Winslow",
# "Listen With Bamboo Flute, ..."
# ],
# "title": "Call Within (Instrumental Meditation ...",
# "artist": "Manose",
# "album": null,
# "now_playing_position": null,
# "playback_time_remaining": null,
# "total_duration": null,
# "foreground_app": "browser",
# "browser_active_url": "open.spotify.com",
# "audio_out_device": null,
# "operating_system": "ios",
# "prominent_colors": [
# "black",
# "green",
# "white"
# ],
# "description": "screenshot of iOS browser showing Spotify web player interface with a song paused",
# "warnings": [
# "could not determine complete song title due to truncation",
# "could not determine now playing position, playback time remaining, or total duration from the image"
# ]
# }
# ➜ ~/Desktop # python3 music_in_screnshots.py --prompt=prompt.txt --attach=playlist.PNG
# {
# "found_prominent_media": true,
# "all_strings": [
# "4:19",
# "Across the Sea",
# "Middle Sky Boom & Eliezer",
# "Goodthing",
# "Leon Vynehall",
# "It's Just (House of Dupree)",
# "Pier Children",
# "Caint Use My Phone (Suite)",
# "Erykah Badu",
# "663 songs, 116 hours 42 minutes",
# "Featured Artists",
# "See All",
# "Jordan Rakei",
# "Sevdaliza",
# "Big Muff",
# "Butterflies (Demo Version)",
# "Listen Now",
# "Browse",
# "Radio",
# "Library",
# "Search"
# ],
# "title": "Butterflies (Demo Version)",
# "artist": null,
# "album": null,
# "now_playing_position": null,
# "playback_time_remaining": null,
# "total_duration": null,
# "foreground_app": "applemusic",
# "browser_active_url": null,
# "audio_out_device": null,
# "operating_system": "ios",
# "prominent_colors": [
# "black",
# "red",
# "purple",
# "gray",
# "white"
# ],
# "description": "screenshot of iOS Apple Music app with 'Butterflies (Demo Version)' paused at the bottom menu",
# "warnings": [
# "artist not visible for currently highlighted song",
# "album not visible for currently highlighted song",
# "playback position, time remaining, and total duration not visible"
# ]
# }
########################################################################################
import os
import sys
import json
import base64
import argparse
import requests
import pprint
from pathlib import Path
# Shared pretty-printer; call_openai_api uses it to dump the raw API response
# when the reply cannot be parsed.
pp = pprint.PrettyPrinter(indent=4)
# Placeholder key used when the OPENAI_API_KEY environment variable is unset or empty.
DEFAULT_OPENAI_API_KEY = 'api-key-here'
# Default value sent as "max_tokens" in the chat completion request.
MAX_RESPONSE_TOKENS = 700
# Prompt used by main() when no --prompt file is given. It asks the model to
# reply with JSON in one of two shapes: a full media record when a song/video is
# prominent, or a reduced record with "found_prominent_media": false otherwise.
DEFAULT_PROMPT = """\
Find the now playing or most prominent song (or video). Respond using the JSON format below.
Record all the raw strings that appear in the image in the "all_strings" field (stripping newlines).
If any values are partially obscured, use them as-is and add a warning.
If any values are fully obscured, unreadable, or unavailable, use `null` as their value and add a warning.
Sometimes Bluetooth or Airplay devices names appear in images to indicate they are being used for audio out (e.g. headphones, speakers, cars, etc.).
Don't confuse those device names with title/artist/album values, put any device name (if present) in the audio_out_device field only.
{
"found_prominent_media": true,
"all_strings": ["all prominent strings", "seen in image", "9:25", "T-Mobile LTE", "23%", "Jazz", "The Beatles", "Yellow Submarine - EP", "Big Apple Records Ltd", "other songs/artists/albums visible", "fragments of titles/artists/timestamps", ...],
"title": "detected now playing song or video title here" | null,
"artist": "detected now playing artist name here" | null,
"album": "detected now playing album name here" | null,
"now_playing_position": "hours:minutes:seconds" | "3:59:59" | "1:23" | ... | null,
"playback_time_remaining": "-hours:minutes:seconds" | "-1:32:03" | "-1:23" | ... | null,
"total_duration": "hours:minutes:seconds" | "3:59:59" | "4:59" | ... | null,
"foreground_app": "lockscreen" | "controlcenter" | "browser" | "youtube" | "soundcloud" | "instagram" | "applemusic" | "spotify" | "shazam" | "photos" | "imessage" | "signal" | ... | null,
"browser_active_url": "youtube.com/watch?v=w_5K8dRt7Bs" | "www.instagram.com" | ... | null,
"audio_out_device": "AirPods Pro" | "RAM Promaster" | "Nickpods" | "BrickBedroomTV" | "Cardo" | "Bathpod" | "Apple TV" | "PLT_BACKBEAT_PRO" | "Minirig" | ... | null,
"operating_system": "macos" | "ios" | "ipados" | "windows" | "android" | ... | null,
"prominent_colors": ["blue", "white", "pink", ...],
"description": "screenshot of iOS Apple Music app showing song playing" | "screenshot of desktop macOS browser showing a Soundcloud mix" | "picture of a physical vinyl record cover" | ...,
"warnings": ["title partially obscured by edge of screen", "not enough confidence to guess foreground app", "multiple songs seen in image, no particular song is selected", "album/artist inferred from different area than title", ...]
}
If no media is shown clearly playing in the UI, respond with an error response like so:
{
"found_prominent_media": false,
"all_strings": ["all prominent strings", "depicted in image", "1:24", "AT&T 5G", "message recipient", "website text...", "other content...", ...],
"foreground_app": "lockscreen" | "browser" | "messenger" | "photos" | "twitter" | "maps" | "mail" | ... | null,
"browser_active_url": "www.instagram.com" | "https://example.com" | "chase.com" | ... | null,
"audio_out_device": "AirPods Pro" | "RAM Promaster" | "Nickpods" | "BrickBedroomTV" | "Cardo" | "Bathpod" | "Apple TV" | "PLT_BACKBEAT_PRO" | "Minirig" | ... | null,
"operating_system": "macos" | "ios" | "ipados" | "windows" | "android" | ... | null,
"prominent_colors": ["red", "pink", "orange", ...],
"description": "screenshot of an iPhone lockscreen with nothing playing" | "screenshot of a browser showing a news article" | "picture of a post-it note with a cat drawn on it" | ...,
"warnings": ["could not find any music or videos depicted", "low confidence in audio_out_device guess", "image contained non-English characters or symbols", "could not find any text at all in the image", ...]
}
"""
def detect_mimetype(path: str) -> str:
extension = Path(path or 'default.png').suffix.lower().strip('.').replace('jpg', 'jpeg')
mimetype = f'image/{extension}'
return mimetype
def encode_image(image_path: str | None) -> str:
if not image_path:
return None
# print("[*] Encoding attachments into base64...")
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
return base64_image
def call_openai_api(prompt: str, image: str | None=None, mimetype='image/png', api_key: str=DEFAULT_OPENAI_API_KEY, max_tokens: int=MAX_RESPONSE_TOKENS, model: str='gpt-4-vision-preview') -> dict:
    """Send a prompt (and optionally an image) to the chat completions endpoint.

    The first choice's message content is expected to contain a JSON object,
    possibly wrapped in a Markdown code fence or surrounded by extra text.
    Everything between the first ``{`` and the last ``}`` is parsed as JSON.

    Args:
        prompt: Text instructions for the model.
        image: Base64-encoded image data, or ``None``/empty to send text only.
        mimetype: MIME type placed in the image data URL.
        api_key: Bearer token sent in the ``Authorization`` header.
        max_tokens: Value sent as ``max_tokens`` in the request.
        model: Model name sent in the request.

    Returns:
        The JSON value parsed from the model's reply.

    Raises:
        Exception: Any error raised while extracting or parsing the reply
            (e.g. ``KeyError`` for an error response, ``json.JSONDecodeError``
            for malformed output). The raw response is pretty-printed first.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    content = [
        {
            "type": "text",
            "text": prompt,
        },
    ]
    if image:
        # Only attach an image part when there is image data; otherwise the
        # request would carry the bogus data URL "data:...;base64,None".
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mimetype};base64,{image}"
            }
        })
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": content,
            }
        ],
        "max_tokens": max_tokens,
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response_json = response.json()
    try:
        response_message = response_json["choices"][0]["message"]
        # Remove an optional Markdown fence. str.strip('```json') would treat
        # its argument as a set of characters, so use prefix/suffix removal.
        response_body = response_message["content"].strip()
        response_body = response_body.removeprefix('```json').removeprefix('```').removesuffix('```').strip()
        # Keep only the text between the first '{' and the last '}' so that
        # any prose around the JSON object is ignored.
        answer_json = '{' + response_body.split('{', 1)[-1].rsplit('}', 1)[0] + '}'
        parsed_json = json.loads(answer_json)
    except Exception:
        # Show the raw response to aid debugging, then propagate the error.
        pp.pprint(response_json)
        raise
    return parsed_json
def main():
    """Command-line entry point.

    Parses ``--prompt`` and ``--attach``, sends the prompt and optional image
    through call_openai_api, and prints the parsed result to stdout as JSON.
    """
    cli = argparse.ArgumentParser(description='Query ChatGPT-4 with an optional image attachment.')
    cli.add_argument('--prompt', type=str, help='File path for the text prompt.')
    cli.add_argument('--attach', type=str, help='File path for the image to attach.')
    options = cli.parse_args()

    # The environment variable wins; an unset or empty value falls back to the default.
    api_key = os.getenv('OPENAI_API_KEY') or DEFAULT_OPENAI_API_KEY

    if options.prompt:
        prompt_text = Path(options.prompt).read_text()
    else:
        prompt_text = DEFAULT_PROMPT

    # Query the API and write the result to stdout.
    result = call_openai_api(
        prompt=prompt_text,
        image=encode_image(options.attach),
        mimetype=detect_mimetype(options.attach),
        api_key=api_key,
    )
    print(json.dumps(result, indent=4))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment