pirate/music_in_screenshots.py

## music_in_screenshots.py
#!/usr/bin/env python3

# Script to extract song/video title, artist, album, etc. metadata from screenshots w/ GPT-4.

### Example Usage: ###############################################################################
#
# ➜ ~/Desktop # python3 music_in_screenshots.py --prompt=prompt.txt --attach=spotify_screenshot.PNG
# {
#     "found_prominent_media": true,
#     "all_strings": [
#         "1:21",
#         "open.spotify.com",
#         "Spotify",
#         "OPEN APP",
#         "Deuter",
#         "Heartnotes 2.0 432 Hz - 24-bit digital",
#         "Steven Halpern, David Darling",
#         "Call Within (Instrumental Meditation ...",
#         "Manose",
#         "417 Hz - Undoing Emotional Patterns",
#         "Kev Thompson",
#         "Oceanic World of Atlas 3D",
#         "Dreamflute Doroth\u00e9e Fr\u00f6ller",
#         "Pure Tranquility (Theta Binaural)",
#         "NREM",
#         "Binaural Alpha Sinus 110Hz - 118Hz",
#         "Binaural Shapers",
#         "432Hz Miracle Tone: Expanding Wind...",
#         "PowerThoughts Meditation Club",
#         "111hz Michael: Victory from Fear",
#         "Ted Winslow",
#         "Listen With Bamboo Flute, ..."
#     ],
#     "title": "Call Within (Instrumental Meditation ...",
#     "artist": "Manose",
#     "album": null,
#     "now_playing_position": null,
#     "playback_time_remaining": null,
#     "total_duration": null,
#     "foreground_app": "browser",
#     "browser_active_url": "open.spotify.com",
#     "audio_out_device": null,
#     "operating_system": "ios",
#     "prominent_colors": [
#         "black",
#         "green",
#         "white"
#     ],
#     "description": "screenshot of iOS browser showing Spotify web player interface with a song paused",
#     "warnings": [
#         "could not determine complete song title due to truncation",
#         "could not determine now playing position, playback time remaining, or total duration from the image"
#     ]
# }
# ➜ ~/Desktop # python3 music_in_screenshots.py --prompt=prompt.txt --attach=playlist.PNG
# {
#     "found_prominent_media": true,
#     "all_strings": [
#         "4:19",
#         "Across the Sea",
#         "Middle Sky Boom & Eliezer",
#         "Goodthing",
#         "Leon Vynehall",
#         "It's Just (House of Dupree)",
#         "Pier Children",
#         "Caint Use My Phone (Suite)",
#         "Erykah Badu",
#         "663 songs, 116 hours 42 minutes",
#         "Featured Artists",
#         "See All",
#         "Jordan Rakei",
#         "Sevdaliza",
#         "Big Muff",
#         "Butterflies (Demo Version)",
#         "Listen Now",
#         "Browse",
#         "Radio",
#         "Library",
#         "Search"
#     ],
#     "title": "Butterflies (Demo Version)",
#     "artist": null,
#     "album": null,
#     "now_playing_position": null,
#     "playback_time_remaining": null,
#     "total_duration": null,
#     "foreground_app": "applemusic",
#     "browser_active_url": null,
#     "audio_out_device": null,
#     "operating_system": "ios",
#     "prominent_colors": [
#         "black",
#         "red",
#         "purple",
#         "gray",
#         "white"
#     ],
#     "description": "screenshot of iOS Apple Music app with 'Butterflies (Demo Version)' paused at the bottom menu",
#     "warnings": [
#         "artist not visible for currently highlighted song",
#         "album not visible for currently highlighted song",
#         "playback position, time remaining, and total duration not visible"
#     ]
# }
########################################################################################

import os
import sys
import json
import base64
import argparse
import requests
import pprint
from pathlib import Path

# Shared pretty-printer; call_openai_api() uses it to dump the raw API response when parsing fails.
pp = pprint.PrettyPrinter(indent=4)


# Placeholder key used when the OPENAI_API_KEY environment variable is unset or empty (see main()).
DEFAULT_OPENAI_API_KEY = 'api-key-here'

# Default value of the `max_tokens` field sent with each chat completion request.
MAX_RESPONSE_TOKENS = 700

# Instructions sent to the model when no --prompt file is supplied (main() falls back to this).
# It describes two response shapes: one for when media is found and one for when it is not.
# The `|` alternatives and `...` inside the braces are schema notation for the model, not valid JSON.
# The backslash right after the opening quotes keeps the text from starting with a newline.
DEFAULT_PROMPT = """\
Find the now playing or most prominent song (or video). Respond using the JSON format below.

Record all the raw strings that appear in the image in the "all_strings" field (stripping newlines).
If any values are partially obscured, use them as-is and add a warning.
If any values are fully obscured, unreadable, or unavailable, use `null` as their value and add a warning.

Sometimes Bluetooth or Airplay devices names appear in images to indicate they are being used for audio out (e.g. headphones, speakers, cars, etc.).
Don't confuse those device names with title/artist/album values, put any device name (if present) in the audio_out_device field only.

{
    "found_prominent_media": true,
    "all_strings": ["all prominent strings", "seen in image", "9:25", "T-Mobile LTE", "23%", "Jazz", "The Beatles", "Yellow Submarine - EP", "Big Apple Records Ltd", "other songs/artists/albums visible", "fragments of titles/artists/timestamps", ...],
    "title": "detected now playing song or video title here" | null,
    "artist": "detected now playing artist name here" | null,
    "album": "detected now playing album name here" | null,
    "now_playing_position": "hours:minutes:seconds" | "3:59:59" | "1:23" | ... | null,
    "playback_time_remaining": "-hours:minutes:seconds" | "-1:32:03" | "-1:23" | ... | null,
    "total_duration": "hours:minutes:seconds" | "3:59:59" | "4:59" | ... | null,
    "foreground_app": "lockscreen" | "controlcenter" | "browser" | "youtube" | "soundcloud" | "instagram" | "applemusic" | "spotify" | "shazam" | "photos" | "imessage" | "signal" | ... | null,
    "browser_active_url": "youtube.com/watch?v=w_5K8dRt7Bs" | "www.instagram.com" | ... | null,
    "audio_out_device": "AirPods Pro" | "RAM Promaster" | "Nickpods" | "BrickBedroomTV" | "Cardo" | "Bathpod" | "Apple TV" | "PLT_BACKBEAT_PRO" | "Minirig" | ... | null,
    "operating_system": "macos" | "ios" | "ipados" | "windows" | "android" | ... | null,
    "prominent_colors": ["blue", "white", "pink", ...],
    "description": "screenshot of iOS Apple Music app showing song playing" | "screenshot of desktop macOS browser showing a Soundcloud mix" | "picture of a physical vinyl record cover" | ...,
    "warnings": ["title partially obscured by edge of screen", "not enough confidence to guess foreground app", "multiple songs seen in image, no particular song is selected", "album/artist inferred from different area than title", ...]
}


If no media is shown clearly playing in the UI, respond with an error response like so:

{
    "found_prominent_media": false,
    "all_strings": ["all prominent strings", "depicted in image", "1:24", "AT&T 5G", "message recipient", "website text...", "other content...", ...],
    "foreground_app": "lockscreen" | "browser" | "messenger" | "photos" | "twitter" | "maps" | "mail" | ... | null,
    "browser_active_url": "www.instagram.com" | "https://example.com" | "chase.com" | ... | null,
    "audio_out_device": "AirPods Pro" | "RAM Promaster" | "Nickpods" | "BrickBedroomTV" | "Cardo" | "Bathpod" | "Apple TV" | "PLT_BACKBEAT_PRO" | "Minirig" | ... | null,
    "operating_system": "macos" | "ios" | "ipados" | "windows" | "android" | ... | null,
    "prominent_colors": ["red", "pink", "orange", ...],
    "description": "screenshot of an iPhone lockscreen with nothing playing" | "screenshot of a browser showing a news article" | "picture of a post-it note with a cat drawn on it" | ...,
    "warnings": ["could not find any music or videos depicted", "low confidence in audio_out_device guess", "image contained non-English characters or symbols", "could not find any text at all in the image", ...]
}
"""

def detect_mimetype(path: str) -> str:
    extension = Path(path or 'default.png').suffix.lower().strip('.').replace('jpg', 'jpeg')
    mimetype = f'image/{extension}'
    return mimetype


def encode_image(image_path: str | None) -> str:
    if not image_path:
        return None

    # print("[*] Encoding attachments into base64...")
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    return base64_image


def _extract_json_object(text: str) -> dict:
    """Parse the span from the first ``{`` to the last ``}`` in *text* as JSON."""
    # Cutting at the outermost braces already discards Markdown code fences and any
    # chatter around the object, so no separate fence stripping is needed.
    inner = text.split('{', 1)[-1].rsplit('}', 1)[0]
    return json.loads('{' + inner + '}')


def call_openai_api(
    prompt: str,
    image: str | None = None,
    mimetype: str = 'image/png',
    api_key: str = DEFAULT_OPENAI_API_KEY,
    max_tokens: int = MAX_RESPONSE_TOKENS,
    model: str = 'gpt-4-vision-preview',
    timeout: float = 120.0,
) -> dict:
    """Send a prompt, plus an optional image, to the chat completions endpoint and parse the JSON reply.

    Args:
        prompt: Instruction text sent as the user message.
        image: Base64-encoded image data, or ``None`` to send the prompt alone.
        mimetype: MIME type used in the image's ``data:`` URL.
        api_key: Bearer token placed in the ``Authorization`` header.
        max_tokens: Value of the request's ``max_tokens`` field.
        model: Value of the request's ``model`` field.
            NOTE(review): confirm this default model id is still available.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The JSON object found in the first choice's message content.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        Exception: Any error raised while reading or parsing the reply is
            re-raised after the raw response has been pretty-printed.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    content = [{"type": "text", "text": prompt}]
    if image is not None:
        # Only attach an image part when there is image data; previously a missing
        # image was sent as the bogus URL "data:<mimetype>;base64,None".
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:{mimetype};base64,{image}"},
        })

    payload = {
        "model": model,
        "messages": [{"role": "user", "content": content}],
        "max_tokens": max_tokens,
    }

    # An explicit timeout keeps the script from hanging forever on a stalled connection.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    response_json = response.json()

    try:
        response_message = response_json["choices"][0]["message"]
        parsed_json = _extract_json_object(response_message["content"])
    except Exception:
        # Show the raw response (e.g. an API error object) to help debugging, then re-raise unchanged.
        pp.pprint(response_json)
        raise

    return parsed_json


def main():
    """Command-line entry point: parse arguments, query the API, print the parsed JSON."""
    cli = argparse.ArgumentParser(description='Query ChatGPT-4 with an optional image attachment.')
    for flag, help_text in (
        ('--prompt', 'File path for the text prompt.'),
        ('--attach', 'File path for the image to attach.'),
    ):
        cli.add_argument(flag, type=str, help=help_text)
    options = cli.parse_args()

    # An unset or empty environment variable falls back to the module-level placeholder key.
    key = os.getenv('OPENAI_API_KEY') or DEFAULT_OPENAI_API_KEY

    if options.prompt:
        instructions = Path(options.prompt).read_text()
    else:
        instructions = DEFAULT_PROMPT

    answer = call_openai_api(
        prompt=instructions,
        image=encode_image(options.attach),
        mimetype=detect_mimetype(options.attach),
        api_key=key,
    )
    # The parsed reply goes to stdout as indented JSON.
    print(json.dumps(answer, indent=4))


# Run the CLI only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	# Script to extract song/video title, artist, album, etc. metadata from screenshots w/ GPT-4.

	### Example Usage: ###############################################################################
	#
	# ➜ ~/Desktop # python3 music_in_screenshots.py --prompt=prompt.txt --attach=spotify_screenshot.PNG
	# {
	# "found_prominent_media": true,
	# "all_strings": [
	# "1:21",
	# "open.spotify.com",
	# "Spotify",
	# "OPEN APP",
	# "Deuter",
	# "Heartnotes 2.0 432 Hz - 24-bit digital",
	# "Steven Halpern, David Darling",
	# "Call Within (Instrumental Meditation ...",
	# "Manose",
	# "417 Hz - Undoing Emotional Patterns",
	# "Kev Thompson",
	# "Oceanic World of Atlas 3D",
	# "Dreamflute Doroth\u00e9e Fr\u00f6ller",
	# "Pure Tranquility (Theta Binaural)",
	# "NREM",
	# "Binaural Alpha Sinus 110Hz - 118Hz",
	# "Binaural Shapers",
	# "432Hz Miracle Tone: Expanding Wind...",
	# "PowerThoughts Meditation Club",
	# "111hz Michael: Victory from Fear",
	# "Ted Winslow",
	# "Listen With Bamboo Flute, ..."
	# ],
	# "title": "Call Within (Instrumental Meditation ...",
	# "artist": "Manose",
	# "album": null,
	# "now_playing_position": null,
	# "playback_time_remaining": null,
	# "total_duration": null,
	# "foreground_app": "browser",
	# "browser_active_url": "open.spotify.com",
	# "audio_out_device": null,
	# "operating_system": "ios",
	# "prominent_colors": [
	# "black",
	# "green",
	# "white"
	# ],
	# "description": "screenshot of iOS browser showing Spotify web player interface with a song paused",
	# "warnings": [
	# "could not determine complete song title due to truncation",
	# "could not determine now playing position, playback time remaining, or total duration from the image"
	# ]
	# }
	# ➜ ~/Desktop # python3 music_in_screenshots.py --prompt=prompt.txt --attach=playlist.PNG
	# {
	# "found_prominent_media": true,
	# "all_strings": [
	# "4:19",
	# "Across the Sea",
	# "Middle Sky Boom & Eliezer",
	# "Goodthing",
	# "Leon Vynehall",
	# "It's Just (House of Dupree)",
	# "Pier Children",
	# "Caint Use My Phone (Suite)",
	# "Erykah Badu",
	# "663 songs, 116 hours 42 minutes",
	# "Featured Artists",
	# "See All",
	# "Jordan Rakei",
	# "Sevdaliza",
	# "Big Muff",
	# "Butterflies (Demo Version)",
	# "Listen Now",
	# "Browse",
	# "Radio",
	# "Library",
	# "Search"
	# ],
	# "title": "Butterflies (Demo Version)",
	# "artist": null,
	# "album": null,
	# "now_playing_position": null,
	# "playback_time_remaining": null,
	# "total_duration": null,
	# "foreground_app": "applemusic",
	# "browser_active_url": null,
	# "audio_out_device": null,
	# "operating_system": "ios",
	# "prominent_colors": [
	# "black",
	# "red",
	# "purple",
	# "gray",
	# "white"
	# ],
	# "description": "screenshot of iOS Apple Music app with 'Butterflies (Demo Version)' paused at the bottom menu",
	# "warnings": [
	# "artist not visible for currently highlighted song",
	# "album not visible for currently highlighted song",
	# "playback position, time remaining, and total duration not visible"
	# ]
	# }
	########################################################################################

	import os
	import sys
	import json
	import base64
	import argparse
	import requests
	import pprint
	from pathlib import Path

	# Shared pretty-printer; call_openai_api() uses it to dump the raw API response when parsing fails.
	pp = pprint.PrettyPrinter(indent=4)



	# Placeholder key used when the OPENAI_API_KEY environment variable is unset or empty (see main()).
	DEFAULT_OPENAI_API_KEY = 'api-key-here'

	# Default value of the `max_tokens` field sent with each chat completion request.
	MAX_RESPONSE_TOKENS = 700

	# Instructions sent to the model when no --prompt file is supplied (main() falls back to this).
	# NOTE(review): the `\|` sequences below are not recognised Python escapes, so the backslashes
	# stay in the prompt text, and every line inside the literal starts with a tab. This looks like
	# paste damage — TODO confirm against an undamaged copy of the prompt before relying on it.
	DEFAULT_PROMPT = """\
	Find the now playing or most prominent song (or video). Respond using the JSON format below.

	Record all the raw strings that appear in the image in the "all_strings" field (stripping newlines).
	If any values are partially obscured, use them as-is and add a warning.
	If any values are fully obscured, unreadable, or unavailable, use `null` as their value and add a warning.

	Sometimes Bluetooth or Airplay devices names appear in images to indicate they are being used for audio out (e.g. headphones, speakers, cars, etc.).
	Don't confuse those device names with title/artist/album values, put any device name (if present) in the audio_out_device field only.

	{
	"found_prominent_media": true,
	"all_strings": ["all prominent strings", "seen in image", "9:25", "T-Mobile LTE", "23%", "Jazz", "The Beatles", "Yellow Submarine - EP", "Big Apple Records Ltd", "other songs/artists/albums visible", "fragments of titles/artists/timestamps", ...],
	"title": "detected now playing song or video title here" \| null,
	"artist": "detected now playing artist name here" \| null,
	"album": "detected now playing album name here" \| null,
	"now_playing_position": "hours:minutes:seconds" \| "3:59:59" \| "1:23" \| ... \| null,
	"playback_time_remaining": "-hours:minutes:seconds" \| "-1:32:03" \| "-1:23" \| ... \| null,
	"total_duration": "hours:minutes:seconds" \| "3:59:59" \| "4:59" \| ... \| null,
	"foreground_app": "lockscreen" \| "controlcenter" \| "browser" \| "youtube" \| "soundcloud" \| "instagram" \| "applemusic" \| "spotify" \| "shazam" \| "photos" \| "imessage" \| "signal" \| ... \| null,
	"browser_active_url": "youtube.com/watch?v=w_5K8dRt7Bs" \| "www.instagram.com" \| ... \| null,
	"audio_out_device": "AirPods Pro" \| "RAM Promaster" \| "Nickpods" \| "BrickBedroomTV" \| "Cardo" \| "Bathpod" \| "Apple TV" \| "PLT_BACKBEAT_PRO" \| "Minirig" \| ... \| null,
	"operating_system": "macos" \| "ios" \| "ipados" \| "windows" \| "android" \| ... \| null,
	"prominent_colors": ["blue", "white", "pink", ...],
	"description": "screenshot of iOS Apple Music app showing song playing" \| "screenshot of desktop macOS browser showing a Soundcloud mix" \| "picture of a physical vinyl record cover" \| ...,
	"warnings": ["title partially obscured by edge of screen", "not enough confidence to guess foreground app", "multiple songs seen in image, no particular song is selected", "album/artist inferred from different area than title", ...]
	}


	If no media is shown clearly playing in the UI, respond with an error response like so:

	{
	"found_prominent_media": false,
	"all_strings": ["all prominent strings", "depicted in image", "1:24", "AT&T 5G", "message recipient", "website text...", "other content...", ...],
	"foreground_app": "lockscreen" \| "browser" \| "messenger" \| "photos" \| "twitter" \| "maps" \| "mail" \| ... \| null,
	"browser_active_url": "www.instagram.com" \| "https://example.com" \| "chase.com" \| ... \| null,
	"audio_out_device": "AirPods Pro" \| "RAM Promaster" \| "Nickpods" \| "BrickBedroomTV" \| "Cardo" \| "Bathpod" \| "Apple TV" \| "PLT_BACKBEAT_PRO" \| "Minirig" \| ... \| null,
	"operating_system": "macos" \| "ios" \| "ipados" \| "windows" \| "android" \| ... \| null,
	"prominent_colors": ["red", "pink", "orange", ...],
	"description": "screenshot of an iPhone lockscreen with nothing playing" \| "screenshot of a browser showing a news article" \| "picture of a post-it note with a cat drawn on it" \| ...,
	"warnings": ["could not find any music or videos depicted", "low confidence in audio_out_device guess", "image contained non-English characters or symbols", "could not find any text at all in the image", ...]
	}
	"""

	def detect_mimetype(path: str) -> str:
	extension = Path(path or 'default.png').suffix.lower().strip('.').replace('jpg', 'jpeg')
	mimetype = f'image/{extension}'
	return mimetype


	def encode_image(image_path: str \| None) -> str:
	if not image_path:
	return None

	# print("[*] Encoding attachments into base64...")
	with open(image_path, "rb") as image_file:
	base64_image = base64.b64encode(image_file.read()).decode('utf-8')

	return base64_image


	def _extract_json_object(text: str) -> dict:
	    """Parse the span from the first ``{`` to the last ``}`` in *text* as JSON."""
	    # Cutting at the outermost braces already discards Markdown code fences and any
	    # chatter around the object, so no separate fence stripping is needed.
	    inner = text.split('{', 1)[-1].rsplit('}', 1)[0]
	    return json.loads('{' + inner + '}')


	def call_openai_api(
	    prompt: str,
	    image: str | None = None,
	    mimetype: str = 'image/png',
	    api_key: str = DEFAULT_OPENAI_API_KEY,
	    max_tokens: int = MAX_RESPONSE_TOKENS,
	    model: str = 'gpt-4-vision-preview',
	    timeout: float = 120.0,
	) -> dict:
	    """Send a prompt, plus an optional image, to the chat completions endpoint and parse the JSON reply.

	    Args:
	        prompt: Instruction text sent as the user message.
	        image: Base64-encoded image data, or ``None`` to send the prompt alone.
	        mimetype: MIME type used in the image's ``data:`` URL.
	        api_key: Bearer token placed in the ``Authorization`` header.
	        max_tokens: Value of the request's ``max_tokens`` field.
	        model: Value of the request's ``model`` field.
	            NOTE(review): confirm this default model id is still available.
	        timeout: Seconds to wait for the server before ``requests`` gives up.

	    Returns:
	        The JSON object found in the first choice's message content.

	    Raises:
	        requests.RequestException: If the HTTP request fails or times out.
	        Exception: Any error raised while reading or parsing the reply is
	            re-raised after the raw response has been pretty-printed.
	    """
	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {api_key}",
	    }

	    content = [{"type": "text", "text": prompt}]
	    if image is not None:
	        # Only attach an image part when there is image data; previously a missing
	        # image was sent as the bogus URL "data:<mimetype>;base64,None".
	        content.append({
	            "type": "image_url",
	            "image_url": {"url": f"data:{mimetype};base64,{image}"},
	        })

	    payload = {
	        "model": model,
	        "messages": [{"role": "user", "content": content}],
	        "max_tokens": max_tokens,
	    }

	    # An explicit timeout keeps the script from hanging forever on a stalled connection.
	    response = requests.post(
	        "https://api.openai.com/v1/chat/completions",
	        headers=headers,
	        json=payload,
	        timeout=timeout,
	    )
	    response_json = response.json()

	    try:
	        response_message = response_json["choices"][0]["message"]
	        parsed_json = _extract_json_object(response_message["content"])
	    except Exception:
	        # Show the raw response (e.g. an API error object) to help debugging, then re-raise unchanged.
	        pp.pprint(response_json)
	        raise

	    return parsed_json




	def main():
	    """Command-line entry point: parse arguments, query the API, print the parsed JSON."""
	    parser = argparse.ArgumentParser(description='Query ChatGPT-4 with an optional image attachment.')
	    parser.add_argument('--prompt', type=str, help='File path for the text prompt.')
	    parser.add_argument('--attach', type=str, help='File path for the image to attach.')
	    args = parser.parse_args()

	    # An unset or empty environment variable falls back to the module-level placeholder key.
	    api_key = os.getenv('OPENAI_API_KEY') or DEFAULT_OPENAI_API_KEY

	    prompt_text = Path(args.prompt).read_text() if args.prompt else DEFAULT_PROMPT
	    base64_image = encode_image(args.attach)
	    mimetype = detect_mimetype(args.attach)

	    # Call the API and print the parsed reply to stdout as indented JSON.
	    result = call_openai_api(prompt=prompt_text, image=base64_image, mimetype=mimetype, api_key=api_key)
	    print(json.dumps(result, indent=4))



	# Run the CLI only when this file is executed directly, not when it is imported.
	if __name__ == '__main__':
	    main()