Skip to content

Instantly share code, notes, and snippets.

@JGalego
Last active May 22, 2024 17:41
Show Gist options
  • Save JGalego/469ec3bf542aa502770d3975181d9c1b to your computer and use it in GitHub Desktop.
DescribeForMe // Bedrock Edition ⛰️
"""
Describe for Me (Bedrock edition ⛰️)
Image ==> Bedrock -> Translate -> Polly ==> Speech
"""
import argparse
import base64
import json
import boto3
# ---- Command-line interface ----
parser = argparse.ArgumentParser(
    prog='Describe for Me (Bedrock Edition) ⛰️',
    description="""
Describe for Me is an 'image-to-speech' app
built on top of AI services that was created
to help the visually impaired understand images
through lively audio captions.
""",
)
parser.add_argument('image',
                    help="path to the input image file")
parser.add_argument('-m', '--model',
                    default="anthropic.claude-3-sonnet-20240229-v1:0",
                    help="Bedrock model ID used to generate the image description")
parser.add_argument('-t', '--translate',
                    help="Amazon Translate target language code (e.g. 'pt'); "
                         "omit to keep the English description")
parser.add_argument('-v', '--voice',
                    help="Amazon Polly voice ID (e.g. 'Joanna'); "
                         "omit to skip speech synthesis")
args = parser.parse_args()
# Initialize AWS service clients (region and credentials come from the
# standard boto3 configuration chain: env vars, ~/.aws/config, IAM role, ...)
bedrock = boto3.client("bedrock-runtime")  # Bedrock runtime: model inference
translate = boto3.client("translate")      # Amazon Translate: text translation
polly = boto3.client("polly")              # Amazon Polly: text-to-speech
# Process image: read it and base64-encode the raw bytes for the Messages API
with open(args.image, "rb") as image_file:
    image = base64.b64encode(image_file.read()).decode("utf8")

# Derive a valid IANA media type from the file extension.  Claude only
# accepts image/jpeg, image/png, image/gif and image/webp, so a ".jpg"
# (or ".JPG") extension must be normalized to "jpeg".
extension = args.image.split('.')[-1].lower()
media_type = f"image/{'jpeg' if extension == 'jpg' else extension}"

# Generate a description of the image via the Anthropic Messages API
response = bedrock.invoke_model(
    modelId=args.model,
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [{
            "role": "user",
            "content": [{
                "type": "text",
                "text": "Describe the image for me.",
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": image,
                },
            }],
        }],
    }),
)

# The streaming body holds a JSON document; join all returned text
# segments into a single description string.
result = json.loads(response.get("body").read())
description = " ".join([output["text"] for output in result.get("content", [])])
print(f"##### Original #####\n\n{description}\n")
# Optionally translate the (English) description into the target language
if args.translate:
    translation = translate.translate_text(
        Text=description,
        SourceLanguageCode='en',
        TargetLanguageCode=args.translate,
    )
    description = translation['TranslatedText']
    print(f"##### Translation #####\n\n{description}\n")
# Read the description back to the user
if args.voice:
    # Translate -> Polly language map.  Polly expects a locale code
    # (e.g. "en-US"), not a bare Translate language code; codes not in
    # the map are passed through unchanged.
    lang_map = {
        'en': 'en-US',
        'pt': 'pt-BR'
    }
    # When no translation was requested, args.translate is None and the
    # description is still English -- default to 'en' so Polly never
    # receives LanguageCode=None.
    source_lang = args.translate or 'en'
    lang_code = lang_map.get(source_lang, source_lang)

    # Synthesize audio description
    print(f"\nSynthesizing audio description\n> Language: {lang_code}\n> Voice: {args.voice}\n")
    audio_data = polly.synthesize_speech(
        Engine="neural",  # hardcoded, because we want natural-sounding voices only
        LanguageCode=lang_code,
        OutputFormat="mp3",
        Text=description,
        TextType='text',
        VoiceId=args.voice,
    )

    # Save audio description next to the input image.  '.'.join preserves
    # interior dots ("my.photo.jpg" -> "my.photo.mp3"); the previous
    # ''.join collapsed them ("myphoto.mp3").
    with open(f"{'.'.join(args.image.split('.')[:-1])}.mp3", 'wb') as audio_file:
        audio_file.write(audio_data['AudioStream'].read())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment