Skip to content

Instantly share code, notes, and snippets.

@JGalego
Last active May 22, 2024 17:41
Show Gist options
  • Save JGalego/469ec3bf542aa502770d3975181d9c1b to your computer and use it in GitHub Desktop.
DescribeForMe // Bedrock Edition ⛰️
"""
Describe for Me (Bedrock edition ⛰️)
Image ==> Bedrock -> Translate -> Polly ==> Speech
"""
import argparse
import base64
import json
import boto3
# ---- Command-line interface ----
parser = argparse.ArgumentParser(
    prog='Describe for Me (Bedrock Edition) ⛰️',
    description="""
Describe for Me is an 'image-to-speech' app
built on top of AI services that was created
to help the visually impaired understand images
through lively audio captions.
""",
)
parser.add_argument('image',
                    help="path to the input image file")
parser.add_argument('-m', '--model',
                    default="anthropic.claude-3-sonnet-20240229-v1:0",
                    help="Bedrock model ID used to generate the image description")
parser.add_argument('-t', '--translate',
                    help="Amazon Translate target language code (e.g. 'pt'); "
                         "omit to keep the English description")
parser.add_argument('-v', '--voice',
                    help="Amazon Polly voice ID (e.g. 'Joanna'); "
                         "omit to skip speech synthesis")
args = parser.parse_args()
# Initialize AWS service clients (region and credentials come from the
# standard boto3 configuration chain: env vars, ~/.aws/config, IAM role, ...)
bedrock = boto3.client("bedrock-runtime")  # Bedrock runtime: model inference
translate = boto3.client("translate")      # Amazon Translate: text translation
polly = boto3.client("polly")              # Amazon Polly: text-to-speech
# Process image: read it and base64-encode the raw bytes for the Messages API
with open(args.image, "rb") as image_file:
    image = base64.b64encode(image_file.read()).decode("utf8")

# Derive a valid IANA media type from the file extension.  Claude only
# accepts image/jpeg, image/png, image/gif and image/webp, so a ".jpg"
# (or ".JPG") extension must be normalized to "jpeg".
extension = args.image.split('.')[-1].lower()
media_type = f"image/{'jpeg' if extension == 'jpg' else extension}"

# Generate a description of the image via the Anthropic Messages API
response = bedrock.invoke_model(
    modelId=args.model,
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [{
            "role": "user",
            "content": [{
                "type": "text",
                "text": "Describe the image for me.",
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": image,
                },
            }],
        }],
    }),
)

# The streaming body holds a JSON document; join all returned text
# segments into a single description string.
result = json.loads(response.get("body").read())
description = " ".join([output["text"] for output in result.get("content", [])])
print(f"##### Original #####\n\n{description}\n")
# Optionally translate the (English) description into the target language
if args.translate:
    translation = translate.translate_text(
        Text=description,
        SourceLanguageCode='en',
        TargetLanguageCode=args.translate,
    )
    description = translation['TranslatedText']
    print(f"##### Translation #####\n\n{description}\n")
# Read the description back to the user
if args.voice:
    # Translate -> Polly language map.  Polly expects a locale code
    # (e.g. "en-US"), not a bare Translate language code; codes not in
    # the map are passed through unchanged.
    lang_map = {
        'en': 'en-US',
        'pt': 'pt-BR'
    }
    # When no translation was requested, args.translate is None and the
    # description is still English -- default to 'en' so Polly never
    # receives LanguageCode=None.
    source_lang = args.translate or 'en'
    lang_code = lang_map.get(source_lang, source_lang)

    # Synthesize audio description
    print(f"\nSynthesizing audio description\n> Language: {lang_code}\n> Voice: {args.voice}\n")
    audio_data = polly.synthesize_speech(
        Engine="neural",  # hardcoded, because we want natural-sounding voices only
        LanguageCode=lang_code,
        OutputFormat="mp3",
        Text=description,
        TextType='text',
        VoiceId=args.voice,
    )

    # Save audio description next to the input image.  '.'.join preserves
    # interior dots ("my.photo.jpg" -> "my.photo.mp3"); the previous
    # ''.join collapsed them ("myphoto.mp3").
    with open(f"{'.'.join(args.image.split('.')[:-1])}.mp3", 'wb') as audio_file:
        audio_file.write(audio_data['AudioStream'].read())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment