Last active
May 22, 2024 17:41
-
-
Save JGalego/469ec3bf542aa502770d3975181d9c1b to your computer and use it in GitHub Desktop.
DescribeForMe // Bedrock Edition ⛰️
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Describe for Me (Bedrock edition ⛰️)
Image ==> Bedrock -> Translate -> Polly ==> Speech
"""
import argparse
import base64
import json

import boto3

# Parse command-line arguments
parser = argparse.ArgumentParser(
    prog='Describe for Me (Bedrock Edition) ⛰️',
    description="""
    Describe for Me is an 'image-to-speech' app
    built on top of AI services that was created
    to help the visually impaired understand images
    through lively audio captions.
    """,
)
parser.add_argument('image',
                    help="path to the image file to describe")
parser.add_argument('-m', '--model',
                    default="anthropic.claude-3-sonnet-20240229-v1:0",
                    help="Bedrock model ID used to generate the image description")
parser.add_argument('-t', '--translate',
                    help="Amazon Translate target language code, e.g. 'pt' (optional)")
parser.add_argument('-v', '--voice',
                    help="Amazon Polly voice ID used to read the description aloud (optional)")
args = parser.parse_args()
# Initialize AWS service clients (credentials and region come from the
# standard boto3 configuration chain: env vars, config files, instance role)
bedrock = boto3.client("bedrock-runtime")
translate = boto3.client("translate")
polly = boto3.client("polly")

# Process image: the Bedrock Anthropic API expects the raw bytes as base64 text
with open(args.image, "rb") as image_file:
    image = base64.b64encode(image_file.read()).decode("utf8")

# Derive a valid IANA media type from the file extension.
# NOTE(fix): "image/jpg" is not a valid media type and is rejected by the
# Bedrock Anthropic API — normalize jpg -> jpeg and lowercase the extension.
extension = args.image.rsplit('.', 1)[-1].lower()
media_type = f"image/{'jpeg' if extension == 'jpg' else extension}"

# Generate a description of the image (Anthropic Messages API request shape)
response = bedrock.invoke_model(
    modelId=args.model,
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "messages": [{
            "role": "user",
            "content": [{
                "type": "text",
                "text": "Describe the image for me.",
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": image,
                },
            }],
        }],
    }),
)
result = json.loads(response.get("body").read())
# The model may return several content blocks; stitch their text together
description = " ".join([output["text"] for output in result.get("content", [])])
print(f"##### Original #####\n\n{description}\n")

# Translate the description to a target language
if args.translate:
    description = translate.translate_text(
        Text=description,
        SourceLanguageCode='en',
        TargetLanguageCode=args.translate
    )['TranslatedText']
    print(f"##### Translation #####\n\n{description}\n")

# Read the description back to the user
if args.voice:
    # Translate -> Polly language map
    lang_map = {
        'en': 'en-US',
        'pt': 'pt-BR'
    }
    # NOTE(fix): when --voice is given without --translate, args.translate is
    # None and Polly would reject LanguageCode=None — fall back to US English,
    # which matches the untranslated (English) description.
    if args.translate:
        lang_code = lang_map.get(args.translate, args.translate)
    else:
        lang_code = 'en-US'
    # Synthesize audio description
    print(f"\nSynthesizing audio description\n> Language: {lang_code}\n> Voice: {args.voice}\n")
    audio_data = polly.synthesize_speech(
        Engine="neural",  # hardcoded, because we want natural-sounding voices only
        LanguageCode=lang_code,
        OutputFormat="mp3",
        Text=description,
        TextType='text',
        VoiceId=args.voice,
    )
    # Save the audio next to the source image, e.g. photo.jpg -> photo.mp3.
    # NOTE(fix): rsplit('.', 1) keeps interior dots intact (a.b.jpg -> a.b.mp3),
    # where the old ''.join(split('.')[:-1]) mangled them (-> ab.mp3).
    with open(f"{args.image.rsplit('.', 1)[0]}.mp3", 'wb') as audio_file:
        audio_file.write(audio_data['AudioStream'].read())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment