Skip to content

Instantly share code, notes, and snippets.

@GabrielSGoncalves
Last active September 18, 2019 16:41
Show Gist options
  • Save GabrielSGoncalves/adf68fd40ae8b4b9e8644a8f80e02e4e to your computer and use it in GitHub Desktop.
Save GabrielSGoncalves/adf68fd40ae8b4b9e8644a8f80e02e4e to your computer and use it in GitHub Desktop.
Second part of the NLP analysis for the Medium article on AWS ML/AI tools
# 5) Create the S3 bucket that will receive the audio files
bucket_name = 'medium-nlp-aws'
client_s3 = boto3.client('s3')
client_s3.create_bucket(Bucket=bucket_name)

# 6) Upload each file listed in df_audio; the local file name is reused
#    unchanged as the object key inside the bucket
for audio_file in df_audio.filename.values:
    print(audio_file)
    client_s3.upload_file(
        Filename=audio_file,
        Bucket=bucket_name,
        Key=audio_file,
    )
# 7) Define the file URLs on the bucket using S3 convention for file paths
# The bucket location does not change between rows, so it is queried once
# with the existing client instead of building a new client and issuing a
# request per row. The URLs below do not use this value.
bucket_location = client_s3.get_bucket_location(Bucket=bucket_name)
for index, row in df_audio.iterrows():
    # Spaces in the file name are written as '+' in the object URL.
    object_url = f"https://{bucket_name}.s3.amazonaws.com/{row['filename'].replace(' ', '+')}"
    df_audio.at[index, 'url'] = object_url
    print(object_url)
# 8) Function to start Amazon Transcribe job
def start_transcription(bucket, job_name, file_url, wait_process=True, *,
                        media_format='mp3', language_code='en-US',
                        poll_interval=20):
    """Start an Amazon Transcribe job and optionally wait for it to end.

    Args:
        bucket: Bucket name passed to Transcribe as ``OutputBucketName``.
        job_name: Value passed as ``TranscriptionJobName``.
        file_url: Value passed as ``Media['MediaFileUri']``.
        wait_process: When true, poll the job until its status is
            ``COMPLETED`` or ``FAILED`` before returning.
        media_format: Value passed as ``MediaFormat`` (default ``'mp3'``).
        language_code: Value passed as ``LanguageCode`` (default ``'en-US'``).
        poll_interval: Seconds to sleep between status polls (default 20).

    Returns:
        The last ``get_transcription_job`` response when ``wait_process`` is
        true; ``None`` when the job is only started.
    """
    client_transcribe = boto3.client('transcribe')
    client_transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_url},
        MediaFormat=media_format,
        LanguageCode=language_code,
        OutputBucketName=bucket,
    )

    # Fire-and-forget mode: nothing was polled, so there is no status to return.
    if not wait_process:
        return None

    while True:
        status = client_transcribe.get_transcription_job(TranscriptionJobName=job_name)
        if status['TranscriptionJob']['TranscriptionJobStatus'] in ('COMPLETED', 'FAILED'):
            break
        print("Not ready yet...")
        time.sleep(poll_interval)
    # Reached for both COMPLETED and FAILED; callers must inspect the status.
    print('Transcription finished')
    return status
# 9) Iterate over the audio files URLs on S3 and call start_transcription
today = date.today().strftime("%d%m%Y")
for index, row in df_audio.iterrows():
    # Build the job name once: the printed name, the name sent to Transcribe
    # and the recorded transcript file name must all agree, because the
    # transcript is expected at "<job name>.json" in the output bucket.
    job_name = f'{index}_speech_{today}'
    transcript_key = f'{job_name}.json'
    print(job_name, row.url)
    # wait_process=False: jobs are only started here, not awaited.
    start_transcription(bucket_name, job_name, row.url, wait_process=False)
    df_audio.at[index, 'transcription_url'] = f"https://{bucket_name}.s3.amazonaws.com/{transcript_key}"
    df_audio.at[index, 'json_transcription'] = transcript_key
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment