# GabrielSGoncalves/nlp_aws_medium_part2.py

## nlp_aws_medium_part2.py
# 5) Create the S3 bucket that will receive the audio files
bucket_name = 'medium-nlp-aws'
client_s3 = boto3.client(service_name='s3')
client_s3.create_bucket(Bucket=bucket_name)


# 6) Upload every audio file to the bucket, using the local file name as the object key
for audio_file in df_audio.filename.values:
    print(audio_file)
    client_s3.upload_file(Filename=audio_file, Bucket=bucket_name, Key=audio_file)

# 7) Define the file URLs on the bucket using S3 convention for file paths
# The URL is built only from the bucket name and the object key, so no S3 API
# call is made here (the previous version created a new client and fetched the
# bucket location for every row without ever using the result).
for index, row in df_audio.iterrows():
    # Spaces in the file name are written as '+' in the object URL.
    object_url = f"https://{bucket_name}.s3.amazonaws.com/{row['filename'].replace(' ', '+')}"
    df_audio.at[index, 'url'] = object_url
    print(object_url)

# 8) Function to start Amazon Transcribe job
def start_transcription(bucket, job_name, file_url, wait_process=True,
                        media_format='mp3', language_code='en-US',
                        poll_interval=20):
    """Start an Amazon Transcribe job and optionally wait for it to end.

    Args:
        bucket: Name of the S3 bucket passed as ``OutputBucketName``.
        job_name: Name given to the transcription job.
        file_url: URI of the media file, passed as ``MediaFileUri``.
        wait_process: When true, poll the job until its status is
            ``COMPLETED`` or ``FAILED`` before returning.
        media_format: Value sent as ``MediaFormat`` (default ``'mp3'``).
        language_code: Value sent as ``LanguageCode`` (default ``'en-US'``).
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The last ``get_transcription_job`` response when ``wait_process`` is
        true, otherwise ``None``.
    """
    client_transcribe = boto3.client('transcribe')
    client_transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_url},
        MediaFormat=media_format,
        LanguageCode=language_code,
        OutputBucketName=bucket)
    if not wait_process:
        return None

    # NOTE: there is no upper bound on the wait; the loop only ends once the
    # service reports one of the two terminal statuses.
    while True:
        status = client_transcribe.get_transcription_job(TranscriptionJobName=job_name)
        job_status = status['TranscriptionJob']['TranscriptionJobStatus']
        if job_status in ('COMPLETED', 'FAILED'):
            break
        print("Not ready yet...")
        time.sleep(poll_interval)

    # A failed job must not be reported as a finished transcription.
    if job_status == 'FAILED':
        print('Transcription failed:', status['TranscriptionJob'].get('FailureReason'))
    else:
        print('Transcription finished')
    return status


# 9) Iterate over the audio files URLs on S3 and call start_transcription
today = date.today().strftime("%d%m%Y")
for index, row in df_audio.iterrows():
    # Amazon Transcribe names its output object "<job name>.json", so the job
    # name is built once and reused for the recorded output locations. The
    # previous job name carried an extra "_{var}" suffix (with `var` not
    # defined in this script) that the recorded locations did not include.
    job_name = f'{index}_speech_{today}'
    print(job_name, row.url)
    start_transcription(bucket_name, job_name, row.url, wait_process=False)
    df_audio.at[index, 'transcription_url'] = f"https://{bucket_name}.s3.amazonaws.com/{job_name}.json"
    df_audio.at[index, 'json_transcription'] = f"{job_name}.json"
	# 5) Creating a new S3 bucket to upload the audio files
# NOTE(review): from this point on the file repeats steps 5-9 that already
# appear earlier in it; confirm whether this second copy is intended.
# The statements below are top-level code, so they start at column 0.
bucket_name = 'medium-nlp-aws'
client_s3 = boto3.client('s3')
client_s3.create_bucket(Bucket=bucket_name)


	# 6) Uploading the files to the created bucket
# Each file is uploaded under its local file name as the object key.
for audio_file in df_audio.filename.values:
    print(audio_file)
    client_s3.upload_file(audio_file, bucket_name, audio_file)

	# 7) Define the file URLs on the bucket using S3 convention for file paths
# The URL is built only from the bucket name and the object key, so no S3 API
# call is made here (the previous version created a new client and fetched the
# bucket location for every row without ever using the result).
for index, row in df_audio.iterrows():
    # Spaces in the file name are written as '+' in the object URL.
    object_url = f"https://{bucket_name}.s3.amazonaws.com/{row['filename'].replace(' ', '+')}"
    df_audio.at[index, 'url'] = object_url
    print(object_url)

	# 8) Function to start Amazon Transcribe job
def start_transcription(bucket, job_name, file_url, wait_process=True,
                        media_format='mp3', language_code='en-US',
                        poll_interval=20):
    """Start an Amazon Transcribe job and optionally wait for it to end.

    Args:
        bucket: Name of the S3 bucket passed as ``OutputBucketName``.
        job_name: Name given to the transcription job.
        file_url: URI of the media file, passed as ``MediaFileUri``.
        wait_process: When true, poll the job until its status is
            ``COMPLETED`` or ``FAILED`` before returning.
        media_format: Value sent as ``MediaFormat`` (default ``'mp3'``).
        language_code: Value sent as ``LanguageCode`` (default ``'en-US'``).
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The last ``get_transcription_job`` response when ``wait_process`` is
        true, otherwise ``None``.
    """
    client_transcribe = boto3.client('transcribe')
    client_transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_url},
        MediaFormat=media_format,
        LanguageCode=language_code,
        OutputBucketName=bucket)
    if not wait_process:
        return None

    # NOTE: there is no upper bound on the wait; the loop only ends once the
    # service reports one of the two terminal statuses.
    while True:
        status = client_transcribe.get_transcription_job(TranscriptionJobName=job_name)
        job_status = status['TranscriptionJob']['TranscriptionJobStatus']
        if job_status in ('COMPLETED', 'FAILED'):
            break
        print("Not ready yet...")
        time.sleep(poll_interval)

    # A failed job must not be reported as a finished transcription.
    if job_status == 'FAILED':
        print('Transcription failed:', status['TranscriptionJob'].get('FailureReason'))
    else:
        print('Transcription finished')
    return status


	# 9) Iterate over the audio files URLs on S3 and call start_transcription
today = date.today().strftime("%d%m%Y")
for index, row in df_audio.iterrows():
    # Amazon Transcribe names its output object "<job name>.json", so the job
    # name is built once and reused for the recorded output locations. The
    # previous job name carried an extra "_{var}" suffix (with `var` not
    # defined in this script) that the recorded locations did not include.
    job_name = f'{index}_speech_{today}'
    print(job_name, row.url)
    start_transcription(bucket_name, job_name, row.url, wait_process=False)
    df_audio.at[index, 'transcription_url'] = f"https://{bucket_name}.s3.amazonaws.com/{job_name}.json"
    df_audio.at[index, 'json_transcription'] = f"{job_name}.json"