Skip to content

Instantly share code, notes, and snippets.

@sbassi
Created July 18, 2020 23:51
Show Gist options
  • Save sbassi/00a7b07666ffcd9f22514703d5f47b81 to your computer and use it in GitHub Desktop.
Save sbassi/00a7b07666ffcd9f22514703d5f47b81 to your computer and use it in GitHub Desktop.
import time
from glob import glob
import uuid
import json
import requests
import boto3
# Extension of the local media files to pick up; also sent as MediaFormat.
FILE_TYPE = 'mp4'

# Language code handed to the transcription job as LanguageCode.
LANGUAGE_CODE = 'es-ES'

# Prefix prepended to each local file name to form the job's MediaFileUri.
BASE_URI = 'https://s3-us-west-1.amazonaws.com/XXXXX/newmp4/'
def save_text(cnt, file_name):
    """Extract the transcript text from a JSON result and write it to a file.

    Args:
        cnt: JSON document (``str`` or ``bytes``) holding the transcript at
            ``results -> transcripts -> [0] -> transcript``.
        file_name: Path of the file to create (or overwrite) with the text.

    Raises:
        json.JSONDecodeError: If ``cnt`` is not valid JSON.
        KeyError, IndexError: If the expected transcript entry is missing.
    """
    payload = json.loads(cnt)
    transcript = payload['results']['transcripts'][0]['transcript']
    # Write UTF-8 explicitly: the platform default encoding may not be able to
    # represent accented characters, which makes the output locale-dependent
    # or raises UnicodeEncodeError on non-English transcripts.
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(transcript)
transcribe = boto3.client('transcribe')

# Collect every file of the configured type in the current directory. Each one
# is submitted under the URI BASE_URI + <file name>, so the media is expected
# to be reachable there already.
allfiles = glob('*.{}'.format(FILE_TYPE))

for file_ in allfiles:
    job_uri = BASE_URI + file_
    # The UUID suffix gives a distinct job name every time a file is submitted.
    job_name = '{}-{}'.format(file_, uuid.uuid1())
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat=FILE_TYPE,
        LanguageCode=LANGUAGE_CODE,
    )

    # Poll every 15 seconds until the job reaches a terminal state.
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        job = status['TranscriptionJob']
        if job['TranscriptionJobStatus'] in ('COMPLETED', 'FAILED'):
            break
        print("Not ready yet...")
        time.sleep(15)

    # A failed job has no transcript to download. Report it and move on to the
    # next file instead of dying with KeyError on the missing 'Transcript'
    # entry and abandoning the remaining files.
    if job['TranscriptionJobStatus'] == 'FAILED':
        print("Transcription failed for {}: {}".format(
            file_, job.get('FailureReason', 'no reason given')))
        continue

    url = job['Transcript']['TranscriptFileUri']
    # Bound the download time and surface HTTP errors here, rather than
    # hanging indefinitely or feeding an error page to the JSON parser.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    cnt = response.content
    # NOTE: the output keeps the '.json' suffix for compatibility, although
    # save_text() stores only the plain transcript text in it.
    save_text(cnt, file_ + '.json')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment