Skip to content

Instantly share code, notes, and snippets.

@nmante
Created November 8, 2018 21:03
Show Gist options
  • Save nmante/d3c26cb514e49c61321b9c961cb87d2f to your computer and use it in GitHub Desktop.
Save nmante/d3c26cb514e49c61321b9c961cb87d2f to your computer and use it in GitHub Desktop.
Caption Converter
#!/usr/local/bin/python3
import boto3
import json
import logging
import subprocess as sp
import argparse
import os
def create_parser():
    """Build the command-line argument parser for the caption converter.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config``,
        ``--captions_info``, ``--execute``, ``--output_dir`` and
        ``--remove_files``.
    """
    parser = argparse.ArgumentParser(
        description='Use ffmpeg to convert .sub caption files to .srt files, and upload to s3'
    )
    parser.add_argument(
        '-c', '--config',
        help='JSON Configuration file with s3 access keys',
        default='config.json',
    )
    parser.add_argument(
        '-i', '--captions_info',
        help='JSON file containing caption key/filenames on s3',
        default='captions_info.json',
    )
    # Uploads are opt-in: without this flag the run only converts locally.
    parser.add_argument(
        '-e', '--execute',
        help='Set this flag to actually update the files on s3. If this is not set, s3 files will NOT be updated',
        action='store_true',
    )
    parser.add_argument(
        '-o', '--output_dir',
        help='The output directory to store the converted caption files. Directory must exist',
        default='/tmp',
    )
    parser.add_argument(
        '-r', '--remove_files',
        help='Set this flag to remove the converted caption files from your system',
        action='store_true',
    )
    return parser
class CaptionFixer:
    """Convert caption files referenced on S3 with ffmpeg and optionally re-upload them.

    For every transcript asset listed in the captions file, ffmpeg is run
    against the asset's public URL and the result is written into the output
    directory. When updates are enabled, the converted file is uploaded back
    to the same S3 key it was read from.
    """

    # Key names must match what ``_create_s3_client`` reads from the config.
    _REQUIRED_CREDENTIAL_KEYS = frozenset(['aws_access_key_id', 'aws_secret_access_key'])
    _REQUIRED_ASSET_KEYS = frozenset(['filename', 'storage_path'])

    def __init__(
        self,
        should_execute_updates=False,
        credentials_filename='config.json',
        json_captions_file='captions_info.json',
        s3_url='https://s3.amazonaws.com/video-api-prod',
        output_dir='/tmp',
        should_remove_files=False
    ):
        """Load configuration and prepare the S3 client.

        Args:
            should_execute_updates: when true, converted files are uploaded to S3.
            credentials_filename: JSON file holding the AWS access keys.
            json_captions_file: file with one JSON transcript asset per line.
            s3_url: base URL the caption files are downloaded from.
            output_dir: existing directory that receives the converted files.
            should_remove_files: when true, converted files are deleted after
                each asset has been processed.

        Raises:
            ValueError: if the credentials file lacks a required key.
            OSError: if either input file cannot be opened.
        """
        credentials = self._validate_and_load_credentials(credentials_filename)
        transcript_assets_info = self._validate_and_load_transcript_assets(json_captions_file)
        self._credentials = credentials
        self._transcript_assets_info = transcript_assets_info
        self._S3Client = self._create_s3_client(credentials)
        self._should_execute_updates = should_execute_updates
        self._url = s3_url
        self._output_captions_dir = output_dir
        self._should_remove_files = should_remove_files

    def _create_s3_client(self, credentials):
        """Return a boto3 S3 client, or ``None`` if it could not be created.

        Failure is logged rather than raised so that a conversion-only run,
        which never uploads, can still proceed.
        """
        try:
            return boto3.client(
                's3',
                aws_access_key_id=credentials['aws_access_key_id'],
                aws_secret_access_key=credentials['aws_secret_access_key'],
            )
        except Exception:
            logging.exception('Could not initialize s3 boto client')
            return None

    def _validate_and_load_credentials(self, credentials_filename):
        """Load the JSON credentials file and check the required keys.

        Returns:
            dict: the parsed credentials.

        Raises:
            ValueError: if any required key is missing.
        """
        with open(credentials_filename) as f:
            credentials = dict(json.load(f))
        missing_keys = self._REQUIRED_CREDENTIAL_KEYS - set(credentials)
        if missing_keys:
            message = 'Must have required keys in config file {}; missing {}'.format(
                sorted(self._REQUIRED_CREDENTIAL_KEYS), sorted(missing_keys)
            )
            logging.error(message)
            raise ValueError(message)
        return credentials

    def _validate_and_load_transcript_assets(self, json_captions_file):
        """Read transcript assets from a file with one JSON object per line.

        Blank lines are ignored. Lines that are not valid JSON objects, or
        that lack a required key, are logged with their line number and
        skipped so one bad record does not abort the whole run.

        Returns:
            list: the transcript asset dicts that passed validation.
        """
        transcript_assets = []
        with open(json_captions_file) as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                try:
                    transcript_asset = json.loads(line)
                except ValueError:
                    logging.error('Invalid JSON in transcript assets file. Line number %s', line_number)
                    continue
                if not isinstance(transcript_asset, dict):
                    logging.error('Transcript asset is not a JSON object. Line number %s', line_number)
                    continue
                missing_keys = self._REQUIRED_ASSET_KEYS - set(transcript_asset)
                if missing_keys:
                    # ``id`` is optional, so read it defensively for the message.
                    logging.error(
                        'Transcript asset %s missing required keys %s. Line number %s',
                        transcript_asset.get('id'), sorted(missing_keys), line_number
                    )
                    continue
                transcript_assets.append(transcript_asset)
        return transcript_assets

    def _convert_sub_caption_to_srt(self, storage_path, bucket='assets'):
        """Run ffmpeg on the remote caption file and store the result locally.

        The local file name is ``storage_path`` with every ``/`` replaced by
        ``-``, placed inside the output directory.
        NOTE(review): ffmpeg presumably infers the output format from that
        file name's extension - confirm the storage paths carry the intended one.

        Returns:
            str or None: path of the converted file, or ``None`` when ffmpeg
            could not be started or exited with a non-zero status.
        """
        url = '{}/{}/{}'.format(self._url, bucket, storage_path)
        tmp_caption_filename = os.path.join(
            self._output_captions_dir, storage_path.replace('/', '-')
        )
        # ``-y`` overwrites a file left by an earlier run instead of prompting.
        command = ['ffmpeg', '-y', '-i', url, tmp_caption_filename]
        try:
            # stdin is detached so ffmpeg can never block waiting for input;
            # stderr is merged because ffmpeg writes its diagnostics there.
            result = sp.run(command, stdin=sp.DEVNULL, stdout=sp.PIPE, stderr=sp.STDOUT)
        except (OSError, ValueError):
            logging.exception('Could not convert caption file %s', storage_path)
            return None
        if result.stdout:
            logging.info(result.stdout.decode('utf-8', errors='replace'))
        if result.returncode != 0:
            logging.error(
                'Could not convert caption file %s (ffmpeg exit status %s)',
                storage_path, result.returncode
            )
            return None
        logging.info('Converted caption file %s. Stored at tmp file', storage_path)
        return tmp_caption_filename

    def _update_previous_sub_file(self, tmp_caption_filename, storage_path, bucket='assets'):
        """Upload the converted file to S3, replacing the object at ``storage_path``.

        Returns:
            bool: ``True`` when the upload succeeded, ``False`` otherwise.
        """
        if self._S3Client is None:
            logging.error('Could not update caption file %s: no s3 client available', storage_path)
            return False
        try:
            with open(tmp_caption_filename, 'rb') as f:
                # Read once and upload the bytes: logging ``f.read()`` and then
                # passing ``f`` would send an already-exhausted stream.
                payload = f.read()
            logging.info(payload)
            self._S3Client.put_object(Body=payload, Bucket=bucket, Key=storage_path)
        except Exception:
            logging.exception('Could not update caption file %s', storage_path)
            return False
        return True

    def execute(self):
        """Convert every loaded transcript asset, then upload and clean up as configured."""
        for transcript_asset in self._transcript_assets_info:
            # Storage paths include the bucket name, which the URL and the
            # S3 key must not repeat.
            storage_path = transcript_asset['storage_path'].replace('assets/', '')
            tmp_caption_filename = self._convert_sub_caption_to_srt(storage_path)
            if not tmp_caption_filename:
                continue
            if self._should_execute_updates:
                self._update_previous_sub_file(tmp_caption_filename, storage_path)
            if self._should_remove_files:
                try:
                    os.remove(tmp_caption_filename)
                except OSError:
                    # A failed cleanup should not stop the remaining assets.
                    logging.exception(
                        'Could not remove converted caption file %s', tmp_caption_filename
                    )
def main():
    """Parse the command line and run the caption conversion."""
    # Without explicit configuration the root logger drops everything below
    # WARNING, which would hide all of the progress messages.
    logging.basicConfig(level=logging.INFO)
    args = create_parser().parse_args()
    fixer = CaptionFixer(
        should_execute_updates=args.execute,
        credentials_filename=args.config,
        json_captions_file=args.captions_info,
        output_dir=args.output_dir,
        should_remove_files=args.remove_files,
    )
    fixer.execute()
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment