WebVTT Generation from Google Video Intelligence API

Installation

Install the Google Cloud client library:

pip3 install google-cloud-videointelligence

Enable the Video Intelligence API for your project.
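
If you use the gcloud CLI, this can be done from the terminal (assumes gcloud is installed and configured for your project):

gcloud services enable videointelligence.googleapis.com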

Create a service account and download its JSON private key to a file called service_account.json. The service account needs access to the Video Intelligence API and read access to the GCS bucket holding the video.
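
One possible setup with the gcloud CLI (MY_PROJECT and the captioner account name are placeholders; the exact roles depend on your project, but the account needs at least read access to the GCS object, e.g. roles/storage.objectViewer):

    gcloud iam service-accounts create captioner
    gcloud projects add-iam-policy-binding MY_PROJECT \
        --member="serviceAccount:captioner@MY_PROJECT.iam.gserviceaccount.com" \
        --role="roles/storage.objectViewer"
    gcloud iam service-accounts keys create service_account.json \
        --iam-account=captioner@MY_PROJECT.iam.gserviceaccount.com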

Running

Upload a video to GCS, then run the CLI command. Subtitles are written to the local filesystem and printed to STDOUT. Note that transcription can take longer than the video's duration.
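
For example, to upload a local file with gsutil (the bucket name here is a placeholder):

gsutil cp myvideo.mp4 gs://mybucket/myvideo.mp4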

Example CLI:

./captioner.py \
    --video=gs://us-central1-cdn-test-files/static-mp4/stay_tuned.mp4 \
    --service_account=service_account.json \
    --out=stay_tuned.vtt

Fine-tuning subtitling

The is_break_point method has three tunable variables: max_time, min_num_of_words, and max_num_of_words. Modify them to fine-tune how the transcript is split into subtitles; one way to expose them is sketched below.
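For example, the thresholds could be hoisted into keyword arguments so they are easy to experiment with. This is a sketch only; the default values shown are the ones hardcoded in the script below, and is_punctuation refers to the helper defined there:

def is_break_point(word, diff, num_words_in_sentence,
                   max_time=3, min_num_of_words=2, max_num_of_words=14):
    """Returns True when the current caption should end after this word."""
    if diff >= max_time:
        # The caption has been on screen long enough.
        return True
    if is_punctuation(word[-1]) and num_words_in_sentence >= min_num_of_words:
        # Sentence boundary reached and the line is long enough to stand alone.
        return True
    if num_words_in_sentence >= max_num_of_words:
        # Hard cap on words per caption line.
        return True
    return False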

#!/usr/bin/env python3
# Copyright 2017 The Abseil Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Author: jimmymkude@ / bookman@
This script takes a Cloud Video Intelligence API speech transcription response
and converts it to WebVTT.
Modify is_break_point to change the max words / min words / max time per subtitle logic.
Before running, install the google-cloud-videointelligence client library:
`pip3 install google-cloud-videointelligence`
Auth is done through a service account, which needs Video Intelligence & GCS permissions.
Usage:
python3 captioner.py --video=<gcs url> --service_account=service_account.json --out=subtitle.vtt
Example:
./captioner.py \
--video=gs://us-central1-cdn-test-files/static-mp4/stay_tuned.mp4 \
--service_account=service_account.json \
--out=stay_tuned.vtt
"""
import argparse
import json
import math
from google.protobuf.json_format import MessageToJson
from google.cloud import videointelligence


def main():
    args = arg_parse()

    # Client is not thread safe.
    # However, one should re-use client within the
    # same thread, and avoid creating a new client
    # per user request.
    video_client = (videointelligence
                    .VideoIntelligenceServiceClient
                    .from_service_account_file(args.service_account))

    alternatives = transcribe(video_client, args.video)
    captions = break_down_transcriptions(alternatives)
    vtt = gen_vtt(captions)

    print(vtt)
    with open(args.out, 'w') as f:
        f.write(vtt)


def arg_parse():
    """Parses args from cli."""
    parser = argparse.ArgumentParser(description='WebVTT Caption Video in GCS.')
    parser.add_argument(
        '--video', type=str, required=True,
        help='gcs path to video, e.g: gs://mybucket/myvideo.mp4')
    parser.add_argument(
        '--service_account', type=str, required=True,
        help='path to service account, e.g: service_account.json')
    parser.add_argument(
        '--out', type=str, required=True,
        help='where to write vtt subtitles, e.g: subtitles.vtt')
    return parser.parse_args()


def transcribe(video_client, gcs_video_path, language='en-US'):
    """Transcribes the audio track of a video file in GCS."""
    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]
    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code=language,
        enable_automatic_punctuation=True)
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        gcs_video_path, features=features,
        video_context=video_context)
    result = operation.result(timeout=600)

    # There is only one annotation_result since only
    # one video is processed.
    return (result
            .annotation_results[0]
            .speech_transcriptions[0]
            .alternatives)


def duration_to_seconds(duration):
    """Converts Protobuf Duration into second float."""
    return duration.seconds + (duration.nanos * 1e-9)


def is_punctuation(char):
    if len(char) != 1:
        return False
    punctuations = {'.', ',', '!', '?', ':', ';'}
    return char in punctuations


def is_break_point(word, diff, num_words_in_sentence):
    """Returns True when the current caption should end after this word."""
    max_time = 3  # seconds
    min_num_of_words = 2
    max_num_of_words = 14

    last_char = word[-1]

    # Max time elapsed.
    if diff >= max_time:
        return True
    # End of sentence, and line length longer than min allowed.
    if is_punctuation(last_char) and num_words_in_sentence >= min_num_of_words:
        return True
    # More than max words per line.
    if num_words_in_sentence >= max_num_of_words:
        return True
    return False


def seconds_to_timestring(elapsed):
    """Formats a float of seconds as an H:M:S.mmm timestring."""
    seconds = elapsed % 60
    elapsed -= seconds
    hours = math.floor(elapsed / 60 / 60)
    elapsed -= hours * 60 * 60
    minutes = math.floor(elapsed / 60)
    return '{}:{}:{:.3f}'.format(hours, minutes, seconds)


def gen_vtt(captions):
    """Renders the caption list as a WebVTT document string."""
    out = 'WEBVTT\n\n'
    for caption in captions:
        start = seconds_to_timestring(caption['videoSegment']['startTimeOffset'])
        end = seconds_to_timestring(caption['videoSegment']['endTimeOffset'])
        out += '{} --> {}\n'.format(start, end)
        out += caption['transcript'] + '\n'
        out += '\n'
    return out


def break_down_transcriptions(alternatives):
    """Splits transcription alternatives into caption-sized chunks."""
    captions = []
    for alternative in alternatives:
        start_time = duration_to_seconds(alternative.words[0].start_time)
        transcript = ''
        num_words_in_segment = 0

        for word in alternative.words:
            if not transcript:
                start_time = duration_to_seconds(word.start_time)
            end_time = duration_to_seconds(word.end_time)
            diff = end_time - start_time

            transcript += word.word + ' '
            num_words_in_segment += 1

            if is_break_point(word.word, diff, num_words_in_segment):
                # Found break point.
                video_segment = {
                    'startTimeOffset': start_time,
                    'endTimeOffset': end_time
                }
                # Gets rid of unnecessary white space
                # at the end of a caption.
                if transcript[-1] == ' ':
                    transcript = transcript[:-1]
                caption = {
                    'transcript': transcript,
                    'confidence': alternative.confidence,
                    'videoSegment': video_segment,
                }
                captions.append(caption)
                transcript = ''
                num_words_in_segment = 0

        # Flush any trailing words that never hit a break point,
        # so the end of the transcript is not silently dropped.
        if transcript:
            captions.append({
                'transcript': transcript.rstrip(),
                'confidence': alternative.confidence,
                'videoSegment': {
                    'startTimeOffset': start_time,
                    'endTimeOffset': end_time,
                },
            })
    return captions


if __name__ == '__main__':
    main()
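
Note: seconds_to_timestring emits timestamps like 0:0:1.500, as in the sample output below. Many players tolerate this, but the WebVTT spec expects zero-padded two-digit fields (00:00:01.500). If a player rejects the file, a stricter formatter along these lines could be swapped in (a sketch, not part of the original script):

def seconds_to_timestring(elapsed):
    """Formats seconds as a zero-padded WebVTT timestamp, e.g. 00:00:01.500."""
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = elapsed % 60
    return '{:02d}:{:02d}:{:06.3f}'.format(hours, minutes, seconds)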
Example output from:

./captioner.py \
    --video=gs://us-central1-cdn-test-files/static-mp4/stay_tuned.mp4 \
    --service_account=service_account.json \
    --out=stay_tuned.vtt
---
WEBVTT
0:0:1.500 --> 0:0:4.500
Is this a scene from an action movie where this wild
0:0:4.500 --> 0:0:6.200
Chase actually happened?
0:0:6.300 --> 0:0:6.700
Hey guys,
0:0:6.700 --> 0:0:7.200
it's David.
0:0:7.200 --> 0:0:7.900
Dobrik. What's up?
0:0:7.900 --> 0:0:8.600
Dave? Good to see you,
0:0:8.600 --> 0:0:9.100
dude. Hey,
0:0:10.400 --> 0:0:12.200
yeah. Well you've been doing this for a year now,
0:0:12.200 --> 0:0:13.900
so might as well take a little bit of a break.
0:0:14.000 --> 0:0:17.300
I got it from here right on I'll see you later plus
0:0:17.300 --> 0:0:19.600
how one sign and Harry Styles totally changed.
0:0:19.600 --> 0:0:20.600
This girl's life.
0:0:21.400 --> 0:0:24.500
This is seriously nuts Las Vegas Police
0:0:24.500 --> 0:0:28.000
rolled up on these two suspected Killers driving in that black SUV,
0:0:28.000 --> 0:0:31.000
but when they try to stop it all hell broke loose
0:0:32.200 --> 0:0:35.300
those guys raced off shooting back at the officer and at times
0:0:35.300 --> 0:0:38.700
the cop shot back at them blasting bullets right through the windshield
0:0:38.700 --> 0:0:41.000
dozens of shots while flying back and forth.
0:0:41.000 --> 0:0:44.100
It all ended with those guys finally crashing into a school their
0:0:44.100 --> 0:0:47.900
car was all shot up and one of those suspects died those
0:0:47.900 --> 0:0:49.700
soccer players who were stuck in a tie case.
0:0:49.700 --> 0:0:52.700
They thought they were hallucinating during the rescue and now they
0:0:52.700 --> 0:0:54.600
want to get ordained as monks together.
0:0:54.800 --> 0:0:57.800
This deadly wild fire is still burning near Yosemite.
0:0:57.800 --> 0:0:58.800
It's already torn through.
0:0:58.800 --> 0:0:59.800
12,000 Acres.