Skip to content

Instantly share code, notes, and snippets.

@tucan9389
Created May 27, 2019 01:39
Show Gist options
  • Save tucan9389/84c1fea7d35fc2c653243b759b51ecb7 to your computer and use it in GitHub Desktop.
Save tucan9389/84c1fea7d35fc2c653243b759b51ecb7 to your computer and use it in GitHub Desktop.
This script is a simple audio recognition using google's Cloud Speech-to-Text API
#!/usr/bin/env python
# This script performs simple speech recognition using Google's Cloud Speech-to-Text API.
# It can transcribe long audio or video (over 1 minute; in my case, a 60-minute video).
# Prerequisite libraries
# - ffmpeg
# - google-cloud-speech
# My test
# - recognize 60 minute video(.mp4)
# How to run
# 1. install anaconda and create a virtual env
# 2. install prerequisites on the virtual env
# 3. configure the credentials (key) for google-cloud-speech
# 4. run this script
# 5. the transcript is written next to the input file (same path, with a .txt extension)
# Example usage:
# /Users/doyounggwak/anaconda3/envs/tts-env/bin/python main.py
# ================================================ #
# ================================================ #
# ================================================ #
# Path of the video or audio file to transcribe. An ".mp4" input is first
# converted to ".mp3" by the main section of this script.
main_audio_file = "audio/21th.mp4" # input video or audio file path
# Length of each audio segment, passed to ffmpeg's -segment_time option.
# NOTE(review): presumably kept just under 60 seconds because of the API's
# limit on audio sent inline in a single request -- confirm before changing.
split_durection = "58" # I recommend not changing this
# ================================================ #
# ================================================ #
# ================================================ #
# Based on Google's sample code for google-cloud-speech:
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/speech/cloud-client/transcribe_async.py
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
# [START speech_transcribe_async]
def transcribe_file(speech_file, language_code='ko-KR', timeout=90):
    """Transcribe one FLAC audio file with Cloud Speech-to-Text.

    The file's bytes are sent inline in a long-running recognition request
    and the call blocks until the operation completes or ``timeout`` seconds
    have passed. Every collected transcript is also printed to stdout.

    Args:
        speech_file: Path of the FLAC file to transcribe.
        language_code: Language of the speech, as accepted by
            ``RecognitionConfig`` (defaults to Korean, 'ko-KR').
        timeout: Number of seconds to wait for the operation's result.

    Returns:
        A list holding the most likely transcript of each consecutive
        portion of the audio, followed by one empty string. The empty
        string produces a blank separator line when the caller writes the
        entries out one per line.

    Raises:
        Whatever the client library raises when the request fails or the
        operation does not finish within ``timeout``.
    """
    # Imported lazily so the module can be loaded without the client library.
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    # The sample rate is left unset, so it is not forced to a fixed value.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language_code)

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=timeout)

    my_result = []
    # Each result covers a consecutive portion of the audio; iterate through
    # them to collect the transcript of the whole file.
    for result in response.results:
        if not result.alternatives:
            # Nothing was recognized for this portion; skip it instead of
            # failing with an IndexError.
            continue
        # The first alternative is the most likely one for this portion.
        transcript = result.alternatives[0].transcript
        print(transcript)
        my_result.append(transcript)

    # Trailing empty entry: becomes a blank line between audio segments.
    my_result.append("")
    return my_result
# [END speech_transcribe_async]
def _run_ffmpeg(*args):
    """Run ``ffmpeg`` with the given arguments and raise if it fails.

    The arguments are passed as a list without going through a shell, so
    paths containing spaces or shell metacharacters are handled correctly.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        OSError: if the ffmpeg executable cannot be started.
    """
    import subprocess

    cmd = ["ffmpeg"] + list(args)
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)


if __name__ == '__main__':
    import glob

    # Video input: extract the audio track into an mp3 next to the source.
    if os.path.splitext(main_audio_file)[-1] == ".mp4":
        audio_path = os.path.splitext(main_audio_file)[0] + ".mp3"
        _run_ffmpeg("-i", main_audio_file, audio_path)
        main_audio_file = audio_path

    # Segments go into a directory named after the audio file (no extension).
    splited_audios_path = os.path.splitext(main_audio_file)[0]
    main_audio_file_name = os.path.basename(splited_audios_path)
    # Raises FileExistsError if the directory already exists, which also keeps
    # segments left over from a previous run out of the globs below.
    os.mkdir(splited_audios_path)

    # Split the audio into fixed-length mp3 segments without re-encoding.
    segment_prefix = os.path.join(splited_audios_path, main_audio_file_name)
    _run_ffmpeg(
        "-i", main_audio_file,
        "-f", "segment",
        "-segment_time", str(split_durection),
        "-c", "copy",
        segment_prefix + "_%03d.mp3",
    )

    # Escape the prefix so characters such as "[" in the path are not
    # interpreted as glob wildcards.
    glob_prefix = glob.escape(segment_prefix)

    # Convert every mp3 segment to single-channel FLAC, the format that
    # transcribe_file() sends to the API.
    for mp3_file in sorted(glob.glob(glob_prefix + "_*.mp3")):
        _run_ffmpeg("-i", mp3_file, "-ac", "1",
                    os.path.splitext(mp3_file)[0] + ".flac")

    # Sorted so that the segments are transcribed in chronological order.
    flac_files = sorted(glob.glob(glob_prefix + "_*.flac"))
    print(flac_files)
    print("\n\n")

    # Recognize each segment and write its transcripts, one per line.
    # UTF-8 is set explicitly so non-ASCII (e.g. Korean) transcripts do not
    # depend on the platform's default encoding; the with-block closes the
    # file even if a request fails midway.
    total = len(flac_files)
    with open(splited_audios_path + ".txt", "w", encoding="utf-8") as text_file:
        for index, flac_file in enumerate(flac_files, start=1):
            print("--------> {}/{} <--------".format(index, total), flac_file)
            for line in transcribe_file(flac_file):
                text_file.write(line)
                text_file.write("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment