# tucan9389/simple-audio-recognition.py

## simple-audio-recognition.py
#!/usr/bin/env python

# This script performs simple audio recognition using Google's Cloud Speech-to-Text API.
# It can recognize long audio or video (over 1 minute; in my case, a 60-minute video).

# Prerequisite libraries
# - ffmpeg
# - google-cloud-speech

# My test
# - recognize 60 minute video(.mp4)

# How to run
# 1. install anaconda and create a virtual env
# 2. install prerequisites on the virtual env
# 3. configure the credentials (key) for google-cloud-speech
# 4. run this script
# 5. you can find the result next to the input file (same path, with a .txt extension)

# Example usage:
# /Users/doyounggwak/anaconda3/envs/tts-env/bin/python main.py


# ================================================ #
# ================================================ #
# ================================================ #

# Input video or audio file to transcribe. A ".mp4" input has its audio
# extracted to ".mp3" before splitting.
main_audio_file = "audio/21th.mp4"  # input video or audio file path
# Length of each segment, in seconds, handed to ffmpeg's -segment_time option.
# Kept as a string because it is passed on the ffmpeg command line.
# NOTE(review): presumably chosen to keep every chunk just under a one-minute
# request limit -- confirm before changing.
split_durection = "58"              # changing this is not recommended

# ================================================ #
# ================================================ #
# ================================================ #


# Based on Google's baseline sample code for google-cloud-speech:
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/speech/cloud-client/transcribe_async.py


# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

# [START speech_transcribe_async]
def transcribe_file(speech_file, language_code='ko-KR', timeout=90):
    """Transcribe a local audio file with the Cloud Speech-to-Text API.

    The file is read fully into memory, sent inline in a
    ``long_running_recognize`` request declared as FLAC, and the call blocks
    until the operation completes or ``timeout`` seconds pass.

    Args:
        speech_file: Path of the audio file to transcribe.
        language_code: Language tag placed in the recognition config.
            Defaults to ``'ko-KR'``.
        timeout: Seconds to wait for the long-running operation's result.
            Defaults to 90.

    Returns:
        A list holding the top transcript of every result, in response
        order, followed by one trailing empty string.
    """
    # Imported at call time, so loading this module does not require the
    # Google client library.
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    # sample_rate_hertz is deliberately left unset.
    # NOTE(review): presumably the service reads the rate from the FLAC
    # header -- confirm against the API documentation.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language_code)

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=timeout)

    transcripts = []
    # Each result covers a consecutive portion of the audio; its first
    # alternative is the most likely transcript for that portion.
    for result in response.results:
        if not result.alternatives:
            # Nothing recognised for this portion; skip it instead of
            # failing on the [0] lookup.
            continue
        best = result.alternatives[0].transcript
        print(best)
        transcripts.append(best)

    transcripts.append("")
    return transcripts
# [END speech_transcribe_async]

def _run_ffmpeg(*args):
    """Run ``ffmpeg -y`` with ``args`` and raise if it exits with an error.

    The arguments are passed as a list without a shell, so paths containing
    spaces or shell metacharacters need no quoting. ``-y`` lets ffmpeg
    overwrite outputs of an earlier run instead of prompting.

    Raises:
        subprocess.CalledProcessError: if ffmpeg returns a non-zero status.
    """
    import subprocess

    command = ["ffmpeg", "-y"] + list(args)
    print(" ".join(command))
    subprocess.run(command, check=True)


def main(source_path, segment_seconds):
    """Split ``source_path`` into short FLAC segments and transcribe them.

    A ``.mp4`` input first has its audio extracted to ``.mp3``. The audio is
    then cut into segments inside a directory named after the input (minus
    its extension), each segment is converted to mono FLAC, and every FLAC
    file is passed to ``transcribe_file``. The transcripts are written, one
    per line, to ``<input without extension>.txt``.

    Args:
        source_path: Path of the video or audio file to transcribe.
        segment_seconds: Segment length in seconds (string or number).
    """
    import glob

    base_path, extension = os.path.splitext(source_path)

    if extension == ".mp4":
        audio_path = base_path + ".mp3"
        _run_ffmpeg("-i", source_path, audio_path)
        source_path = audio_path

    segment_dir = base_path
    segment_prefix = os.path.join(segment_dir, os.path.basename(base_path))
    # exist_ok keeps a second run from crashing on the leftover directory.
    os.makedirs(segment_dir, exist_ok=True)

    # Drop segments left by an earlier run so they cannot be mixed into
    # this run's transcript.
    escaped_prefix = glob.escape(segment_prefix)
    for suffix in ("_*.mp3", "_*.flac"):
        for stale_path in glob.glob(escaped_prefix + suffix):
            os.remove(stale_path)

    # Split the audio into fixed-length segments without re-encoding.
    _run_ffmpeg(
        "-i", source_path,
        "-f", "segment",
        "-segment_time", str(segment_seconds),
        "-c", "copy",
        segment_prefix + "_%03d.mp3",
    )

    # Convert every segment to single-channel FLAC, in segment order.
    flac_segments = []
    for mp3_path in sorted(glob.glob(escaped_prefix + "_*.mp3")):
        flac_path = os.path.splitext(mp3_path)[0] + ".flac"
        _run_ffmpeg("-i", mp3_path, "-ac", "1", flac_path)
        flac_segments.append(flac_path)

    print(flac_segments)
    print("\n\n")

    # Explicit UTF-8 so the transcript does not depend on the platform's
    # default encoding.
    with open(base_path + ".txt", "w", encoding="utf-8") as text_file:
        total = len(flac_segments)
        for index, flac_path in enumerate(flac_segments, start=1):
            print("--------> " + str(index) + "/" + str(total) + " <--------", flac_path)
            for line in transcribe_file(flac_path):
                text_file.write(line + "\n")


if __name__ == '__main__':
    main(main_audio_file, split_durection)
	#!/usr/bin/env python

	# This script performs simple audio recognition using Google's Cloud Speech-to-Text API.
	# It can recognize long audio or video (over 1 minute; in my case, a 60-minute video).

	# Prerequisite libraries
	# - ffmpeg
	# - google-cloud-speech

	# My test
	# - recognize 60 minute video(.mp4)

	# How to run
	# 1. install anaconda and create a virtual env
	# 2. install prerequisites on the virtual env
	# 3. configure the credentials (key) for google-cloud-speech
	# 4. run this script
	# 5. you can find the result next to the input file (same path, with a .txt extension)

	# Example usage:
	# /Users/doyounggwak/anaconda3/envs/tts-env/bin/python main.py


	# ================================================ #
	# ================================================ #
	# ================================================ #

	# Input video or audio file to transcribe. A ".mp4" input has its audio
	# extracted to ".mp3" before splitting.
	main_audio_file = "audio/21th.mp4" # input video or audio file path
	# Length of each segment, in seconds, handed to ffmpeg's -segment_time option.
	# Kept as a string because it is passed on the ffmpeg command line.
	# NOTE(review): presumably chosen to keep every chunk just under a one-minute
	# request limit -- confirm before changing.
	split_durection = "58" # changing this is not recommended

	# ================================================ #
	# ================================================ #
	# ================================================ #




	# Based on Google's baseline sample code for google-cloud-speech:
	# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/speech/cloud-client/transcribe_async.py




	# Copyright 2017 Google Inc. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import io
	import os

	# [START speech_transcribe_async]
	def transcribe_file(speech_file, language_code='ko-KR', timeout=90):
		"""Transcribe a local audio file with the Cloud Speech-to-Text API.

		The file is read fully into memory, sent inline in a
		``long_running_recognize`` request declared as FLAC, and the call blocks
		until the operation completes or ``timeout`` seconds pass.

		Args:
			speech_file: Path of the audio file to transcribe.
			language_code: Language tag placed in the recognition config.
				Defaults to ``'ko-KR'``.
			timeout: Seconds to wait for the long-running operation's result.
				Defaults to 90.

		Returns:
			A list holding the top transcript of every result, in response
			order, followed by one trailing empty string.
		"""
		# Imported at call time, so loading this module does not require the
		# Google client library.
		from google.cloud import speech
		from google.cloud.speech import enums
		from google.cloud.speech import types

		client = speech.SpeechClient()

		with io.open(speech_file, 'rb') as audio_file:
			content = audio_file.read()

		audio = types.RecognitionAudio(content=content)
		# sample_rate_hertz is deliberately left unset.
		# NOTE(review): presumably the service reads the rate from the FLAC
		# header -- confirm against the API documentation.
		config = types.RecognitionConfig(
			encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
			language_code=language_code)

		operation = client.long_running_recognize(config, audio)
		response = operation.result(timeout=timeout)

		transcripts = []
		# Each result covers a consecutive portion of the audio; its first
		# alternative is the most likely transcript for that portion.
		for result in response.results:
			if not result.alternatives:
				# Nothing recognised for this portion; skip it instead of
				# failing on the [0] lookup.
				continue
			best = result.alternatives[0].transcript
			print(best)
			transcripts.append(best)

		transcripts.append("")
		return transcripts
	# [END speech_transcribe_async]

	def _run_ffmpeg(*args):
		"""Run ``ffmpeg -y`` with ``args`` and raise if it exits with an error.

		The arguments are passed as a list without a shell, so paths containing
		spaces or shell metacharacters need no quoting. ``-y`` lets ffmpeg
		overwrite outputs of an earlier run instead of prompting.

		Raises:
			subprocess.CalledProcessError: if ffmpeg returns a non-zero status.
		"""
		import subprocess

		command = ["ffmpeg", "-y"] + list(args)
		print(" ".join(command))
		subprocess.run(command, check=True)


	def main(source_path, segment_seconds):
		"""Split ``source_path`` into short FLAC segments and transcribe them.

		A ``.mp4`` input first has its audio extracted to ``.mp3``. The audio is
		then cut into segments inside a directory named after the input (minus
		its extension), each segment is converted to mono FLAC, and every FLAC
		file is passed to ``transcribe_file``. The transcripts are written, one
		per line, to ``<input without extension>.txt``.

		Args:
			source_path: Path of the video or audio file to transcribe.
			segment_seconds: Segment length in seconds (string or number).
		"""
		import glob

		base_path, extension = os.path.splitext(source_path)

		if extension == ".mp4":
			audio_path = base_path + ".mp3"
			_run_ffmpeg("-i", source_path, audio_path)
			source_path = audio_path

		segment_dir = base_path
		segment_prefix = os.path.join(segment_dir, os.path.basename(base_path))
		# exist_ok keeps a second run from crashing on the leftover directory.
		os.makedirs(segment_dir, exist_ok=True)

		# Drop segments left by an earlier run so they cannot be mixed into
		# this run's transcript.
		escaped_prefix = glob.escape(segment_prefix)
		for suffix in ("_*.mp3", "_*.flac"):
			for stale_path in glob.glob(escaped_prefix + suffix):
				os.remove(stale_path)

		# Split the audio into fixed-length segments without re-encoding.
		_run_ffmpeg(
			"-i", source_path,
			"-f", "segment",
			"-segment_time", str(segment_seconds),
			"-c", "copy",
			segment_prefix + "_%03d.mp3",
		)

		# Convert every segment to single-channel FLAC, in segment order.
		flac_segments = []
		for mp3_path in sorted(glob.glob(escaped_prefix + "_*.mp3")):
			flac_path = os.path.splitext(mp3_path)[0] + ".flac"
			_run_ffmpeg("-i", mp3_path, "-ac", "1", flac_path)
			flac_segments.append(flac_path)

		print(flac_segments)
		print("\n\n")

		# Explicit UTF-8 so the transcript does not depend on the platform's
		# default encoding.
		with open(base_path + ".txt", "w", encoding="utf-8") as text_file:
			total = len(flac_segments)
			for index, flac_path in enumerate(flac_segments, start=1):
				print("--------> " + str(index) + "/" + str(total) + " <--------", flac_path)
				for line in transcribe_file(flac_path):
					text_file.write(line + "\n")


	if __name__ == '__main__':
		main(main_audio_file, split_durection)