# jeakwon/sr_with_timestamp_from_wav_mono.py

## sr_with_timestamp_from_wav_mono.py
# Cutting .wav file from start to end
# ffmpeg -i <in_file> -ss <start_sec> -to <end_sec> -c copy <out_file>
# ffmpeg -i 1.mp3 -ss 0 -to 60 -c copy 2.mp3 (example of use)

# Generating mono .wav file
# ffmpeg -i <in_file> -acodec pcm_s16le -ac 1 -ar 16000 <out_file>
# ffmpeg -i 2.mp3 -acodec pcm_s16le -ac 1 -ar 16000 2.wav

# Combine both (the audio is re-encoded, so "-c copy" cannot be used here)
# ffmpeg -i 1.mp3 -ss 0 -to 60 -acodec pcm_s16le -ac 1 -ar 16000 out.wav


import io
import os

# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types


# Transcribe '2.wav' (located next to this script) with Google Cloud
# Speech-to-Text and print, for every result, the top transcript, its
# confidence, and the start/end time of each recognized word.

# Tell the Google client library where the service-account key lives.
# NOTE: "<credential>.json" is a placeholder and must be replaced with the
# path to a real credentials file before running.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "<credential>.json"

# Instantiates a client
client = speech.SpeechClient()

# The name of the audio file to transcribe
file_name = os.path.join(os.path.dirname(__file__), '2.wav')

# Loads the audio into memory; the file handle is only needed for the read.
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
audio = types.RecognitionAudio(content=content)

# LINEAR16 at 16000 Hz corresponds to the "pcm_s16le ... -ar 16000" ffmpeg
# conversion shown in the header comments. Word time offsets are requested
# because the loop below prints per-word start/end times.
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='ko-KR',
    enable_word_time_offsets=True)

# Detects speech in the audio file
response = client.recognize(config, audio)

for result in response.results:
    # Guard against a result without any alternative rather than letting
    # the [0] lookup raise IndexError.
    if not result.alternatives:
        continue

    # Only the first alternative of each result is reported.
    alternative = result.alternatives[0]
    print(u'Transcript: {}'.format(alternative.transcript))
    print('Confidence: {}'.format(alternative.confidence))

    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        # Offsets expose separate `seconds` and `nanos` fields; fold them
        # into a single float number of seconds for display.
        print('Word: {}, start_time: {}, end_time: {}'.format(
            word,
            start_time.seconds + start_time.nanos * 1e-9,
            end_time.seconds + end_time.nanos * 1e-9))