Created
June 24, 2019 02:15
-
-
Save jeakwon/1bee9ba6bbdb4d20d4384dbfd0931084 to your computer and use it in GitHub Desktop.
speech recognition of Korean wav (mono)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cutting an audio file from start to end (times in seconds)
# ffmpeg -i <in_file> -ss <start_sec> -to <end_sec> -c copy <out_file> | |
# ffmpeg -i 1.mp3 -ss 0 -to 60 -c copy 2.mp3 (example of use) | |
# Generating mono .wav file | |
# ffmpeg -i <in_file> -acodec pcm_s16le -ac 1 -ar 16000 <out_file> | |
# ffmpeg -i 2.mp3 -acodec pcm_s16le -ac 1 -ar 16000 2.wav | |
# Combine both (cut and convert in one step; "-c copy" is dropped because the audio is re-encoded to PCM)
# ffmpeg -i 1.mp3 -ss 0 -to 60 -acodec pcm_s16le -ac 1 -ar 16000 out.wav
import io | |
import os | |
# Imports the Google Cloud client library | |
from google.cloud import speech | |
from google.cloud.speech import enums | |
from google.cloud.speech import types | |
def _as_seconds(offset):
    """Return *offset* as float seconds.

    *offset* is any object exposing numeric ``seconds`` and ``nanos``
    attributes, such as the word start/end offsets read below.
    """
    return offset.seconds + offset.nanos * 1e-9


# Select the credentials file; this is set before the client is created below.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "<credential>.json"

# Create the Speech API client.
client = speech.SpeechClient()

# Audio to transcribe: '2.wav' in the same directory as this script.
script_dir = os.path.dirname(__file__)
file_name = os.path.join(script_dir, '2.wav')

# Read the whole audio file into memory as raw bytes.
with io.open(file_name, 'rb') as wav_file:
    content = wav_file.read()

audio = types.RecognitionAudio(content=content)

# Recognition settings: LINEAR16 at 16000 Hz, Korean ('ko-KR'), with
# per-word time offsets enabled so each word's start/end can be printed.
linear16 = enums.RecognitionConfig.AudioEncoding.LINEAR16
config = types.RecognitionConfig(
    encoding=linear16,
    sample_rate_hertz=16000,
    language_code='ko-KR',
    enable_word_time_offsets=True,
)

# Run recognition on the in-memory audio.
response = client.recognize(config, audio)

# For each result, report the first alternative and then its word timings.
word_line = 'Word: {}, start_time: {}, end_time: {}'
for result in response.results:
    best = result.alternatives[0]
    print(u'Transcript: {}'.format(best.transcript))
    print('Confidence: {}'.format(best.confidence))

    for word_info in best.words:
        started = _as_seconds(word_info.start_time)
        ended = _as_seconds(word_info.end_time)
        print(word_line.format(word_info.word, started, ended))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment