Created
November 12, 2019 01:32
-
-
Save kohlishivam/a376e63c04917f64947adb0b714c86b5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
filepath = "~/audio_wav/"  # Input audio file path
output_filepath = "~/Transcripts/"  # Final transcript path (was "`/..." — backtick was a typo for "~")
bucketname = "upload"  # Name of the GCS bucket created in the step before
# Import libraries | |
from pydub import AudioSegment | |
import io | |
import os | |
from google.cloud import speech | |
from google.cloud.speech import enums | |
from google.cloud.speech import types | |
import wave | |
from google.cloud import storage | |
from google.cloud import speech_v1p1beta1 as speech | |
from google.cloud.speech_v1p1beta1 import enums | |
from google.cloud.speech_v1p1beta1 import types | |
def mp3_to_wav(audio_file_name):
    """Convert an MP3 file to WAV, writing a sibling ``.wav`` file.

    Non-MP3 inputs are left untouched. Uses ``os.path.splitext`` rather
    than ``split('.')`` so paths containing extra dots (or no extension)
    are handled correctly.

    Args:
        audio_file_name: Path to the audio file.

    Returns:
        The path of the WAV file (the converted sibling for MP3 input,
        the unchanged input path otherwise). The original version
        returned None, so returning a value is backward-compatible.
    """
    base, ext = os.path.splitext(audio_file_name)
    if ext.lower() == '.mp3':
        wav_name = base + '.wav'
        sound = AudioSegment.from_mp3(audio_file_name)
        sound.export(wav_name, format="wav")
        return wav_name
    return audio_file_name
def stereo_to_mono(audio_file_name):
    """Downmix a WAV file to a single channel, overwriting it in place."""
    audio = AudioSegment.from_wav(audio_file_name)
    mono_audio = audio.set_channels(1)
    mono_audio.export(audio_file_name, format="wav")
def frame_rate_channel(audio_file_name):
    """Return ``(frame_rate, channel_count)`` read from a WAV file header."""
    with wave.open(audio_file_name, "rb") as wav:
        return wav.getframerate(), wav.getnchannels()
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to the named GCS bucket as *destination_blob_name*."""
    client = storage.Client()
    target_blob = client.get_bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
def delete_blob(bucket_name, blob_name):
    """Remove the named blob from the GCS bucket."""
    client = storage.Client()
    client.get_bucket(bucket_name).blob(blob_name).delete()
def google_transcribe(audio_file_name):
    """Transcribe one audio file with Google Cloud Speech-to-Text.

    Converts MP3 input to WAV, downmixes stereo to mono, uploads the WAV
    to the configured GCS bucket, runs a long-running recognize job, and
    deletes the uploaded blob when done.

    Args:
        audio_file_name: File name (relative to the module-level `filepath`).

    Returns:
        The concatenated transcript string.
    """
    file_name = filepath + audio_file_name
    base, ext = os.path.splitext(file_name)
    if ext.lower() == '.mp3':
        # mp3_to_wav writes a sibling .wav file. Switch to that path so
        # wave.open reads a real WAV header and the upload matches the
        # LINEAR16 encoding declared in the config (the original kept
        # using the .mp3 path here, which crashed in frame_rate_channel).
        mp3_to_wav(file_name)
        file_name = base + '.wav'
        audio_file_name = os.path.basename(file_name)

    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, file_name, destination_blob_name)
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name

    transcript = ''
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US')
    try:
        # Detects speech in the audio file (long-running operation).
        operation = client.long_running_recognize(config, audio)
        response = operation.result(timeout=10000)
        for result in response.results:
            transcript += result.alternatives[0].transcript
    finally:
        # Always clean up the uploaded blob, even if recognition fails.
        delete_blob(bucket_name, destination_blob_name)
    return transcript
def write_transcripts(transcript_filename, transcript):
    """Write *transcript* to *transcript_filename* under `output_filepath`.

    Uses a ``with`` block so the file handle is closed even if the write
    raises, and plain ``"w"`` mode — the original ``"w+"`` read access was
    never used.
    """
    with open(output_filepath + transcript_filename, "w") as out:
        out.write(transcript)
def google_transcribe_2(audio_file_name):
    """Transcribe one audio file with speaker diarization (2 speakers).

    Same pipeline as ``google_transcribe`` (MP3→WAV conversion, mono
    downmix, GCS upload, long-running recognize, blob cleanup), but the
    transcript is grouped into ``speaker N: ...`` lines using the
    per-word speaker tags returned by the diarization config.

    Args:
        audio_file_name: File name (relative to the module-level `filepath`).

    Returns:
        Transcript string with one ``speaker N: ...`` segment per speaker
        turn; empty string if the API returned no results.
    """
    file_name = filepath + audio_file_name
    base, ext = os.path.splitext(file_name)
    if ext.lower() == '.mp3':
        # Work with the converted .wav from here on so wave.open and the
        # LINEAR16 upload see actual WAV data (the original kept the .mp3
        # path, which crashed in frame_rate_channel).
        mp3_to_wav(file_name)
        file_name = base + '.wav'
        audio_file_name = os.path.basename(file_name)

    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, file_name, destination_blob_name)
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name

    transcript = ''
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)
    try:
        # Detects speech in the audio file (long-running operation).
        operation = client.long_running_recognize(config, audio)
        response = operation.result(timeout=10000)
        if response.results:
            # With diarization enabled, the last result carries the full
            # word list with speaker tags attached.
            words_info = response.results[-1].alternatives[0].words
            tag = 1
            speaker = ""
            for word_info in words_info:
                if word_info.speaker_tag == tag:
                    speaker = speaker + " " + word_info.word
                else:
                    # Speaker changed: flush the previous turn, start a new one.
                    transcript += "speaker {}: {}".format(tag, speaker) + '\n'
                    tag = word_info.speaker_tag
                    speaker = "" + word_info.word
            transcript += "speaker {}: {}".format(tag, speaker)
    finally:
        # Always clean up the uploaded blob, even if recognition fails.
        delete_blob(bucket_name, destination_blob_name)
    return transcript
if __name__ == "__main__":
    # Transcribe a single file end-to-end; swap in the commented loop
    # below to batch over every file in `filepath`.
    # for source_name in os.listdir(filepath):
    source_name = "testfile_mp3.wav"
    result_text = google_transcribe_2(source_name)
    write_transcripts(source_name.split('.')[0] + '.txt', result_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment