# zorbaproject/transcribe.py

## transcribe.py
# Original source: https://towardsdatascience.com/how-to-use-google-speech-to-text-api-to-transcribe-long-audio-files-1c886f4eb3e9

#Requirements
#sudo pip3 install google-cloud-storage
#sudo pip3 install google-cloud-speech
#sudo pip3 install pydub
#sudo apt install ffmpeg

#Create a project on https://console.cloud.google.com/
#Enable Storage Transfer API
#https://console.cloud.google.com/storage/
#Enable Speech to Text API
#https://console.cloud.google.com/apis/library/speech.googleapis.com
#Create a key for a service account and store it in the same folder as this script, with name 'service_account.json'
#https://console.cloud.google.com/iam-admin/serviceaccounts

from pydub import AudioSegment
import io
import os
import os.path
from google.cloud import speech
#from google.cloud.speech import enums
#from google.cloud.speech import types
#https://github.com/googleapis/python-speech/blob/master/UPGRADING.md#enums-and-types
import wave
from google.cloud import storage
import sys
import json

# Command-line handling: the first argument (an audio file or a folder of
# audio files) is mandatory.
if len(sys.argv) > 1:
    filepath = sys.argv[1]
else:
    print("Usage: ./transcribe.py audiofile.mp3 language")
    print("E.g.: ./transcribe.py audio/ it-IT")
    print("Input con be a single mp3 file, a single wav file, or a folder of files.")
    # A missing argument is a usage error, so report failure to the caller
    # instead of exiting with status 0.
    sys.exit(1)

output_filepath = filepath #"./Transcripts/"

# Optional second argument: the language code handed to the recognizer.
if len(sys.argv) > 2:
    lang = sys.argv[2]
else:
    lang = 'it-IT' #'en-US'


# Service-account key file, resolved relative to the current directory.
accountfile = "service_account.json"

# ``with`` guarantees the key file is closed even if reading it fails.
with open(accountfile, "r", encoding='utf-8') as text_file:
    lines = text_file.read()
# Raw line breaks are stripped before parsing, as the script always did.
accountdata = json.loads(lines.replace("\n", "").replace("\r", ""))
# The bucket name is derived from the project id stored in the key file.
bucket_name = accountdata["project_id"] + "-audiofiles"


def mp3_to_wav(audio_file_name):
    """Convert an MP3 file to a WAV file stored next to it.

    The WAV path is the MP3 path with its extension replaced by ``.wav``.
    Nothing is done when ``audio_file_name`` does not end in ``.mp3`` or when
    the WAV file already exists.

    Args:
        audio_file_name: Path of the MP3 file to convert.
    """
    # splitext removes only the final extension, so dots elsewhere in the path
    # ("./audio/talk.mp3", "my.talk.mp3") no longer corrupt the output name.
    base_name, extension = os.path.splitext(audio_file_name)
    if extension != '.mp3':
        return
    waudio_file_name = base_name + '.wav'
    if not os.path.isfile(waudio_file_name):
        sound = AudioSegment.from_mp3(audio_file_name)
        print("MP3 to WAV: "+audio_file_name)
        sound.export(waudio_file_name, format="wav")

def frame_rate_channel(audio_file_name):
    """Read the sample rate and the channel count of a WAV file.

    Args:
        audio_file_name: Path of the WAV file to inspect.

    Returns:
        A ``(frame_rate, channels)`` tuple.
    """
    with wave.open(audio_file_name, "rb") as wav_reader:
        header = wav_reader.getparams()
    return header.framerate, header.nchannels

def stereo_to_mono(audio_file_name):
    """Rewrite a WAV file in place with a single audio channel.

    Args:
        audio_file_name: Path of the WAV file; it is read and then overwritten.
    """
    mono_audio = AudioSegment.from_wav(audio_file_name).set_channels(1)
    mono_audio.export(audio_file_name, format="wav")

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a bucket, creating the bucket if it is missing.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Name the object gets inside the bucket.
    """
    storage_client = storage.Client.from_service_account_json(accountfile) #storage.Client()
    # lookup_bucket returns None when the bucket does not exist. Any other
    # failure (permissions, network) now propagates instead of being mistaken
    # for a missing bucket, which the former bare ``except:`` did.
    bucket = storage_client.lookup_bucket(bucket_name)
    if bucket is None:
        bucket = storage_client.create_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    print("Upload to bucket "+bucket_name)
    blob.upload_from_filename(source_file_name)

def delete_blob(bucket_name, blob_name):
    """Remove one object from a bucket.

    Args:
        bucket_name: Name of the bucket holding the object.
        blob_name: Name of the object to delete.
    """
    client = storage.Client.from_service_account_json(accountfile) #storage.Client()
    target = client.get_bucket(bucket_name).blob(blob_name)
    print("Delete from bucket")
    target.delete()
def google_transcribe(file_name):
    """Transcribe one audio file with Google Speech-to-Text.

    MP3 input is converted to WAV first, and a WAV with more than one channel
    is rewritten as mono. The WAV file is uploaded to the bucket, recognized
    with the long-running recognition call, and removed from the bucket
    afterwards.

    Args:
        file_name: Path of the ``.mp3`` or ``.wav`` file to transcribe.

    Returns:
        The first alternative of every recognition result, concatenated.
    """
    # splitext keeps the directory part and any other dots intact, unlike
    # split('.'), which broke paths such as "./audio/talk.mp3".
    base_name, extension = os.path.splitext(file_name)
    if extension == '.mp3':
        mp3_to_wav(file_name)
        file_name = base_name + '.wav'

    print("Working on "+file_name)

    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    # The object in the bucket is named after the file, without its folder.
    destination_blob_name = os.path.basename(file_name)

    upload_blob(bucket_name, file_name, destination_blob_name)

    try:
        gcs_uri = 'gs://'+bucket_name+'/' + destination_blob_name

        client = speech.SpeechClient.from_service_account_json(accountfile) #speech.SpeechClient()
        audio = speech.RecognitionAudio(uri=gcs_uri)

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            enable_automatic_punctuation=True,
            language_code=lang)

        # Detects speech in the audio file
        print("Transcribing the audio file, this is going to take some time...")
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=10000)

        transcript = ''.join(
            result.alternatives[0].transcript for result in response.results)
    finally:
        # Remove the uploaded copy even when recognition fails, so aborted
        # runs do not leave audio behind in the bucket.
        delete_blob(bucket_name, destination_blob_name)
    return transcript

def write_transcripts(transcript_filename,transcript):
    """Write a transcript to a text file, replacing any existing content.

    Args:
        transcript_filename: Path of the file to create or overwrite.
        transcript: Text to store.
    """
    # UTF-8 is set explicitly so accented characters are written the same way
    # whatever the platform default encoding is; ``with`` closes the file even
    # if the write fails.
    with open(transcript_filename, "w", encoding='utf-8') as f:
        f.write(transcript)
    print("Wrote text on "+transcript_filename)

def transcript_filename_for(audio_file_name):
    """Return the transcript name for an audio file: same path, ``.txt`` extension."""
    return os.path.splitext(audio_file_name)[0] + '.txt'


# Entry point: transcribe every .wav/.mp3 file of a folder, or a single file.
if __name__ == "__main__":
    if os.path.isdir(filepath):
        for audio_file_name in os.listdir(filepath):
            # Test the real extension so names that merely contain ".wav" or
            # ".mp3" (e.g. "talk.mp3.bak") are skipped.
            if audio_file_name.endswith(('.wav', '.mp3')):
                file_name = os.path.join(filepath, audio_file_name)
                transcript = google_transcribe(file_name)
                # Previously the second .replace() discarded the first, so a
                # .wav input kept its own name and the transcript was written
                # over an audio file of that name.
                transcript_filename = transcript_filename_for(audio_file_name)
                write_transcripts(transcript_filename,transcript)
    else:
        audio_file_name = filepath
        transcript = google_transcribe(audio_file_name)
        transcript_filename = transcript_filename_for(audio_file_name)
        try:
            write_transcripts(transcript_filename,transcript)
        except OSError:
            # The location next to the audio file cannot be written: keep the
            # transcript under a fixed name in the current directory instead.
            write_transcripts("output-transcribe.txt",transcript)

#Source file: Public conference with ambient noise
#Source duration: 1 hour 50 minutes
#Source format: mp3
#Total time: 35 minutes
#Language: it-IT

#Source file: Public conference with ambient noise
#Source duration: 1 hour 50 minutes
#Source format: wav
#Total time: 20 minutes
#Language: it-IT

#Source file: Italian Premier official speech on March 11th 2020
#Source duration: 9 minutes
#Source format: mp3
#Total time: 3 minutes
#Language: it-IT

#Estimated costs: 5,36 euros for 325,75 minutes of audio
	# Original source: https://towardsdatascience.com/how-to-use-google-speech-to-text-api-to-transcribe-long-audio-files-1c886f4eb3e9

	#Requirements
	#sudo pip3 install google-cloud-storage
	#sudo pip3 install google-cloud-speech
	#sudo pip3 install pydub
	#sudo apt install ffmpeg

	#Create a project on https://console.cloud.google.com/
	#Enable Storage Transfer API
	#https://console.cloud.google.com/storage/
	#Enable Speech to Text API
	#https://console.cloud.google.com/apis/library/speech.googleapis.com
	#Create a key for a service account and store it in the same folder as this script, with name 'service_account.json'
	#https://console.cloud.google.com/iam-admin/serviceaccounts

	from pydub import AudioSegment
	import io
	import os
	import os.path
	from google.cloud import speech
	#from google.cloud.speech import enums
	#from google.cloud.speech import types
	#https://github.com/googleapis/python-speech/blob/master/UPGRADING.md#enums-and-types
	import wave
	from google.cloud import storage
	import sys
	import json

	# Command-line handling: the first argument (an audio file or a folder of
	# audio files) is mandatory.
	if len(sys.argv) > 1:
	    filepath = sys.argv[1]
	else:
	    print("Usage: ./transcribe.py audiofile.mp3 language")
	    print("E.g.: ./transcribe.py audio/ it-IT")
	    print("Input con be a single mp3 file, a single wav file, or a folder of files.")
	    # A missing argument is a usage error, so report failure to the caller
	    # instead of exiting with status 0.
	    sys.exit(1)

	output_filepath = filepath #"./Transcripts/"

	# Optional second argument: the language code handed to the recognizer.
	if len(sys.argv) > 2:
	    lang = sys.argv[2]
	else:
	    lang = 'it-IT' #'en-US'


	# Service-account key file, resolved relative to the current directory.
	accountfile = "service_account.json"

	# ``with`` guarantees the key file is closed even if reading it fails.
	with open(accountfile, "r", encoding='utf-8') as text_file:
	    lines = text_file.read()
	# Raw line breaks are stripped before parsing, as the script always did.
	accountdata = json.loads(lines.replace("\n", "").replace("\r", ""))
	# The bucket name is derived from the project id stored in the key file.
	bucket_name = accountdata["project_id"] + "-audiofiles"


	def mp3_to_wav(audio_file_name):
	    """Convert an MP3 file to a WAV file stored next to it.

	    The WAV path is the MP3 path with its extension replaced by ``.wav``.
	    Nothing is done when ``audio_file_name`` does not end in ``.mp3`` or
	    when the WAV file already exists.

	    Args:
	        audio_file_name: Path of the MP3 file to convert.
	    """
	    # splitext removes only the final extension, so dots elsewhere in the
	    # path ("./audio/talk.mp3", "my.talk.mp3") do not corrupt the name.
	    base_name, extension = os.path.splitext(audio_file_name)
	    if extension != '.mp3':
	        return
	    waudio_file_name = base_name + '.wav'
	    if not os.path.isfile(waudio_file_name):
	        sound = AudioSegment.from_mp3(audio_file_name)
	        print("MP3 to WAV: "+audio_file_name)
	        sound.export(waudio_file_name, format="wav")

	def frame_rate_channel(audio_file_name):
	    """Read the sample rate and the channel count of a WAV file.

	    Args:
	        audio_file_name: Path of the WAV file to inspect.

	    Returns:
	        A ``(frame_rate, channels)`` tuple.
	    """
	    with wave.open(audio_file_name, "rb") as wave_file:
	        frame_rate = wave_file.getframerate()
	        channels = wave_file.getnchannels()
	        return frame_rate,channels

	def stereo_to_mono(audio_file_name):
	    """Rewrite a WAV file in place with a single audio channel.

	    Args:
	        audio_file_name: Path of the WAV file; it is read and then
	            overwritten.
	    """
	    sound = AudioSegment.from_wav(audio_file_name)
	    sound = sound.set_channels(1)
	    sound.export(audio_file_name, format="wav")

	def upload_blob(bucket_name, source_file_name, destination_blob_name):
	    """Upload a local file to a bucket, creating the bucket if it is missing.

	    Args:
	        bucket_name: Name of the destination bucket.
	        source_file_name: Path of the local file to upload.
	        destination_blob_name: Name the object gets inside the bucket.
	    """
	    storage_client = storage.Client.from_service_account_json(accountfile) #storage.Client()
	    # lookup_bucket returns None when the bucket does not exist. Any other
	    # failure (permissions, network) propagates instead of being mistaken
	    # for a missing bucket, as a bare ``except:`` would do.
	    bucket = storage_client.lookup_bucket(bucket_name)
	    if bucket is None:
	        bucket = storage_client.create_bucket(bucket_name)
	    blob = bucket.blob(destination_blob_name)
	    print("Upload to bucket "+bucket_name)
	    blob.upload_from_filename(source_file_name)

	def delete_blob(bucket_name, blob_name):
	    """Remove one object from a bucket.

	    Args:
	        bucket_name: Name of the bucket holding the object.
	        blob_name: Name of the object to delete.
	    """
	    storage_client = storage.Client.from_service_account_json(accountfile) #storage.Client()
	    bucket = storage_client.get_bucket(bucket_name)
	    blob = bucket.blob(blob_name)
	    print("Delete from bucket")
	    blob.delete()

	def google_transcribe(file_name):
	    """Transcribe one audio file with Google Speech-to-Text.

	    MP3 input is converted to WAV first, and a WAV with more than one
	    channel is rewritten as mono. The WAV file is uploaded to the bucket,
	    recognized with the long-running recognition call, and removed from
	    the bucket afterwards.

	    Args:
	        file_name: Path of the ``.mp3`` or ``.wav`` file to transcribe.

	    Returns:
	        The first alternative of every recognition result, concatenated.
	    """
	    # splitext keeps the directory part and any other dots intact, unlike
	    # split('.'), which breaks paths such as "./audio/talk.mp3".
	    base_name, extension = os.path.splitext(file_name)
	    if extension == '.mp3':
	        mp3_to_wav(file_name)
	        file_name = base_name + '.wav'

	    print("Working on "+file_name)

	    frame_rate, channels = frame_rate_channel(file_name)

	    if channels > 1:
	        stereo_to_mono(file_name)

	    # The object in the bucket is named after the file, without its folder.
	    destination_blob_name = os.path.basename(file_name)

	    upload_blob(bucket_name, file_name, destination_blob_name)

	    try:
	        gcs_uri = 'gs://'+bucket_name+'/' + destination_blob_name

	        client = speech.SpeechClient.from_service_account_json(accountfile) #speech.SpeechClient()
	        audio = speech.RecognitionAudio(uri=gcs_uri)

	        config = speech.RecognitionConfig(
	            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
	            sample_rate_hertz=frame_rate,
	            enable_automatic_punctuation=True,
	            language_code=lang)

	        # Detects speech in the audio file
	        print("Transcribing the audio file, this is going to take some time...")
	        operation = client.long_running_recognize(config=config, audio=audio)
	        response = operation.result(timeout=10000)

	        transcript = ''.join(
	            result.alternatives[0].transcript for result in response.results)
	    finally:
	        # Remove the uploaded copy even when recognition fails, so aborted
	        # runs do not leave audio behind in the bucket.
	        delete_blob(bucket_name, destination_blob_name)
	    return transcript

	def write_transcripts(transcript_filename,transcript):
	    """Write a transcript to a text file, replacing any existing content.

	    Args:
	        transcript_filename: Path of the file to create or overwrite.
	        transcript: Text to store.
	    """
	    # UTF-8 is set explicitly so accented characters are written the same
	    # way whatever the platform default encoding is; ``with`` closes the
	    # file even if the write fails.
	    with open(transcript_filename, "w", encoding='utf-8') as f:
	        f.write(transcript)
	    print("Wrote text on "+transcript_filename)

	def transcript_filename_for(audio_file_name):
	    """Return the transcript name for an audio file: same path, ``.txt`` extension."""
	    return os.path.splitext(audio_file_name)[0] + '.txt'


	# Entry point: transcribe every .wav/.mp3 file of a folder, or a single file.
	if __name__ == "__main__":
	    if os.path.isdir(filepath):
	        for audio_file_name in os.listdir(filepath):
	            # Test the real extension so names that merely contain ".wav"
	            # or ".mp3" (e.g. "talk.mp3.bak") are skipped.
	            if audio_file_name.endswith(('.wav', '.mp3')):
	                file_name = os.path.join(filepath, audio_file_name)
	                transcript = google_transcribe(file_name)
	                # Two chained .replace() calls on the original name let
	                # the second discard the first, so a .wav input kept its
	                # own name; derive the name from the extension instead.
	                transcript_filename = transcript_filename_for(audio_file_name)
	                write_transcripts(transcript_filename,transcript)
	    else:
	        audio_file_name = filepath
	        transcript = google_transcribe(audio_file_name)
	        transcript_filename = transcript_filename_for(audio_file_name)
	        try:
	            write_transcripts(transcript_filename,transcript)
	        except OSError:
	            # The location next to the audio file cannot be written: keep
	            # the transcript under a fixed name in the current directory.
	            write_transcripts("output-transcribe.txt",transcript)

	#Source file: Public conference with ambient noise
	#Source duration: 1 hour 50 minutes
	#Source format: mp3
	#Total time: 35 minutes
	#Language: it-IT

	#Source file: Public conference with ambient noise
	#Source duration: 1 hour 50 minutes
	#Source format: wav
	#Total time: 20 minutes
	#Language: it-IT

	#Source file: Italian Premier official speech on March 11th 2020
	#Source duration: 9 minutes
	#Source format: mp3
	#Total time: 3 minutes
	#Language: it-IT

	#Estimated costs: 5,36 euros for 325,75 minutes of audio