# snippet of instantiating a Google Cloud Speech-to-Text client
from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=22050,
    audio_channel_count=1,
    model='phone_call',  # handles low-quality audio better than the default model
    use_enhanced=True,  # use the enhanced model if one is available
    language_code="en-US",
)
# WER (word error rate) of the Google transcript against a manually checked reference
import jiwer

with open("E:/AlanWatts/dataset/ground_truth.txt") as f:
    ground_truth = f.readlines()

with open("E:/AlanWatts/dataset/transcripts2/6_5_Zen_Mind_for_Beginners_I.mp3_2022-11-19_19_52_45.txt") as f:
    hypothesis = f.readlines()

# normalize both texts the same way before scoring
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemoveEmptyStrings(),
    jiwer.ReduceToSingleSentence(),
    jiwer.ReduceToListOfListOfWords(word_delimiter=" "),
])

# jiwer 2.x API; newer releases rename truth_transform to reference_transform
wer = jiwer.wer(ground_truth,
                hypothesis,
                truth_transform=transformation,
                hypothesis_transform=transformation)
print(wer)
# Measured roughly 0-20% word error rate on just a couple of clips;
# for comparison, Autosub often returned completely incorrect or blank transcriptions.
# Via https://github.com/tikhonova/what_would_alan_watts_say/blob/master/speech_synthesis/4_audio_Google_transcribe_with_multiprocessing.py
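# A rough sketch (not from the original gist) of scoring every transcript in a folder
# against matching reference files, so the WER estimate rests on more than a couple of
# clips; the directory layout and file naming used here are hypothetical.
import glob
import os

wers = []
for hyp_path in glob.glob("E:/AlanWatts/dataset/transcripts2/*.txt"):
    ref_path = os.path.join("E:/AlanWatts/dataset/ground_truth", os.path.basename(hyp_path))
    if not os.path.exists(ref_path):
        continue  # skip clips without a manually checked reference
    with open(ref_path) as f:
        ref = f.readlines()
    with open(hyp_path) as f:
        hyp = f.readlines()
    wers.append(jiwer.wer(ref, hyp,
                          truth_transform=transformation,
                          hypothesis_transform=transformation))

if wers:
    print(f"mean WER over {len(wers)} clips: {sum(wers) / len(wers):.2%}")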