# SamuelAlgheriniAI/Speech_to_Text.py Secret

## Speech_to_Text.py
#Transcription function
def asr_transcript(processor, model, resampled_path, length, block_length):
    """Transcribe an audio file block by block and return the joined text.

    The file is read through ``librosa.stream`` in blocks instead of being
    loaded in one go. Every frame is 16000 samples with no overlap
    (``frame_length == hop_length``), so a block of ``block_length`` frames
    lasts ``block_length`` seconds only if the file is sampled at 16 kHz
    (assumed from the hard-coded frame size -- confirm against the
    resampling step).

    Args:
        processor: Passed through unchanged to ``generate_transcription``.
        model: Passed through unchanged to ``generate_transcription``.
        resampled_path: Path of the audio file handed to ``librosa.stream``.
        length: Total audio length in the same unit as ``block_length``;
            only used to report the expected number of chunks.
        block_length: Number of frames per streamed block.

    Returns:
        The transcriptions of all blocks concatenated. Odd-numbered blocks
        (1st, 3rd, ...) are followed by a newline, even-numbered blocks by
        a space. An empty string is returned when the stream yields nothing.
    """
    # Ceiling division: a trailing partial block still counts as a chunk.
    chunks = length // block_length
    if length % block_length != 0:
        chunks += 1

    stream = librosa.stream(
        resampled_path,
        block_length=block_length,
        frame_length=16000,
        hop_length=16000,
    )

    print('Every chunk is ', block_length, 'sec. long')
    print("Number of chunks", int(chunks))

    pieces = []
    # Pre-initialised so the summary below also works for an empty stream
    # (the loop variable would otherwise never be bound).
    processed = 0
    for processed, speech in enumerate(stream, start=1):
        print("Transcribing the chunk number " + str(processed))
        # Newline after the 1st, 3rd, 5th... block, a space after the others.
        separator = '\n' if processed % 2 == 1 else ' '
        pieces.append(generate_transcription(speech, processor, model) + separator)

    print("Encoding complete. Total number of chunks: " + str(processed) + "\n")
    return "".join(pieces)

#Speech to text function
def generate_transcription(speech, processor, model):
    """Run one block of audio through the model and return lowercase text.

    Args:
        speech: Audio samples as an array exposing ``shape``. A 1-D array is
            used as is; for anything with more dimensions the first two
            columns are summed into a single signal.
        processor: Callable that converts the samples into model inputs
            (its result must expose ``input_values``) and provides
            ``decode`` for the predicted ids. Presumably a Hugging Face
            CTC processor -- confirm with the caller.
        model: Callable whose result exposes ``logits``.

    Returns:
        The decoded transcription of the first batch item, lowercased.

    Note:
        The sampling rate ``sr`` is read from module scope; it is not
        defined inside this function.
    """
    if len(speech.shape) > 1:
        # Sums columns 0 and 1 (no averaging). Assumes a
        # (samples, channels) layout -- TODO confirm against the loader.
        speech = speech[:, 0] + speech[:, 1]
    input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values
    # Inference only: skip autograd bookkeeping so no graph is kept alive
    # for a result that is immediately reduced to token ids.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()
	#Transcription function
	def asr_transcript(processor, model, resampled_path, length, block_length):
		"""Transcribe an audio file block by block and return the joined text.

		The file is read through ``librosa.stream`` in blocks instead of being
		loaded in one go. Every frame is 16000 samples with no overlap
		(``frame_length == hop_length``), so a block of ``block_length`` frames
		lasts ``block_length`` seconds only if the file is sampled at 16 kHz
		(assumed from the hard-coded frame size -- confirm against the
		resampling step).

		Args:
			processor: Passed through unchanged to ``generate_transcription``.
			model: Passed through unchanged to ``generate_transcription``.
			resampled_path: Path of the audio file handed to ``librosa.stream``.
			length: Total audio length in the same unit as ``block_length``;
				only used to report the expected number of chunks.
			block_length: Number of frames per streamed block.

		Returns:
			The transcriptions of all blocks concatenated. Odd-numbered blocks
			(1st, 3rd, ...) are followed by a newline, even-numbered blocks by
			a space. An empty string is returned when the stream yields nothing.
		"""
		# Ceiling division: a trailing partial block still counts as a chunk.
		chunks = length // block_length
		if length % block_length != 0:
			chunks += 1

		stream = librosa.stream(
			resampled_path,
			block_length=block_length,
			frame_length=16000,
			hop_length=16000,
		)

		print('Every chunk is ', block_length, 'sec. long')
		print("Number of chunks", int(chunks))

		pieces = []
		# Pre-initialised so the summary below also works for an empty stream
		# (the loop variable would otherwise never be bound).
		processed = 0
		for processed, speech in enumerate(stream, start=1):
			print("Transcribing the chunk number " + str(processed))
			# Newline after the 1st, 3rd, 5th... block, a space after the others.
			separator = '\n' if processed % 2 == 1 else ' '
			pieces.append(generate_transcription(speech, processor, model) + separator)

		print("Encoding complete. Total number of chunks: " + str(processed) + "\n")
		return "".join(pieces)

	#Speech to text function
	def generate_transcription(speech, processor, model):
		"""Run one block of audio through the model and return lowercase text.

		Args:
			speech: Audio samples as an array exposing ``shape``. A 1-D array is
				used as is; for anything with more dimensions the first two
				columns are summed into a single signal.
			processor: Callable that converts the samples into model inputs
				(its result must expose ``input_values``) and provides
				``decode`` for the predicted ids. Presumably a Hugging Face
				CTC processor -- confirm with the caller.
			model: Callable whose result exposes ``logits``.

		Returns:
			The decoded transcription of the first batch item, lowercased.

		Note:
			The sampling rate ``sr`` is read from module scope; it is not
			defined inside this function.
		"""
		if len(speech.shape) > 1:
			# Sums columns 0 and 1 (no averaging). Assumes a
			# (samples, channels) layout -- TODO confirm against the loader.
			speech = speech[:, 0] + speech[:, 1]
		input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values
		# Inference only: skip autograd bookkeeping so no graph is kept alive
		# for a result that is immediately reduced to token ids.
		with torch.no_grad():
			logits = model(input_values).logits
		predicted_ids = torch.argmax(logits, dim=-1)
		transcription = processor.decode(predicted_ids[0])
		return transcription.lower()