-
-
Save SamuelAlgheriniAI/316e835a747b1930c11ee08bf78e71ab to your computer and use it in GitHub Desktop.
Speech to text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Transcription function
def asr_transcript(processor, model, resampled_path, length, block_length):
    """Transcribe an audio file block by block and return the combined text.

    The file is read through ``librosa.stream`` so that only one block is
    held in memory at a time; each block is handed to
    ``generate_transcription`` and the results are concatenated.

    Args:
        processor: Passed through unchanged to ``generate_transcription``.
        model: Passed through unchanged to ``generate_transcription``.
        resampled_path: Path of the audio file to stream.
        length: Total length of the audio, in the same unit as
            ``block_length``; only used for the progress message.
        block_length: Number of frames per streamed block. Each frame is
            16000 samples, so the messages report this value as seconds
            (NOTE(review): true only for 16 kHz audio -- confirm upstream).

    Returns:
        The transcriptions of all blocks. Each block is followed by a
        newline when its zero-based index is even and by a space otherwise,
        so the result ends with a separator. Empty string if the stream
        yields no blocks.
    """
    # Ceiling division: a trailing partial block still counts as a chunk.
    # This is informational only; the loop below is driven by the stream.
    chunks = length // block_length
    if length % block_length != 0:
        chunks += 1

    # Split the speech into blocks of `block_length` frames rather than
    # loading the full audio file. Frames do not overlap (hop == frame).
    stream = librosa.stream(
        resampled_path,
        block_length=block_length,
        frame_length=16000,
        hop_length=16000,
    )
    print('Every chunk is ', block_length, 'sec. long')
    print("Number of chunks", int(chunks))

    pieces = []
    # Tracked separately from the loop variable so the summary below is
    # still valid when the stream yields nothing.
    processed = 0
    for n, speech in enumerate(stream):
        print("Transcribing the chunk number " + str(n + 1))
        separator = '\n' if n % 2 == 0 else ' '
        pieces.append(generate_transcription(speech, processor, model) + separator)
        processed = n + 1

    print("Encoding complete. Total number of chunks: " + str(processed) + "\n")
    return "".join(pieces)
# Speech to text function
def generate_transcription(speech, processor, model, sampling_rate=None):
    """Transcribe one block of audio samples and return lower-cased text.

    Args:
        speech: Array of audio samples. A one-dimensional array is used as
            is; an array with more dimensions is reduced by adding columns
            0 and 1 together (NOTE(review): this presumes a
            (samples, channels) stereo layout -- confirm against callers).
        processor: Callable that converts samples into model inputs (its
            result must expose ``input_values``) and that provides
            ``decode`` for turning predicted ids back into text.
        model: Callable whose result exposes ``logits``.
        sampling_rate: Sampling rate forwarded to ``processor``. When left
            as ``None`` the module-level ``sr`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        The decoded transcription, converted to lower case.
    """
    if sampling_rate is None:
        # Backward-compatible default: fall back to the module-level value.
        sampling_rate = sr

    if len(speech.shape) > 1:
        speech = speech[:, 0] + speech[:, 1]

    input_values = processor(
        speech, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_values

    # Inference only: skip autograd bookkeeping so no graph is kept alive
    # for every block that gets transcribed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring id at every position along the last axis.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment