Speech to text
import librosa
import torch

# Target sampling rate expected by Wav2Vec2 models; the audio at resampled_path
# is assumed to have been resampled to 16 kHz already
sr = 16000

# Transcription function: streams the audio file chunk by chunk and
# concatenates the transcription of each chunk
def asr_transcript(processor, model, resampled_path, length, block_length):
    chunks = length // block_length
    if length % block_length != 0:
        chunks += 1
    transcript = ""
    # Stream the speech in chunks of block_length seconds rather than loading the full audio file
    stream = librosa.stream(resampled_path, block_length=block_length, frame_length=16000, hop_length=16000)
    print("Every chunk is", block_length, "sec. long")
    print("Number of chunks", int(chunks))
    for n, speech in enumerate(stream):
        print("Transcribing chunk number " + str(n + 1))
        separator = ' '
        if n % 2 == 0:
            separator = '\n'
        transcript += generate_transcription(speech, processor, model) + separator
    print("Transcription complete. Total number of chunks: " + str(n + 1) + "\n")
    return transcript
# Speech to text function: runs a single chunk through the Wav2Vec2 model
def generate_transcription(speech, processor, model):
    # Mix stereo down to a single channel if needed
    if len(speech.shape) > 1:
        speech = speech[:, 0] + speech[:, 1]
    input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decoding: pick the most likely token at each time step, then decode to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()
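
For context, here is a minimal usage sketch. It assumes the transformers library is installed and uses a pretrained Wav2Vec2 checkpoint as an example; the model name, the audio path, and the way the duration is computed are placeholders, not part of the original gist, and the audio is assumed to be already resampled to 16 kHz.

# Minimal usage sketch (assumptions: transformers is installed, the checkpoint
# name and audio path below are placeholders, audio is already at 16 kHz)
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

resampled_path = "audio_16khz.wav"                             # placeholder path
length = int(librosa.get_duration(filename=resampled_path))    # total length in seconds
block_length = 30                                              # transcribe 30-second chunks

print(asr_transcript(processor, model, resampled_path, length, block_length))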