@juanmc2005
Last active May 9, 2024 14:47
Code for my tutorial "Color Your Captions: Streamlining Live Transcriptions with Diart and OpenAI's Whisper". Available at https://medium.com/@juanmc2005/color-your-captions-streamlining-live-transcriptions-with-diart-and-openais-whisper-6203350234ef
import logging
import os
import sys
import traceback
from contextlib import contextmanager
import diart.operators as dops
import numpy as np
import rich
import rx.operators as ops
import whisper_timestamped as whisper
from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.sources import MicrophoneAudioSource
from pyannote.core import Annotation, SlidingWindowFeature, SlidingWindow, Segment
def concat(chunks, collar=0.05):
    """
    Concatenate predictions and audio
    given a list of `(diarization, waveform)` pairs
    and merge contiguous single-speaker regions
    with pauses shorter than `collar` seconds.
    """
    first_annotation = chunks[0][0]
    first_waveform = chunks[0][1]
    annotation = Annotation(uri=first_annotation.uri)
    data = []
    for ann, wav in chunks:
        annotation.update(ann)
        data.append(wav.data)
    annotation = annotation.support(collar)
    window = SlidingWindow(
        first_waveform.sliding_window.duration,
        first_waveform.sliding_window.step,
        first_waveform.sliding_window.start,
    )
    data = np.concatenate(data, axis=0)
    return annotation, SlidingWindowFeature(data, window)
def colorize_transcription(transcription):
    """
    Unify a speaker-aware transcription represented as
    a list of `(speaker: int, text: str)` pairs
    into a single text colored by speakers.
    """
    colors = 2 * [
        "bright_red", "bright_blue", "bright_green", "orange3", "deep_pink1",
        "yellow2", "magenta", "cyan", "bright_magenta", "dodger_blue2"
    ]
    result = []
    for speaker, text in transcription:
        if speaker == -1:
            # No speaker found for this text, use default terminal color
            result.append(text)
        else:
            result.append(f"[{colors[speaker]}]{text}")
    return "\n".join(result)
@contextmanager
def suppress_stdout():
    # Auxiliary function to suppress Whisper logs (it is quite verbose)
    # All credit goes to: https://thesmithfam.org/blog/2012/10/25/temporarily-suppress-console-output-in-python/
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
class WhisperTranscriber:
    def __init__(self, model="small", device=None):
        self.model = whisper.load_model(model, device=device)
        self._buffer = ""

    def transcribe(self, waveform):
        """Transcribe audio using Whisper"""
        # Pad/trim audio to fit 30 seconds as required by Whisper
        audio = waveform.data.astype("float32").reshape(-1)
        audio = whisper.pad_or_trim(audio)
        # Transcribe the given audio while suppressing logs
        with suppress_stdout():
            transcription = whisper.transcribe(
                self.model,
                audio,
                # We use past transcriptions to condition the model
                initial_prompt=self._buffer,
                verbose=True  # to avoid progress bar
            )
        return transcription

    def identify_speakers(self, transcription, diarization, time_shift):
        """Iterate over transcription segments to assign speakers"""
        speaker_captions = []
        for segment in transcription["segments"]:
            # Crop diarization to the segment timestamps
            start = time_shift + segment["words"][0]["start"]
            end = time_shift + segment["words"][-1]["end"]
            dia = diarization.crop(Segment(start, end))

            # Assign a speaker to the segment based on diarization
            speakers = dia.labels()
            num_speakers = len(speakers)
            if num_speakers == 0:
                # No speakers were detected
                caption = (-1, segment["text"])
            elif num_speakers == 1:
                # Only one speaker is active in this segment
                spk_id = int(speakers[0].split("speaker")[1])
                caption = (spk_id, segment["text"])
            else:
                # Multiple speakers, select the one that speaks the most
                max_speaker = int(np.argmax([
                    dia.label_duration(spk) for spk in speakers
                ]))
                caption = (max_speaker, segment["text"])
            speaker_captions.append(caption)

        return speaker_captions

    def __call__(self, diarization, waveform):
        # Step 1: Transcribe
        transcription = self.transcribe(waveform)
        # Update transcription buffer
        self._buffer += transcription["text"]
        # The audio may not be the beginning of the conversation
        time_shift = waveform.sliding_window.start
        # Step 2: Assign speakers
        speaker_transcriptions = self.identify_speakers(transcription, diarization, time_shift)
        return speaker_transcriptions
# Suppress whisper-timestamped warnings for a clean output
logging.getLogger("whisper_timestamped").setLevel(logging.ERROR)

# If you have a GPU, you can also set device=torch.device("cuda")
config = SpeakerDiarizationConfig(
    duration=5,
    step=0.5,
    latency="min",
    tau_active=0.5,
    rho_update=0.1,
    delta_new=0.57
)
dia = SpeakerDiarization(config)
source = MicrophoneAudioSource(config.sample_rate)

# If you have a GPU, you can also set device="cuda"
asr = WhisperTranscriber(model="small")

# Split the stream into 2s chunks for transcription
transcription_duration = 2
# Apply models in batches for better efficiency
batch_size = int(transcription_duration // config.step)

# Chain of operations to apply on the stream of microphone audio
source.stream.pipe(
    # Format audio stream to sliding windows of 5s with a step of 500ms
    dops.rearrange_audio_stream(
        config.duration, config.step, config.sample_rate
    ),
    # Wait until a batch is full
    # The output is a list of audio chunks
    ops.buffer_with_count(count=batch_size),
    # Obtain diarization prediction
    # The output is a list of pairs `(diarization, audio chunk)`
    ops.map(dia),
    # Concatenate 500ms predictions/chunks to form a single 2s chunk
    ops.map(concat),
    # Ignore this chunk if it does not contain speech
    ops.filter(lambda ann_wav: ann_wav[0].get_timeline().duration() > 0),
    # Obtain speaker-aware transcriptions
    # The output is a list of pairs `(speaker: int, caption: str)`
    ops.starmap(asr),
    # Color transcriptions according to the speaker
    # The output is plain text with color references for rich
    ops.map(colorize_transcription),
).subscribe(
    on_next=rich.print,  # print colored text
    on_error=lambda _: traceback.print_exc()  # print stacktrace if error
)

print("Listening...")
source.read()
@fahnub

fahnub commented Jul 20, 2023

@juanmc2005 thanks for the response. I am using a wireless HyperX Cloud Flight S headphone. It runs on GPU. I will check all the other things you mentioned and then get back.

@etienno83

Hello Juan,

First, thank you a lot for your work, it's amazing.

Second, I'm having an issue when importing your project. I am on Windows 10, and when I launched it for the first time, I got this error:
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
Traceback (most recent call last):
  File "C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchaudio\_extension.py", line 71, in _init_ffmpeg
    _load_lib("libtorchaudio_ffmpeg")
  File "C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchaudio\_extension.py", line 52, in _load_lib
    torch.ops.load_library(path)
  File "C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\_ops.py", line 573, in load_library
    ctypes.CDLL(path)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\ctypes\__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
FileNotFoundError: Could not find module 'C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchaudio\lib\libtorchaudio_ffmpeg.pyd' (or one of its dependencies). Try using the full path with constructor syntax.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\etien\PycharmProjects\jarvis2\main.py", line 12, in <module>
    from diart.sources import MicrophoneAudioSource
  File "C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\diart\sources.py", line 11, in <module>
    from torchaudio.io import StreamReader
  File "C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchaudio\io\__init__.py", line 21, in __getattr__
    torchaudio._extension._init_ffmpeg()
  File "C:\Users\etien\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchaudio\_extension.py", line 73, in _init_ffmpeg
    raise ImportError("FFmpeg libraries are not found. Please install FFmpeg.") from err
ImportError: FFmpeg libraries are not found. Please install FFmpeg.

Process finished with exit code 1

Of course I have checked, and yes, I have installed ffmpeg; I even downloaded the ffmpeg.org tar and added it to my sys path. Maybe it is an issue with the ctypes winmode (https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python) but... I'm afraid my knowledge doesn't go that far.

Before I keep digging and digging, have you encountered this issue before? If yes, can you please help me? If no, do you have any idea how I should handle this?

Thank you very much!

@ngtrphuong

ngtrphuong commented Oct 17, 2023

(Quoting @etienno83's FFmpeg ImportError report above.)

Please use the develop branch of diart and install the pip libraries from there (torch 2.x, torchaudio 3.x, which are compatible with ffmpeg 6.x).
After that, everything will work like a champ.
I tried it yesterday and everything works (even on Windows; it reports diart v0.7.0, but who cares).
You can follow the pip installation for a git branch via this SO answer: https://stackoverflow.com/a/20101940/6124251

Another side note: please change `config = PipelineConfig()` to `config = SpeakerDiarizationConfig()`, and likewise `OnlineSpeakerDiarization()` to `SpeakerDiarization()`.

@nfsedaghat

Hi,
Thanks for sharing your code.
Just wondering how I can give it a live stream video and get the transcription?

@lodi2001

lodi2001 commented Nov 4, 2023

Hello,

I encountered an issue while running a script that utilizes OnlineSpeakerDiarization from the diart library on a Windows system. Initially, I received a notification that the torchaudio backend was switched to 'soundfile', with a note that 'sox_io' is not supported on Windows.

While this is expected behavior on Windows as 'sox_io' backend is not available, I encountered an OSError when I tried to instantiate OnlineSpeakerDiarization with a breakpoint at line 160:

dia = OnlineSpeakerDiarization(config)

Exception has occurred: OSError
[WinError 6] The handle is invalid
  File "C:\dev\voicetotext\voictotext\diart_whisper.py", line 160, in <module>


I'm curious to know if there are specific requirements or configurations needed for the 'soundfile' backend to work properly on Windows, particularly if there's a specific directory where the .wav file should be placed, or if there are any known workarounds for this issue.

Any assistance or insights you can provide would be greatly appreciated.

Thank you!

@etienno83

(Quoting the FFmpeg ImportError report and @ngtrphuong's reply above.)

Hello again,

Everything is working properly, thank you for your help!

Quick question: is it possible to make it work in other languages, like French?

@libowen424

libowen424 commented Nov 23, 2023

Hello Juan, thank you a lot for your work!
I didn't find any code for OnlineSpeakerDiarization in diart.
from diart import OnlineSpeakerDiarization
Can you explain where it is?

@juanmc2005
Author

Hi @libowen424, since I wrote the article, these classes have been renamed:

OnlineSpeakerDiarization -> SpeakerDiarization
PipelineConfig -> SpeakerDiarizationConfig

I just updated the gist accordingly.
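
For anyone updating code from the article, a minimal sketch of what the renamed imports and setup look like (this just mirrors the current gist above; it is not a new API):

from diart import SpeakerDiarization, SpeakerDiarizationConfig

# Old names were PipelineConfig / OnlineSpeakerDiarization
config = SpeakerDiarizationConfig(duration=5, step=0.5, latency="min")
pipeline = SpeakerDiarization(config)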

@juanmc2005
Author

@etienno83

Quick question: is it possible to make it work in other languages, like French?

Yes, you can pass the language code with language="fr" in the call to whisper.transcribe on line 86.
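
For example, a rough sketch of that change inside WhisperTranscriber.transcribe (only the language argument is added; everything else is unchanged from the gist):

with suppress_stdout():
    transcription = whisper.transcribe(
        self.model,
        audio,
        language="fr",  # transcribe in French instead of auto-detecting the language
        initial_prompt=self._buffer,
        verbose=True
    )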

@jber18

jber18 commented Nov 30, 2023

Hi @juanmc2005, I followed all your steps, but whenever I run the script it's just stuck on Listening...
Here is the output from the console.

(diart) PS C:\Users\User\Desktop\whisper> python main.py
C:\Users\User\miniconda3\envs\diart\lib\site-packages\pyannote\audio\core\io.py:43: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call.
torchaudio.set_audio_backend("soundfile")
C:\Users\User\miniconda3\envs\diart\lib\site-packages\torch_audiomentations\utils\io.py:27: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call.
torchaudio.set_audio_backend("soundfile")
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\2ffce0501d0aecad81b43a06d538186e292d0070\pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.1+cpu. Bad things might happen unless you revert torch to 1.x.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.1+cpu. Bad things might happen unless you revert torch to 1.x.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\User\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.1+cpu. Bad things might happen unless you revert torch to 1.x.
Listening...

@juanmc2005
Author

Hi @jber18, apparently there's an issue here that was introduced with v0.8 (see this issue). For now, I suggest you downgrade to v0.7 where it seems to be working normally.

I'm still investigating this

@jber18

jber18 commented Nov 30, 2023

Hi @juanmc2005, this actually works with that version of diart, thank you for the reply. But I guess my PC can't handle the task, so it takes time to process the transcription. 😁

@SharhadBashar

SharhadBashar commented Dec 3, 2023

Hi @juanmc2005
I am trying to run this.
Just running diart from this repo https://github.com/juanmc2005/diart works perfectly.
But when I try to run the above code, it starts and is stuck on Listening...
Nothing else is printed.
Any idea what I'm doing wrong?
I am on an M1 Mac using Python 3.9.16

Heres my console output:

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/2ffce0501d0aecad81b43a06d538186e292d0070/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1. Bad things might happen unless you revert torch to 1.x.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.0.1. Bad things might happen unless you revert torch to 1.x.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.0.1. Bad things might happen unless you revert torch to 1.x.
Listening...

When I try to install diart v0.7.0, I get this error:

pip install diart==0.7.0
Collecting diart==0.7.0
  Using cached diart-0.7.0-py3-none-any.whl (50 kB)
Requirement already satisfied: numpy>=1.20.2 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (1.24.2)
Requirement already satisfied: matplotlib>=3.3.3 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (3.8.2)
Requirement already satisfied: rx>=3.2.0 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (3.2.0)
Requirement already satisfied: scipy>=1.6.0 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (1.10.1)
Requirement already satisfied: sounddevice>=0.4.2 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (0.4.6)
Requirement already satisfied: einops>=0.3.0 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (0.7.0)
Requirement already satisfied: tqdm>=4.64.0 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (4.65.0)
Requirement already satisfied: pandas>=1.4.2 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (2.0.1)
Requirement already satisfied: torch>=1.12.1 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (2.1.1)
Requirement already satisfied: torchvision>=0.14.0 in /opt/homebrew/lib/python3.11/site-packages (from diart==0.7.0) (0.16.1)
INFO: pip is looking at multiple versions of diart to determine which version is compatible with other requirements. This could take a while.
ERROR: Could not find a version that satisfies the requirement torchaudio<1.0,>=0.12.1 (from diart) (from versions: 2.0.0, 2.0.1, 2.0.2, 2.1.0, 2.1.1)
ERROR: No matching distribution found for torchaudio<1.0,>=0.12.1

@nzhl

nzhl commented Dec 8, 2023

@SharhadBashar same issue, have you fixed it?

@juanmc2005
Author

I tried it out using diart 0.9, both from the mic and from an audio file, with and without GPU. Each time I was able to see colored transcriptions. However, what may be happening is that the chunk processing is too slow (due to hardware) and hence interrupts the recording of the microphone (although it should be asynchronous with MicrophoneAudioSource).

If you can get real-time diarization with only diart (quick test: run diart.stream microphone and see what you get), then what I suggest is that you change line 151 to source = WebSocketAudioSource(config.sample_rate) and run the script; then, from another terminal, run diart.client microphone --host 127.0.0.1 --port 7007 --sample-rate 16000 --step 0.5.

This is basically reading from the microphone and sending chunks to the pipeline through a websocket server, then you should see the colored captions on the pipeline script's output.
This will guarantee that the mic streaming and the pipeline run in different processes, avoiding the interference problem that I mentioned.
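
For reference, a minimal sketch of that change (the constructor call is exactly the one suggested above; the host and port are assumed to match the diart.client defaults):

from diart.sources import WebSocketAudioSource

# Receive audio over a websocket instead of reading the microphone directly.
# Stream the mic from another terminal with:
#   diart.client microphone --host 127.0.0.1 --port 7007 --sample-rate 16000 --step 0.5
source = WebSocketAudioSource(config.sample_rate)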

Let me know if that works out!

@juanmc2005
Author

@SharhadBashar concerning the v0.7 issue, consider installing torch==1.13.1 before diart, that way you get a torchaudio version that matches the requirement. After that, you can install a more recent torch/torchaudio version and everything should work correctly.

@jber18

jber18 commented Dec 12, 2023

I made a little modification to this script: instead of transcribing on my local machine, I send an HTTP request to the OpenAI API. However, I can't figure out how to get the speakers.

@swm35

swm35 commented Dec 12, 2023

@juanmc2005's suggested change to use WebSocketAudioSource instead of MicrophoneAudioSource, with a separate terminal running diart.client ..., works for me on my laptop (Ubuntu 22.04 with a built-in 4GB GPU), thank you for resolving that issue! This solution works for me on both diart v0.9 and v0.7. Incidentally, I get substantially better transcriptions by increasing transcription_duration from 2 to 5.

@zwanderer0

zwanderer0 commented Dec 15, 2023

I am on a Mac M1 Pro

  1. Does this code do realtime transcription with speaker labels?

  2. The gist is stuck at "Listening..."

python3 testcopy.py
objc[23911]: Class AVFFrameReceiver is implemented in both /Users/zwanderer/miniconda3/lib/python3.11/site-packages/av/.dylibs/libavdevice.59.7.100.dylib (0x162ae4778) and /Users/zwanderer/miniconda3/lib/libavdevice.59.7.100.dylib (0x1691dc778). One of the two will be used. Which one is undefined.
objc[23911]: Class AVFAudioReceiver is implemented in both /Users/zwanderer/miniconda3/lib/python3.11/site-packages/av/.dylibs/libavdevice.59.7.100.dylib (0x162ae47c8) and /Users/zwanderer/miniconda3/lib/libavdevice.59.7.100.dylib (0x1691dc7c8). One of the two will be used. Which one is undefined.
/Users/zwanderer/miniconda3/lib/python3.11/site-packages/pyannote/audio/core/io.py:43: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call.
torchaudio.set_audio_backend("soundfile")
/Users/zwanderer/miniconda3/lib/python3.11/site-packages/torch_audiomentations/utils/io.py:27: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call.
torchaudio.set_audio_backend("soundfile")
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.7. To apply the upgrade to your files permanently, run python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/2ffce0501d0aecad81b43a06d538186e292d0070/pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.2. Bad things might happen unless you revert torch to 1.x.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.0.7. To apply the upgrade to your files permanently, run python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.2. Bad things might happen unless you revert torch to 1.x.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.0.7. To apply the upgrade to your files permanently, run python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.2. Bad things might happen unless you revert torch to 1.x.
Listening...

@znabF

znabF commented Dec 31, 2023

Hi,

I am encountering this error:
LocalTokenNotFoundError Traceback (most recent call last)
Cell In[2], line 161
159 config = SpeakerDiarizationConfig(duration = 5, step = 0.5, latency = "min", tau_active = 0.5, rho_update = 0.1, delta_new = 0.57)
160 # making stream system using configuration settings
--> 161 dia = SpeakerDiarization(config)
162 source = MicrophoneAudioSource(config.sample_rate)
164 transcriber = WhisperTranscriber(model="small")

File ~\anaconda3\Lib\site-packages\diart\blocks\diarization.py:96, in SpeakerDiarization.init(self, config)
93 msg = f"Latency should be in the range [{self._config.step}, {self._config.duration}]"
94 assert self._config.step <= self._config.latency <= self._config.duration, msg
---> 96 self.segmentation = SpeakerSegmentation(
97 self._config.segmentation, self._config.device
98 )
99 self.embedding = OverlapAwareSpeakerEmbedding(
100 self._config.embedding,
101 self._config.gamma,
(...)
105 device=self._config.device,
106 )
107 self.pred_aggregation = DelayedAggregation(
108 self._config.step,
109 self._config.latency,
110 strategy="hamming",
111 cropping_mode="loose",
112 )

File ~\anaconda3\Lib\site-packages\diart\blocks\segmentation.py:13, in SpeakerSegmentation.init(self, model, device)
11 def init(self, model: SegmentationModel, device: Optional[torch.device] = None):
12 self.model = model
---> 13 self.model.eval()
14 self.device = device
15 if self.device is None:

File ~\anaconda3\Lib\site-packages\diart\models.py:136, in LazyModel.eval(self)
135 def eval(self) -> LazyModel:
--> 136 self.load()
137 if isinstance(self.model, nn.Module):
138 self.model.eval()

File ~\anaconda3\Lib\site-packages\diart\models.py:124, in LazyModel.load(self)
122 def load(self):
123 if not self.is_in_memory():
--> 124 self.model = self.get_model()

File ~\anaconda3\Lib\site-packages\diart\models.py:50, in PyannoteLoader.call(self)
48 def call(self) -> Callable:
49 try:
---> 50 model = Model.from_pretrained(self.model_info, use_auth_token=self.hf_token)
51 specs = getattr(model, "specifications", None)
52 if specs is not None and specs.powerset:

File ~\anaconda3\Lib\site-packages\pyannote\audio\core\model.py:624, in Model.from_pretrained(cls, checkpoint, map_location, hparams_file, strict, use_auth_token, cache_dir, **kwargs)
621 revision = None
623 try:
--> 624 path_for_pl = hf_hub_download(
625 model_id,
626 HF_PYTORCH_WEIGHTS_NAME,
627 repo_type="model",
628 revision=revision,
629 library_name="pyannote",
630 library_version=version,
631 cache_dir=cache_dir,
632 # force_download=False,
633 # proxies=None,
634 # etag_timeout=10,
635 # resume_download=False,
636 use_auth_token=use_auth_token,
637 # local_files_only=False,
638 # legacy_cache_layout=False,
639 )
640 except RepositoryNotFoundError:
641 print(
642 f"""
643 Could not download '{model_id}' model.
(...)
652 visit https://hf.co/{model_id} to accept the user conditions."""
653 )

File ~\anaconda3\Lib\site-packages\huggingface_hub\utils_validators.py:118, in validate_hf_hub_args.._inner_fn(*args, **kwargs)
115 if check_use_auth_token:
116 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.name, has_token=has_token, kwargs=kwargs)
--> 118 return fn(*args, **kwargs)

File ~\anaconda3\Lib\site-packages\huggingface_hub\file_download.py:1223, in hf_hub_download(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, local_dir_use_symlinks, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout, endpoint)
1219 return pointer_path
1221 url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
-> 1223 headers = build_hf_headers(
1224 token=token,
1225 library_name=library_name,
1226 library_version=library_version,
1227 user_agent=user_agent,
1228 )
1230 url_to_download = url
1231 etag = None

File ~\anaconda3\Lib\site-packages\huggingface_hub\utils_validators.py:118, in validate_hf_hub_args.._inner_fn(*args, **kwargs)
115 if check_use_auth_token:
116 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.name, has_token=has_token, kwargs=kwargs)
--> 118 return fn(*args, **kwargs)

File ~\anaconda3\Lib\site-packages\huggingface_hub\utils_headers.py:121, in build_hf_headers(token, is_write_action, library_name, library_version, user_agent)
48 """
49 Build headers dictionary to send in a HF Hub call.
50
(...)
118 If token=True but token is not saved locally.
119 """
120 # Get auth token to send
--> 121 token_to_send = get_token_to_send(token)
122 _validate_token_to_send(token_to_send, is_write_action=is_write_action)
124 # Combine headers

File ~\anaconda3\Lib\site-packages\huggingface_hub\utils_headers.py:153, in get_token_to_send(token)
151 if token is True:
152 if cached_token is None:
--> 153 raise LocalTokenNotFoundError(
154 "Token is required (token=True), but no token found. You"
155 " need to provide a token or be logged in to Hugging Face with"
156 " huggingface-cli login or huggingface_hub.login. See"
157 " https://huggingface.co/settings/tokens."
158 )
159 return cached_token
161 # Case implicit use of the token is forbidden by env variable

LocalTokenNotFoundError: Token is required (token=True), but no token found. You need to provide a token or be logged in to Hugging Face with huggingface-cli login or huggingface_hub.login. See https://huggingface.co/settings/tokens.

@sachanayush47

Instead of using a microphone, I want to pass an audio file path for diarization. How can I do this?

@juanmc2005
Author

@znabF

znabF commented Jan 7, 2024

Hi,

I am encountering this error:
OSError: [WinError 182] The operating system cannot run %1. Error loading "C:\Users\coolz\anaconda3\lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

Please help me navigate a solution for this error @juanmc2005.

@ngtrphuong

(Quoting @jber18's comment above about sending audio to the OpenAI API instead of transcribing locally.)

Can you share the code snippet of what you modified to work with the OpenAI Whisper API?
My PC configuration is quite slow and does not have a GPU, so this alternative solution is quite promising...

@eshenayo

eshenayo commented Feb 5, 2024

I am encountering an error with the hf_token:

Traceback (most recent call last):
  File "/usr/app/src/./diart_whisper.py", line 206, in <module>
    dia = SpeakerDiarization(config)
  File "/usr/local/lib/python3.10/dist-packages/diart/blocks/diarization.py", line 96, in __init__
    self.segmentation = SpeakerSegmentation(
  File "/usr/local/lib/python3.10/dist-packages/diart/blocks/segmentation.py", line 13, in __init__
    self.model.eval()
  File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 136, in eval
    self.load()
  File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 124, in load
    self.model = self.get_model()
  File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 50, in __call__
    model = Model.from_pretrained(self.model_info, use_auth_token=self.hf_token)
  File "/usr/local/lib/python3.10/dist-packages/pyannote/audio/core/model.py", line 624, in from_pretrained
    path_for_pl = hf_hub_download(
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 1223, in hf_hub_download
    headers = build_hf_headers(
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py", line 121, in build_hf_headers
    token_to_send = get_token_to_send(token)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py", line 153, in get_token_to_send
    raise LocalTokenNotFoundError(
huggingface_hub.utils._headers.LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

I don't see a way to pass it as a parameter in the gist code, since SpeakerDiarizationConfig and SpeakerDiarization don't have a param for the token.

@eshenayo

eshenayo commented Feb 5, 2024

I added code to the script to log into Hugging Face:

from huggingface_hub import login
login(token=r4a_config['hugging_face_key']) # use key parsed from json config file

You have to provide your contact info:

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful

Could not download 'pyannote/segmentation' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Model.from_pretrained('pyannote/segmentation',
   ...                       use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/segmentation to accept the user conditions.

Could not download 'pyannote/segmentation' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Model.from_pretrained('pyannote/segmentation',
   ...                       use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/segmentation to accept the user conditions.
Traceback (most recent call last):
  File "/usr/app/src/./diart_whisper.py", line 210, in <module>
    dia = SpeakerDiarization(config)
  File "/usr/local/lib/python3.10/dist-packages/diart/blocks/diarization.py", line 96, in __init__
    self.segmentation = SpeakerSegmentation(
  File "/usr/local/lib/python3.10/dist-packages/diart/blocks/segmentation.py", line 17, in __init__
    self.model.to(self.device)
  File "/usr/local/lib/python3.10/dist-packages/diart/models.py", line 128, in to
    self.model = self.model.to(device)
AttributeError: 'NoneType' object has no attribute 'to'


And the same for the embedding model:

Could not download 'pyannote/embedding' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Model.from_pretrained('pyannote/embedding',
   ...                       use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/embedding to accept the user conditions.

Could not download 'pyannote/embedding' model.
It might be because the model is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Model.from_pretrained('pyannote/embedding',
   ...                       use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the model is gated:
visit https://hf.co/pyannote/embedding to accept the user conditions.

Then the script works.

Now to debug why there is no transcript, only the listening prompt.

(Quoting the LocalTokenNotFoundError report above.)

@kangmin5133

(Quoting @jber18's report above about the script getting stuck on Listening...)

Same here, I'm working on an M2 Pro.

@juanmc2005
Author

Hi @eshenayo @kangmin5133

Concerning the issue with the Hugging Face token, detailed instructions to include the token automatically can be found in the README. Otherwise it can be passed when loading the models with SegmentationModel.from_pretrained("pyannote/segmentation", hf_token=...) (same for EmbeddingModel).

Concerning the script getting stuck on "listening", I suggest you try to debug line by line to see where it hangs (if it even does so). As I said in other discussions, this could come from different places. You can check out this issue for context and my previous answers.
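
For instance, a rough sketch of that approach (call signatures as given above; adjust to your installed diart version):

from diart import SpeakerDiarization, SpeakerDiarizationConfig
from diart.models import SegmentationModel, EmbeddingModel

# Load the gated pyannote models with an explicit Hugging Face token
segmentation = SegmentationModel.from_pretrained("pyannote/segmentation", hf_token="YOUR_HF_TOKEN")
embedding = EmbeddingModel.from_pretrained("pyannote/embedding", hf_token="YOUR_HF_TOKEN")

config = SpeakerDiarizationConfig(
    segmentation=segmentation,
    embedding=embedding,
    duration=5,
    step=0.5,
    latency="min",
)
dia = SpeakerDiarization(config)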

@Yuxinren

Yuxinren commented Mar 5, 2024

Hi @juanmc2005
I got these errors when running it.


Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.
Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.
OMP: Error #15: Initializing libiomp5.dylib, but found libiomp5.dylib already initialized.
OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://www.intel.com/software/products/support/.

@alantypoon

I am on a Mac M1 Pro

  1. Does this code do realtime transcription with speaker labels?
  2. The gist is stuck at "Listening..."

I can fix the listening halt on Ubuntu and macOS with an M2 Ultra by just changing the following line (line 151):
source = MicrophoneAudioSource(config.sample_rate)
to this:
source = MicrophoneAudioSource(config.step)
