@aksh-at
Created February 23, 2024 18:29
Insanely fast whisper on Modal
import base64
import tempfile
from typing import Optional

from pydantic import BaseModel

from modal import Image, Secret, Stub, build, enter, gpu, web_endpoint

whisper_image = (
    Image.micromamba()
    .apt_install("ffmpeg", "ninja-build", "git")
    .micromamba_install(
        "cudatoolkit=11.8",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .pip_install(
        "torch==2.0.1",
        "tqdm==4.66.1",
        "more-itertools==10.1.0",
        "transformers==4.37.2",
        "ffmpeg-python==0.2.0",
        "openai-whisper==20231106",
        "optimum==1.14.0",
        "pyannote-audio==3.1.0",
        "rich==13.7.0",
    )
    .pip_install("packaging")
    .run_commands("pip install flash-attn==2.5.2 --no-build-isolation")
    .run_commands(
        "git clone https://github.com/Vaibhavs10/insanely-fast-whisper.git",
        # Pin the commit version.
        "cd insanely-fast-whisper && git checkout ff0df400f4aed859375c2507ebcc21fe5f9b99e0",
    )
)

# Named to have the label prefix "modal-labs--instant".
stub = Stub("instant-whisper")

with whisper_image.imports():
    import sys

    import torch

    # The folders are missing `__init__.py` files, so we need to add them to the path.
    sys.path.append("/insanely-fast-whisper/src/insanely_fast_whisper/utils")

    from diarize import (
        diarize_audio as diarize_audio_func,
        post_process_segments_and_transcripts,
        preprocess_inputs,
    )
    from pyannote.audio import Pipeline
    from transformers import (
        WhisperFeatureExtractor,
        WhisperForConditionalGeneration,
        WhisperTokenizerFast,
        pipeline,
    )


class TranscriptionRequest(BaseModel):
    audio: str
    language: Optional[str] = None
    diarize_audio: bool = False
    batch_size: int = 24


@stub.cls(
    gpu=gpu.A10G(),
    # To avoid excessive cold starts, we set the idle timeout to two minutes.
    container_idle_timeout=120,
    keep_warm=1,
    image=whisper_image,
    # TODO: reconcile Hugging Face secrets.
    secrets=[Secret.from_name("huggingface-secret-2")],
)
class Model:
    @build()
    @enter()
    def setup(self):
        model_id = "openai/whisper-large-v3"
        torch_dtype = torch.float16
        self.device = "cuda:0"

        model = WhisperForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
        ).to(self.device)
        tokenizer = WhisperTokenizerFast.from_pretrained(model_id)
        feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor,
            model_kwargs={"use_flash_attention_2": True},
            torch_dtype=torch_dtype,
            device=self.device,
        )

        self.diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.diarization_pipeline.to(torch.device(self.device))

    @web_endpoint(method="POST", label="instant-whisper")
    def transcribe(self, request: TranscriptionRequest):
        """Transcribes a single audio file, with optional speaker diarization."""
        with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio:
            audio_data = base64.b64decode(request.audio.split(",")[1])
            temp_audio.write(audio_data)

            outputs = self.pipe(
                temp_audio.name,
                chunk_length_s=30,
                batch_size=request.batch_size,
                generate_kwargs={
                    "task": "transcribe",
                    "language": None if request.language == "" else request.language,
                },
                return_timestamps="word",
            )

            if not request.diarize_audio:
                return outputs

            inputs, diarizer_inputs = preprocess_inputs(inputs=temp_audio.name)
            segments = diarize_audio_func(diarizer_inputs, self.diarization_pipeline)
            segmented_transcript = post_process_segments_and_transcripts(
                segments, outputs["chunks"], group_by_speaker=False
            )
            outputs["chunks"] = segmented_transcript
            return outputs
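
For reference, a minimal client sketch for calling the deployed endpoint (the URL is illustrative; use the one printed by `modal deploy`). Note the handler splits `request.audio` on `","`, so the audio is expected as a base64 data URL:

    # Hypothetical client: POST a base64-encoded WAV to the deployed endpoint.
    import base64

    import requests  # assumed available client-side

    with open("sample.wav", "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode()

    resp = requests.post(
        "https://modal-labs--instant-whisper.modal.run",  # illustrative URL
        json={
            # Sent as a data URL because the server decodes the part after ",".
            "audio": f"data:audio/wav;base64,{audio_b64}",
            "language": "en",
            "diarize_audio": False,
        },
    )
    resp.raise_for_status()
    print(resp.json()["text"])
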
@aksh-at (Author) commented Mar 7, 2024
Thanks @jflam for the catch and investigation! Did not realize FA2 was not enabled, my bad.

It is indeed quite annoying if it changes behavior once serialized. We'll look into it ourselves to see if it's fixable.
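
For reference, a sketch of what enabling it at load time could look like (untested here; `attn_implementation` is supported in transformers >= 4.36, and `model_kwargs` passed to `pipeline()` are not applied when the model is already instantiated):

    # Hedged sketch: request flash attention 2 when loading the model,
    # instead of via pipeline(model_kwargs=...), which is ignored for
    # an already-instantiated model.
    model = WhisperForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        attn_implementation="flash_attention_2",
    ).to(self.device)
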

@Evand3r commented Apr 22, 2024

The code doesn't work anymore; there's an error on line 89 with .to(self.device):
AttributeError("'NoneType' object has no attribute 'to'")

@michaelevensen commented Apr 24, 2024

> The code doesn't work anymore; there's an error on line 89 with .to(self.device): AttributeError("'NoneType' object has no attribute 'to'")

Encountered the same issue; you need to:

    self.diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=Secret.from_name("your-hugging-face-secret")
    )
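
(Note: inside a running Modal container the secret is exposed as environment variables rather than as a `Secret` object, so `use_auth_token` needs the token string itself. A sketch, assuming the secret defines an `HF_TOKEN` variable:)

    import os

    self.diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ["HF_TOKEN"],  # assumed env var name set by the Modal secret
    )
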
