@aksh-at
Created February 23, 2024 18:29
Insanely fast whisper on Modal
import base64
import tempfile
from typing import Optional

from pydantic import BaseModel

from modal import Image, Secret, Stub, build, enter, gpu, web_endpoint

whisper_image = (
    Image.micromamba()
    .apt_install("ffmpeg", "ninja-build", "git")
    .micromamba_install(
        "cudatoolkit=11.8",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .pip_install(
        "torch==2.0.1",
        "tqdm==4.66.1",
        "more-itertools==10.1.0",
        "transformers==4.37.2",
        "ffmpeg-python==0.2.0",
        "openai-whisper==20231106",
        "optimum==1.14.0",
        "pyannote-audio==3.1.0",
        "rich==13.7.0",
    )
    .pip_install("packaging")
    .run_commands("pip install flash-attn==2.5.2 --no-build-isolation")
    .run_commands(
        "git clone https://github.com/Vaibhavs10/insanely-fast-whisper.git",
        # Pin the commit version.
        "cd insanely-fast-whisper && git checkout ff0df400f4aed859375c2507ebcc21fe5f9b99e0",
    )
)
# Named to have the label prefix "modal-labs--instant".
stub = Stub("instant-whisper")

with whisper_image.imports():
    import sys

    import torch

    # The folders are missing `__init__.py` files, so we need to add them to the path.
    sys.path.append("/insanely-fast-whisper/src/insanely_fast_whisper/utils")
    from diarize import (
        diarize_audio as diarize_audio_func,
        post_process_segments_and_transcripts,
        preprocess_inputs,
    )
    from pyannote.audio import Pipeline
    from transformers import (
        WhisperFeatureExtractor,
        WhisperForConditionalGeneration,
        WhisperTokenizerFast,
        pipeline,
    )
class TranscriptionRequest(BaseModel):
    audio: str
    language: Optional[str] = None
    diarize_audio: bool = False
    batch_size: int = 24
@stub.cls(
    gpu=gpu.A10G(),
    # To avoid excessive cold starts, we set the idle timeout to two minutes.
    container_idle_timeout=120,
    keep_warm=1,
    image=whisper_image,
    # TODO: reconcile Hugging Face secrets.
    secrets=[Secret.from_name("huggingface-secret-2")],
)
class Model:
    @build()
    @enter()
    def setup(self):
        model_id = "openai/whisper-large-v3"
        torch_dtype = torch.float16
        self.device = "cuda:0"

        model = WhisperForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
        ).to(self.device)
        tokenizer = WhisperTokenizerFast.from_pretrained(model_id)
        feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor,
            model_kwargs={"use_flash_attention_2": True},
            torch_dtype=torch_dtype,
            device=self.device,
        )

        self.diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.diarization_pipeline.to(torch.device(self.device))
    @web_endpoint(method="POST", label="instant-whisper")
    def transcribe(self, request: TranscriptionRequest):
        """Transcribes a single audio file, with optional speaker diarization."""
        with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio:
            audio_data = base64.b64decode(request.audio.split(",")[1])
            temp_audio.write(audio_data)
            temp_audio.flush()  # make sure all bytes are on disk before the pipeline reads the file

            outputs = self.pipe(
                temp_audio.name,
                chunk_length_s=30,
                batch_size=request.batch_size,
                generate_kwargs={
                    "task": "transcribe",
                    "language": None if request.language == "" else request.language,
                },
                return_timestamps="word",
            )

            if not request.diarize_audio:
                return outputs

            inputs, diarizer_inputs = preprocess_inputs(inputs=temp_audio.name)
            segments = diarize_audio_func(diarizer_inputs, self.diarization_pipeline)
            segmented_transcript = post_process_segments_and_transcripts(
                segments, outputs["chunks"], group_by_speaker=False
            )
            outputs["chunks"] = segmented_transcript
            return outputs
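
For reference (not part of the gist): a minimal sketch of how the deployed endpoint could be called. The URL is a placeholder, and the client assumes the requests package; note that the handler splits request.audio on "," and decodes the second part, so the payload needs a data-URI-style prefix.

import base64

import requests  # assumption: any HTTP client works; requests is used here for brevity

# Placeholder URL; the real one is printed by `modal deploy` / `modal serve`.
ENDPOINT = "https://<your-workspace>--instant-whisper.modal.run"

with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(
    ENDPOINT,
    json={
        # The handler does request.audio.split(",")[1], so keep the prefix.
        "audio": "data:audio/wav;base64," + audio_b64,
        "language": "en",  # or "" to auto-detect
        "diarize_audio": False,
        "batch_size": 24,
    },
)
print(resp.json())
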
@jflam commented Mar 3, 2024

Thank you for helping me get flash attention 2 installed on Modal, and for some cool tips (I didn't know about whisper_image.imports()!).

You do have a bug, though: model_kwargs is not enabling flash attention 2 correctly. The correct parameter is "attn_implementation": "flash_attention_2". In my test case, the fix dropped transcription time from 117s to 90s, and word timestamps are also correctly emitted.
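
For anyone else applying the fix: since the gist loads the model itself before handing it to pipeline(), one place to put the flag is the from_pretrained call. A sketch, assuming transformers >= 4.36 (the pinned 4.37.2 qualifies), where attn_implementation is accepted at load time:

# In Model.setup(), enable FA2 when loading the model instead of relying on
# model_kwargs; `use_flash_attention_2` is not the key transformers reads here.
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    attn_implementation="flash_attention_2",
).to(self.device)

(If the pipeline were instead given a model id string, the equivalent would be model_kwargs={"attn_implementation": "flash_attention_2"}, as mentioned above.)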

However, I want to point out that return_timestamps="word" is not compatible with flash attention 2 when you serialize the pipeline to disk in a @build method to reduce startup time. If I load the model in the transcribe method, it works fine and returns word timestamps. When I try to load a serialized pipeline, it throws a "WhisperFlashAttention2 attention does not support output_attentions" exception.

I have no idea why, but this is my observation this morning. I really do not like the layers of wrappers on top of wrappers that this world seems to enjoy.

Unless you're doing a bunch of batch transcriptions where you can amortize the model load time, the inability to serialize the pipeline into the image makes enabling flash attention 2 not worthwhile, since the model load time will eliminate any perf gains from FA2.

@aksh-at (Author) commented Mar 7, 2024

Thanks @jflam for the catch and investigation! Did not realize FA2 was not enabled, my bad.

It is indeed quite annoying if it changes behavior once serialized. We'll look into it ourselves to see if it's fixable.

@Evand3r commented Apr 22, 2024

The code doesn't work anymore; there's an error on line 89 with .to(self.device):
AttributeError("'NoneType' object has no attribute 'to'")

@michaelevensen commented Apr 24, 2024

The code doesn't work anymore; there's an error on line 89 with .to(self.device): AttributeError("'NoneType' object has no attribute 'to'")

Encountered the same issue; you need to:

self.diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=Secret.from_name("your-hugging-face-secret"),
)
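
A note on the snippet above: pyannote expects the raw Hugging Face token string, not a Modal Secret object. A sketch of how this could be wired up inside the container, assuming the attached secret exposes the token as an HF_TOKEN environment variable (the variable name depends on how the secret was created):

import os

from pyannote.audio import Pipeline

# Inside Model.setup(). Assumes the Modal secret attached via
# secrets=[Secret.from_name(...)] injects HF_TOKEN into the container environment.
hf_token = os.environ["HF_TOKEN"]

self.diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token,
)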
