This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
STATE_DICT_MAPPING = { | |
# Subsampling layer | |
r"encoder\.pre_encode\.": r"encoder.subsampling.", | |
# Subsampling specific mappings | |
r"encoder\.subsampling\.conv\.": r"encoder.subsampling.layers.", | |
r"encoder\.subsampling\.out\.": r"encoder.subsampling.linear.", | |
# # Positional encoding (skip pe buffer) | |
# r"encoder\.pos_enc\.pe$": None, # Skip buffer | |
r"encoder\.pos_enc\.": r"encoder.encode_positions.", | |
# Conformer layers - attention (NeMo already uses self_attn) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To install NeMo, run: | |
# uv pip install git+https://github.com/NVIDIA/NeMo.git@b97e42b3dd1c9bcdf37c81c63220744af474c9c0 | |
from nemo.collections.asr.models import ASRModel | |
import torch | |
import os | |
from datasets import load_dataset | |
import soundfile as sf | |
TMP_DIR = "./tmp" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To install NeMo, run: | |
# uv pip install git+https://github.com/NVIDIA/NeMo.git@b97e42b3dd1c9bcdf37c81c63220744af474c9c0 | |
from nemo.collections.asr.models import ASRModel | |
import torch | |
import os | |
from datasets import load_dataset | |
import soundfile as sf | |
TMP_DIR = "./tmp" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.nn as nn | |
from torch.nn.utils.parametrizations import weight_norm | |
from torch.nn.utils.parametrize import remove_parametrizations | |
# Define dtypes to test | |
dtypes_to_test = [torch.float64, torch.float32, torch.float16] | |
for dtype in dtypes_to_test: | |
print(f"\nTesting with dtype: {dtype}") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.nn as nn | |
from torch.nn.utils.parametrizations import weight_norm | |
from torch.nn.utils.parametrize import remove_parametrizations | |
# Check if CUDA is available | |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | |
print(f"Using device: {device}") | |
# 1. Create conv layer and move to device |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mistral_common.protocol.instruct.messages import TextChunk, AudioChunk, UserMessage, AssistantMessage, RawAudio | |
from mistral_common.audio import Audio | |
from huggingface_hub import hf_hub_download | |
from openai import OpenAI | |
# Modify OpenAI's API key and API base to use vLLM's API server. | |
openai_api_key = "EMPTY" | |
openai_api_base = "http://0.0.0.0:8000/v1" | |
client = OpenAI( | |
api_key=openai_api_key, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datasets import load_dataset, Audio | |
from transformers import VoxtralForConditionalGeneration, VoxtralProcessor | |
import os | |
import torch | |
from whisper.normalizers import EnglishTextNormalizer | |
import jiwer | |
os.environ["CUDA_VISIBLE_DEVICES"] = "0" | |
torch_device = "cuda" if torch.cuda.is_available() else "cpu" # "cpu" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datasets import load_dataset, Audio | |
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration | |
import os | |
import torch | |
from whisper.normalizers import EnglishTextNormalizer | |
import jiwer | |
os.environ["CUDA_VISIBLE_DEVICES"] = "3" | |
torch_device = "cuda" if torch.cuda.is_available() else "cpu" # "cpu" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------ install moshi ------ | |
# git clone https://github.com/kyutai-labs/moshi.git | |
# cd moshi && git checkout 0395bd6c9a95e899c397a68c75f300f3b5409b2c | |
# uv pip install -e . | |
# ---------------------------- | |
import torch | |
from moshi import run_inference | |
args = { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------ install moshi ------ | |
# git clone https://github.com/kyutai-labs/moshi.git | |
# cd moshi && git checkout 0395bd6c9a95e899c397a68c75f300f3b5409b2c | |
# uv pip install -e . | |
# ---------------------------- | |
import torch | |
from moshi import run_inference | |
args = { |
NewerOlder