Arijit Mukherjee (arijitx)
[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
!
"
%
'
,
import json
import random
import sys

import librosa
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

# Usage: python create_manifest.py script_path create_train_test_bool(True/False)
if len(sys.argv) != 3:
    print("Usage: python create_manifest.py script_path create_train_test_bool(True/False)")
    sys.exit(1)

script_path = sys.argv[1]
create_train_test = sys.argv[2] == "True"  # second CLI arg from the usage line above
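The gist preview is truncated here. As a hedged sketch of where these imports typically lead, assuming the script writes a NeMo-style JSON-lines manifest (one object per utterance with audio_filepath, duration, and text keys); the helper name and example paths below are invented, not the gist's actual code:

import json
import librosa
from tqdm.contrib.concurrent import thread_map

def manifest_entry(audio_path, text):
    # Duration via librosa; loading with sr=None keeps the native sample rate.
    y, sr = librosa.load(audio_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    return json.dumps(
        {"audio_filepath": audio_path, "duration": duration, "text": text},
        ensure_ascii=False,  # keep non-Latin transcripts readable in the manifest
    )

# thread_map parallelizes the per-file work and shows a progress bar.
pairs = [("clips/utt_0001.wav", "example transcript")]  # (audio_path, text) tuples
lines = thread_map(lambda p: manifest_entry(*p), pairs)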
@arijitx
arijitx / conformer_transducer_bpe.yaml
Created July 3, 2022 18:06
train_transducer NeMo
# This config contains the default values for training a Conformer-Transducer ASR model, large size (~120M), with Transducer loss and sub-word encoding.
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Here are the recommended configs for different variants of Conformer-Transducer; other parameters are the same as in this config file.
#
# +-------------+---------+---------+----------+--------------+--------------------------+
# | Model       | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden |
# +=============+=========+=========+==========+==============+==========================+
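The comments above tie the learning-rate settings to a ~2K effective batch. As a quick illustration (not part of the config), effective batch size is per-GPU batch size times number of GPUs times accumulate_grad_batches, so a hypothetical helper can back out the accumulation factor needed:

# Hypothetical helper, not from the config: pick accumulate_grad_batches so that
# per_gpu_batch * n_gpus * accumulation approximates the tuned 2K effective batch.
def accumulation_for(target_effective, per_gpu_batch, n_gpus):
    return max(1, round(target_effective / (per_gpu_batch * n_gpus)))

# e.g. batch_size=16 on 4 GPUs needs accumulate_grad_batches=32 to reach 2048
print(accumulation_for(2048, 16, 4))  # -> 32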
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
import argparse

import librosa
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    Speech2Text2Processor,
    SpeechEncoderDecoderModel,
    Trainer,
    TrainingArguments,
)
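The gist body is truncated after the imports. A minimal sketch of how these classes are typically wired together, assuming a wav2vec2 encoder and a BERT decoder; both checkpoint names and the sample file are placeholders, not from the gist:

import librosa
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel

# Placeholder checkpoints; the gist's actual encoder/decoder are not shown.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "facebook/wav2vec2-base", "bert-base-uncased"
)
# Required for training: tell the decoder where sequences start and how to pad.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

speech, sr = librosa.load("sample.wav", sr=16000)
inputs = feature_extractor(speech, sampling_rate=sr, return_tensors="pt")
labels = tokenizer("a transcript", return_tensors="pt").input_ids
loss = model(input_values=inputs.input_values, labels=labels).loss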
min_trimmed_length = 3
min_word_count = 1
max_word_count = 16
min_characters = 2
may_end_with_colon = false
quote_start_with_letter = true
needs_punctuation_end = false
needs_letter_start = true
allowed_symbols_regex = "[।,;: \\-\\?\\.!]"
needs_uppercase_start = false
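These look like sentence-validation rules in the style of Common Voice's sentence collector. A minimal sketch of how such rules might be applied as a per-sentence filter; the function and constant names below are invented for illustration:

import re

# Values mirrored from the TOML above; names here are invented.
MIN_WORDS, MAX_WORDS, MIN_CHARS = 1, 16, 2
ALLOWED = re.compile(r"[।,;: \-\?\.!]")

def is_valid(sentence):
    words = sentence.split()
    if not (MIN_WORDS <= len(words) <= MAX_WORDS):
        return False
    if len(sentence) < MIN_CHARS:
        return False
    if not sentence[:1].isalpha():  # needs_letter_start = true
        return False
    if sentence.endswith(":"):      # may_end_with_colon = false
        return False
    # Strip allowed symbols; whatever remains must be letters or digits.
    stripped = ALLOWED.sub("", sentence)
    return all(ch.isalnum() for ch in stripped)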
@arijitx
arijitx / benchmark.py
Last active June 23, 2020 06:33
SQuAD Benchmark for Jetson Nano
## Extended from https://github.com/huggingface/transformers/blob/master/notebooks/04-onnx-export.ipynb
from transformers import DistilBertTokenizerFast, DistilBertModel
from torch.cuda import get_device_name
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from tqdm import trange
from os import environ
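The preview stops at the imports. A minimal sketch of the timing harness these imports suggest, assuming a simple context-manager stopwatch around DistilBert forward passes; the checkpoint name, query text, and loop count are placeholders:

import torch
from contextlib import contextmanager
from time import time
from tqdm import trange
from transformers import DistilBertTokenizerFast, DistilBertModel

@contextmanager
def track(label, store):
    # Record wall-clock time for the enclosed block under `label`.
    start = time()
    yield
    store[label] = time() - start

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").eval()
inputs = tokenizer("What are COVID risks in diabetic patients?", return_tensors="pt")

timings = {}
with track("cpu_forward_x10", timings), torch.no_grad():
    for _ in trange(10):
        model(**inputs)
print(timings)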
@arijitx
arijitx / README.md
Created June 21, 2020 06:51
Installing transformers on Jetson Nano

Query: Covid risks in diabetic patients?

+------+----------------------+---------------------------------------------+
| Rank | Answer               | Doc Id                                      |
+------+----------------------+---------------------------------------------+
| 1    | severe complications | b696d208705fcb1925693c5f0d118733bb557ea6_18 |
|      |                      | Exploring diseases/traits and blood proteins causally related to expression of ACE2, the putative receptor of 2019-nCov: A Mendelian Randomization analysis |
| 2