Arijit Mukherjee (arijitx)
[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
!
"
%
'
,
import json
import random
import sys

import librosa
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

# Usage: python create_manifest.py script_path create_train_test_bool(True/False)
if len(sys.argv) != 3:
    print("Usage: python create_manifest.py script_path create_train_test_bool(True/False)")
    sys.exit(1)

script_path = sys.argv[1]
create_train_test = sys.argv[2] == "True"  # second CLI arg from the usage line above
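The gist preview is truncated here. As a hedged sketch of where these imports typically lead, assuming the script writes a NeMo-style JSON-lines manifest (one object per utterance with audio_filepath, duration, and text keys); the helper name and example paths below are invented, not the gist's actual code:

import json
import librosa
from tqdm.contrib.concurrent import thread_map

def manifest_entry(audio_path, text):
    # Duration via librosa; loading with sr=None keeps the native sample rate.
    y, sr = librosa.load(audio_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    return json.dumps(
        {"audio_filepath": audio_path, "duration": duration, "text": text},
        ensure_ascii=False,  # keep non-Latin transcripts readable in the manifest
    )

# thread_map parallelizes the per-file work and shows a progress bar.
pairs = [("clips/utt_0001.wav", "example transcript")]  # (audio_path, text) tuples
lines = thread_map(lambda p: manifest_entry(*p), pairs)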
@arijitx
arijitx / conformer_transducer_bpe.yaml
Created July 3, 2022 18:06
train_transducer NeMo
# This config contains the default values for training a Conformer-Transducer ASR model, large size (~120M), with Transducer loss and sub-word encoding.
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Here are the recommended configs for different variants of Conformer-Transducer; other parameters are the same as in this config file.
#
# +-------------+---------+---------+----------+--------------+--------------------------+
# | Model       | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden |
# +=============+=========+=========+==========+==============+==========================+
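The comments above tie the learning-rate settings to a ~2K effective batch. As a quick illustration (not part of the config), effective batch size is per-GPU batch size times number of GPUs times accumulate_grad_batches, so a hypothetical helper can back out the accumulation factor needed:

# Hypothetical helper, not from the config: pick accumulate_grad_batches so that
# per_gpu_batch * n_gpus * accumulation approximates the tuned 2K effective batch.
def accumulation_for(target_effective, per_gpu_batch, n_gpus):
    return max(1, round(target_effective / (per_gpu_batch * n_gpus)))

# e.g. batch_size=16 on 4 GPUs needs accumulate_grad_batches=32 to reach 2048
print(accumulation_for(2048, 16, 4))  # -> 32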
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
import argparse

import librosa
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    Speech2Text2Processor,
    SpeechEncoderDecoderModel,
    Trainer,
    TrainingArguments,
)
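The gist body is truncated after the imports. A minimal sketch of how these classes are typically wired together, assuming a wav2vec2 encoder and a BERT decoder; both checkpoint names and the sample file are placeholders, not from the gist:

import librosa
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel

# Placeholder checkpoints; the gist's actual encoder/decoder are not shown.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "facebook/wav2vec2-base", "bert-base-uncased"
)
# Required for training: tell the decoder where sequences start and how to pad.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

speech, sr = librosa.load("sample.wav", sr=16000)
inputs = feature_extractor(speech, sampling_rate=sr, return_tensors="pt")
labels = tokenizer("a transcript", return_tensors="pt").input_ids
loss = model(input_values=inputs.input_values, labels=labels).loss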
min_trimmed_length = 3
min_word_count = 1
max_word_count = 16
min_characters = 2
may_end_with_colon = false
quote_start_with_letter = true
needs_punctuation_end = false
needs_letter_start = true
allowed_symbols_regex = "[।,;: \\-\\?\\.!]"
needs_uppercase_start = false
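These look like sentence-validation rules in the style of Common Voice's sentence collector. A minimal sketch of how such rules might be applied as a per-sentence filter; the function and constant names below are invented for illustration:

import re

# Values mirrored from the TOML above; names here are invented.
MIN_WORDS, MAX_WORDS, MIN_CHARS = 1, 16, 2
ALLOWED = re.compile(r"[।,;: \-\?\.!]")

def is_valid(sentence):
    words = sentence.split()
    if not (MIN_WORDS <= len(words) <= MAX_WORDS):
        return False
    if len(sentence) < MIN_CHARS:
        return False
    if not sentence[:1].isalpha():  # needs_letter_start = true
        return False
    if sentence.endswith(":"):      # may_end_with_colon = false
        return False
    # Strip allowed symbols; whatever remains must be letters or digits.
    stripped = ALLOWED.sub("", sentence)
    return all(ch.isalnum() for ch in stripped)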
@arijitx
arijitx / benchmark.py
Last active June 23, 2020 06:33
SQuAD Benchmark for Jetson Nano
## Extended from https://github.com/huggingface/transformers/blob/master/notebooks/04-onnx-export.ipynb
from transformers import DistilBertTokenizerFast, DistilBertModel
from torch.cuda import get_device_name
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from tqdm import trange
from os import environ
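The preview stops at the imports. A minimal sketch of the timing harness these imports suggest, assuming a simple context-manager stopwatch around DistilBert forward passes; the checkpoint name, query text, and loop count are placeholders:

import torch
from contextlib import contextmanager
from time import time
from tqdm import trange
from transformers import DistilBertTokenizerFast, DistilBertModel

@contextmanager
def track(label, store):
    # Record wall-clock time for the enclosed block under `label`.
    start = time()
    yield
    store[label] = time() - start

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").eval()
inputs = tokenizer("What are COVID risks in diabetic patients?", return_tensors="pt")

timings = {}
with track("cpu_forward_x10", timings), torch.no_grad():
    for _ in trange(10):
        model(**inputs)
print(timings)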
@arijitx
arijitx / README.md
Created June 21, 2020 06:51
Installing transformers on Jetson Nano

Query: Covid risks in diabetic patients?

+------+----------------------+---------------------------------------------+
| Rank | Answer               | Doc Id                                      |
+------+----------------------+---------------------------------------------+
| 1    | severe complications | b696d208705fcb1925693c5f0d118733bb557ea6_18 |
|      |                      | Exploring diseases/traits and blood proteins causally related to expression of ACE2, the putative receptor of 2019-nCov: A Mendelian Randomization analysis |
| 2