text = ...   # Tokenized Text Corresponding to Recording Transcript
audio = ...  # Mel Spectrogram of the Recording
# Only Train Connector and Projection
self.encoder.freeze()
self.llama.freeze()
# Convert Raw Audio Signal to 1500 Embeddings with Whisper Encoder (CNN+Transformer)
audio_features = self.encoder(audio)
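The preview ends before the trainable pieces run. A minimal sketch of the step that plausibly follows, where a trainable connector and projection bridge the frozen Whisper features into the frozen Llama's embedding space; the `connector`/`projection` attribute names and the concatenation layout are assumptions, not taken from the gist (assumes `torch` is imported):
# Assumed continuation: compress the 1500 frames, then map them to Llama's hidden size
audio_embeds = self.projection(self.connector(audio_features))
# Prepend the audio embeddings to the text token embeddings and run the frozen LM
text_embeds = self.llama.get_input_embeddings()(text)
outputs = self.llama(inputs_embeds=torch.cat([audio_embeds, text_embeds], dim=1))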
def _push_parquet_shards_to_hub(
    self,
    repo_id: str,
    data_dir: str = "data",
    split: Optional[str] = None,
    token: Optional[str] = None,
    revision: Optional[str] = None,
    create_pr: Optional[bool] = False,
    max_shard_size: Optional[Union[int, str]] = None,
    num_shards: Optional[int] = None,
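This private helper sits behind the public `Dataset.push_to_hub`, which exposes the same sharding knobs. A hedged usage sketch (the repo id is illustrative):
from datasets import load_dataset

ds = load_dataset("csv", data_files={"train": "train.csv"})
# max_shard_size and num_shards are forwarded down to the parquet shard writer
ds.push_to_hub("your-namespace/your-dataset", max_shard_size="500MB")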
import ast
# To delete after debugging (the `code` module is presumably here for code.interact breakpoints)
import code
import copyreg
import datetime
import functools
import json
import os
import re
@Helw150
Helw150 / ot_loss.py
Last active April 27, 2023 22:02
OT TADA Loss
from typing import List, Optional, Tuple, Union
from torchtyping import TensorType
from transformers.adapters.modeling import Adapter
from transformers.adapters import (
    BartAdapterModel,
    RobertaAdapterModel,
    BertAdapterModel,
    AdapterConfig,
)
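Only the imports survive in the preview. Below is a rough sketch of one way an optimal-transport loss between two batches of hidden states can be written with log-domain Sinkhorn iterations; it is an illustration of the general technique, not the gist's actual TADA loss:
import math
import torch

def sinkhorn_ot_loss(x, y, eps=0.1, n_iters=50):
    # Entropic OT between point clouds x: (n, d) and y: (m, d), uniform weights.
    cost = torch.cdist(x, y) ** 2  # pairwise squared Euclidean distances
    n, m = cost.shape
    log_a = torch.full((n, 1), -math.log(n), device=cost.device)
    log_b = torch.full((1, m), -math.log(m), device=cost.device)
    f = torch.zeros(n, 1, device=cost.device)
    g = torch.zeros(1, m, device=cost.device)
    for _ in range(n_iters):  # log-domain Sinkhorn updates for numerical stability
        f = -eps * torch.logsumexp((g - cost) / eps + log_b, dim=1, keepdim=True)
        g = -eps * torch.logsumexp((f - cost) / eps + log_a, dim=0, keepdim=True)
    plan = torch.exp((f + g - cost) / eps + log_a + log_b)  # transport plan
    return (plan * cost).sum()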
@Helw150
Helw150 / parallel_t5.py
Last active May 10, 2023 14:52
Flan T5 Parallel Usage
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Model Init
n_gpu = 8
tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2")
heads_per_gpu = len(model.encoder.block) // n_gpu  # encoder blocks per device, despite the name
device_map = {
    gpu: list(
        range(
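            # The preview cuts off here; the continuation below follows the
            # standard Flan-T5 model-parallel recipe and may not match the gist verbatim.
            gpu * heads_per_gpu,
            (gpu + 1) * heads_per_gpu,
        )
    )
    for gpu in range(n_gpu)
}
model.parallelize(device_map)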
@Helw150
Helw150 / upload_csv.py
Created September 16, 2022 15:50
Lab Meeting Dataset Upload Code
# See https://huggingface.co/docs/datasets/upload_dataset for more details
from datasets import load_dataset
dataset_name = "PUT_YOUR_NAME_HERE"
data_files = {"train": "train.csv", "dev": "dev.csv", "test": "test.csv"}
dataset = load_dataset("csv", data_files=data_files)
dataset.push_to_hub(f"SALT-NLP/{dataset_name}", private=True)
@Helw150
Helw150 / save2gensim.py
Last active April 13, 2019 12:32
Saves a dictionary of vectors into the Gensim KeyedVectors format
from gensim import utils
def save2gensim(fname, word2vec_dict):
    vectors = list(word2vec_dict.values())
    vector_size = vectors[0].shape[0]
    total_vec = len(vectors)
    with utils.smart_open(fname, 'wb') as fout:
        # Header line: vocabulary size and vector dimensionality
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # Write entries in the dict's iteration order
        for word, vector in word2vec_dict.items():
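            # The preview cuts off here; a plausible completion writes each entry
            # in the plain-text word2vec format, which
            # gensim.models.KeyedVectors.load_word2vec_format can read back:
            fout.write(utils.to_utf8("%s %s\n" % (word, " ".join("%f" % v for v in vector))))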
@Helw150
Helw150 / large-file-processing.py
Last active July 28, 2018 22:34
A Python Script which multi-processes large files with a rough progress bar
#!/usr/bin/env python
"""Counts the number of times a word occurs in a very large text file"""
from __future__ import print_function
import os
import sys
import argparse
import textacy
import multiprocessing
from tqdm import tqdm
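Only the docstring and imports survive in the preview. A minimal sketch of the chunk-and-pool pattern the description implies, leaning on the imports above; the function names, chunk size, and the bytes-per-line guess behind the "rough" progress bar are all assumptions:
def chunks(path, size=100_000):
    # Yield lists of lines so each worker gets a reasonably large unit of work
    buf = []
    with open(path) as f:
        for line in f:
            buf.append(line)
            if len(buf) == size:
                yield buf
                buf = []
    if buf:
        yield buf

def count_in_chunk(task):
    lines, word = task
    return sum(line.split().count(word) for line in lines)

def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("path")
    parser.add_argument("word")
    args = parser.parse_args()
    # Rough total: file size divided by an assumed 80 bytes/line, 100k lines/chunk
    est_chunks = os.path.getsize(args.path) // (100_000 * 80) + 1
    with multiprocessing.Pool() as pool:
        tasks = ((chunk, args.word) for chunk in chunks(args.path))
        counts = tqdm(pool.imap_unordered(count_in_chunk, tasks), total=est_chunks)
        print(sum(counts))

if __name__ == "__main__":
    main()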
@Helw150
Helw150 / zenburn.js
Last active November 6, 2017 15:07
Changing Chrome OS Shell to utilize the Zenburn Color Theme
// Disable bold.
term_.prefs_.set('enable-bold', false);
// Use this for Zenburn
term_.prefs_.set('background-color', "#3F3F3F");
term_.prefs_.set('foreground-color', "#DCDCCC");
// Solarized base tones (not Zenburn; presumably left over from another theme)
base03 = "#002b36";
base02 = "#073642";
base01 = "#586e75";
@Helw150
Helw150 / createTree.py
Created October 24, 2017 22:37
Array to Min-Heap with In-Order Traversal the same as the Array
# input: an array of numbers
# create a binary tree such that each subtree is a min-heap and the inorder
# traversal of the binary tree is the same as the array provided
# e.g. [5, 7, 10, 8, 1, 4] becomes:
#        1
#       / \
#      5   4
#       \
#        7
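The preview cuts the example tree short (8 and 10 continue below 7). The construction itself is the classic Cartesian-tree recursion: the minimum element becomes the root, and the sub-arrays to its left and right build the left and right subtrees, which makes every subtree a min-heap while preserving the inorder sequence. A minimal sketch (class and function names are assumptions):
class Node:
    def __init__(self, val):
        self.val = val
        self.left = None
        self.right = None

def create_tree(arr):
    # Min-heap whose inorder traversal reproduces arr (a Cartesian tree)
    if not arr:
        return None
    i = arr.index(min(arr))                # minimum becomes the root
    root = Node(arr[i])
    root.left = create_tree(arr[:i])       # elements before the min go left
    root.right = create_tree(arr[i + 1:])  # elements after the min go right
    return root

This naive version is O(n^2) in the worst case; a monotonic-stack construction brings it down to O(n).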