Skip to content

Instantly share code, notes, and snippets.

@Helw150
Helw150 / ot_loss.py
Last active April 27, 2023 22:02
OT TADA Loss
from typing import List, Optional, Tuple, Union
from torchtyping import TensorType
from transformers.adapters.modeling import Adapter
from transformers.adapters import (
BartAdapterModel,
RobertaAdapterModel,
BertAdapterModel,
AdapterConfig,
)
@Helw150
Helw150 / parallel_t5.py
Last active May 10, 2023 14:52
Flan T5 Parallel Usage
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Model Init
# Number of GPUs to spread the encoder blocks across.
n_gpu = 8
tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2")
# Encoder blocks assigned to each GPU (integer division; any remainder
# presumably lands on one device -- TODO confirm against the truncated
# device_map construction that follows in the full gist).
heads_per_gpu = len(model.encoder.block) // n_gpu
device_map = {
gpu: list(
range(
@Helw150
Helw150 / upload_csv.py
Created September 16, 2022 15:50
Lab Meeting Dataset upload Code
# See https://huggingface.co/docs/datasets/upload_dataset for more details
from datasets import load_dataset

# Name the pushed dataset after yourself so lab uploads don't collide.
dataset_name = "PUT_YOUR_NAME_HERE"

# One local CSV per split.
data_files = {"train": "train.csv", "dev": "dev.csv", "test": "test.csv"}

# Bug fix: to build a dataset from *local* CSV files, the loader name is
# "csv"; the original passed a hub-repo placeholder, which would try to
# download a remote dataset instead of reading the local files.
dataset = load_dataset("csv", data_files=data_files)

# Bug fix: push_to_hub is a method of the loaded DatasetDict (`dataset`),
# not of a `datasets` module (which was never imported here anyway and
# would have raised NameError).
dataset.push_to_hub(f"SALT-NLP/{dataset_name}", private=True)
@Helw150
Helw150 / save2gensim.py
Last active April 13, 2019 12:32
Saves a dictionary of vectors into the Gensim KeyedVectors format
from gensim import utils
def save2gensim(fname, word2vec_dict):
    """Save a {word: vector} mapping to disk in the Gensim KeyedVectors text format.

    fname: destination path (opened binary via gensim's smart_open).
    word2vec_dict: mapping of word -> vector; vectors are assumed to expose
    numpy-style .shape -- TODO confirm callers pass numpy arrays.
    """
    vectors = list(word2vec_dict.values())
    # Dimensionality is taken from the first vector; all vectors are assumed
    # to share it -- TODO confirm uniform vector sizes.
    vector_size = vectors[0].shape[0]
    total_vec = len(vectors)
    with utils.smart_open(fname, 'wb') as fout:
        # Header line "<vocab_size> <vector_size>", as the KeyedVectors
        # text format expects.
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        # NOTE(review): the loop body is cut off in this preview; each
        # iteration presumably writes "<word> <vector values>" -- confirm
        # against the full gist.
        for word, vector in word2vec_dict.items():
@Helw150
Helw150 / large-file-processing.py
Last active July 28, 2018 22:34
A Python Script which multi-processes large files with a rough progress bar
#!/usr/bin/env python
"""Counts the number of times a word occurs in a very large text file"""
from __future__ import print_function
import os
import sys
import argparse
import textacy
import multiprocessing
from tqdm import tqdm
@Helw150
Helw150 / zenburn.js
Last active November 6, 2017 15:07
Changing Chrome OS Shell to utilize the Zenburn Color Theme
// Chrome OS hterm shell preferences for the Zenburn color theme.
// Disable bold.
term_.prefs_.set('enable-bold', false)
// Use this for Zenburn
term_.prefs_.set('background-color', "#3F3F3F");
term_.prefs_.set('foreground-color', "#DCDCCC");
// NOTE(review): the hex values below are Solarized "base0x" palette colors,
// not Zenburn, and nothing visible in this snippet reads them -- confirm
// against the full gist whether they are leftovers.
base03 = "#002b36";
base02 = "#073642";
base01 = "#586e75";
@Helw150
Helw150 / createTree.py
Created October 24, 2017 22:37
Array to Min-Heap with In-Order Traversal the same as the Array
# i/p = array of numbers
# create a binary tree such that each subtree is a min-heap and the in-order traversal of the binary tree is the same as the array provided
# [5, 7, 10, 8, 1, 4]
# 1
# / \
# 5 4
# \
# 7
@Helw150
Helw150 / OG-articles.py
Last active September 15, 2017 17:28
Functions to work with Open Graph efficiently
# This function takes Open Graph info and just returns the articles
def returnArticles(og_array):
    """Filter a collection of Open Graph objects down to the articles.

    og_array: iterable of OG objects, each an iterable of property entries
    read with dict-style indexing ("property", "content" keys) -- TODO
    confirm the exact entry type against callers.
    """
    article_urls = []
    for og in og_array:
        is_article = False
        for prop in og:
            # NOTE(review): hasattr tests an *attribute* named "property",
            # but the next line indexes a *key*; for plain dicts hasattr is
            # always False and this branch never runs -- likely should be
            # `"property" in prop`. Confirm the entry type before changing.
            if hasattr(prop, "property"):
                if prop["property"] == "og:type":
                    if prop["content"] == "article":
                        # og:type == "article" marks this object as an article.
                        is_article = True
    # NOTE(review): the preview is cut off here; the remainder presumably
    # appends the article's URL to article_urls and returns the list --
    # confirm against the full gist.
@Helw150
Helw150 / aws_control.py
Created July 18, 2017 18:41
Some more human controls for boto3
import boto3
def get_id_from_name(name):
    """Look up an EC2 instance id by the instance's "Name" tag.

    Parameters
    ----------
    name : str
        Value of the instance's "Name" tag.

    Returns
    -------
    str
        The instance id of the first matching instance.

    Raises
    ------
    IndexError
        If no instance carries a matching "Name" tag.
    """
    # Bug fix: the original read an undefined `response`; actually query
    # EC2, letting the service do the tag filtering.
    ec2 = boto3.client('ec2')
    response = ec2.describe_instances(
        Filters=[{'Name': 'tag:Name', 'Values': [name]}]
    )
    matches = [
        instance
        for r in response['Reservations']
        for instance in r['Instances']
    ]
    # Bug fix: the comprehension yields a *list*, but the original indexed
    # it with a string key (TypeError). Take the first match's id instead.
    return matches[0]['InstanceId']
def start_instance_by_name(name):
    """Start the EC2 instance whose "Name" tag equals `name`."""
    ec2 = boto3.client('ec2')
    instance_id = get_id_from_name(name)
    # Do a dryrun first to verify permissions
    # NOTE(review): the preview ends here; the actual
    # ec2.start_instances(InstanceIds=[instance_id], DryRun=...) call is
    # cut off -- confirm against the full gist.
def brand_from_url(url):
    """Extract the brand (second-level domain label) from a URL.

    Hosts with fewer than three dot-separated labels (e.g. "example.com")
    yield the first label; longer hosts (e.g. "www.example.com") yield the
    second.
    """
    labels = urlparse(url).netloc.split('.')
    # "example.com" -> label 0; "www.example.com" -> label 1.
    return labels[0] if len(labels) < 3 else labels[1]