Nauman Mustafa (NaxAlpha)

😎
Solving Intelligence
View GitHub Profile
NaxAlpha / mmc4_pythia.py
Created May 24, 2023 23:29
Fine-tune Pythia model on Multimodal C4 dataset
# WIP: fine-tune a causal LM on interleaved images & text from the MMC4 dataset
import os
import json
import random
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import torch
import torch.nn as nn
import torch.nn.functional as F
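The preview above only shows the imports. As a hedged sketch of how image-and-text interleaving for a causal LM can work in general (not necessarily what this gist does), image features can be projected into the language model's embedding space and prepended to the token embeddings; all names and dimensions below are illustrative assumptions.
# Illustrative sketch only: project image features into the text embedding space and
# mix them with token embeddings before a causal transformer. Not taken from the gist.
import torch
import torch.nn as nn

class ToyMultimodalLM(nn.Module):
    def __init__(self, vocab_size=50304, d_model=512, img_feat_dim=768):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.img_proj = nn.Linear(img_feat_dim, d_model)  # image features -> text embedding space
        layer = nn.TransformerEncoderLayer(d_model, nhead=8, batch_first=True)
        self.backbone = nn.TransformerEncoder(layer, num_layers=4)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, token_ids, image_feats):
        # token_ids: (B, T_text), image_feats: (B, T_img, img_feat_dim)
        text = self.tok_emb(token_ids)
        imgs = self.img_proj(image_feats)
        x = torch.cat([imgs, text], dim=1)  # treat projected images as extra "tokens"
        T = x.size(1)
        mask = torch.full((T, T), float("-inf"), device=x.device).triu(1)  # causal mask
        h = self.backbone(x, mask=mask)
        return self.lm_head(h)

model = ToyMultimodalLM()
logits = model(torch.randint(0, 50304, (2, 16)), torch.randn(2, 4, 768))
print(logits.shape)  # (2, 20, 50304)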
NaxAlpha / compressor.py
Last active August 4, 2023 04:47
Train a semantic text compressor, potentially useful for very long context language modeling
import random
from time import sleep
from functools import partial
from threading import Thread, Lock
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn
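Only the imports are visible above. Below is a hedged sketch of what a semantic text compressor can look like in general: every k token embeddings are pooled into one latent vector, and a decoder is trained to reconstruct the original tokens from those latents. The model, names, and sizes are illustrative assumptions, not the gist's architecture.
# Illustrative sketch: compress k consecutive token embeddings into one latent vector,
# then reconstruct the tokens from the latents. Dimensions and names are assumptions.
import torch
import torch.nn as nn

class ToyTextCompressor(nn.Module):
    def __init__(self, vocab_size=50304, d_model=256, compress_ratio=8):
        super().__init__()
        self.k = compress_ratio
        self.emb = nn.Embedding(vocab_size, d_model)
        self.encoder = nn.GRU(d_model, d_model, batch_first=True)
        self.decoder = nn.GRU(d_model, d_model, batch_first=True)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, token_ids):
        # token_ids: (B, T) with T divisible by compress_ratio
        x = self.emb(token_ids)
        enc, _ = self.encoder(x)                   # (B, T, d)
        latents = enc[:, self.k - 1 :: self.k, :]  # one state per k tokens -> (B, T//k, d)
        up = latents.repeat_interleave(self.k, dim=1)  # stretch latents back to T steps
        dec, _ = self.decoder(up)
        return self.head(dec)                      # (B, T, vocab)

model = ToyTextCompressor()
ids = torch.randint(0, 50304, (2, 64))
logits = model(ids)
loss = nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), ids.reshape(-1))
print(logits.shape, loss.item())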
NaxAlpha / pythia_1b4_8k.py
Last active April 4, 2024 17:19
Fine-tune the Pythia 1.4B model for an 8k context window. The script requires at least 40 GB of memory; 15-20 hours of fine-tuning is sufficient.
import copy
import torch
import torch.nn.functional as F
import torch.backends.cuda as cuda
from torch.utils.data import DataLoader, IterableDataset
import wandb
from tqdm import tqdm
import bitsandbytes as bnb
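The imports (bitsandbytes, wandb, torch.backends.cuda) suggest flash scaled-dot-product attention plus an 8-bit optimizer to fit 8k-token fine-tuning into memory. Below is a hedged sketch of that general setup; the checkpoint name, hyperparameters, and training step are assumptions, and the gist's actual loop and data pipeline are not reproduced.
# Hedged sketch: memory-saving setup for long-context fine-tuning of Pythia 1.4B.
# Checkpoint, learning rate, and sequence handling are placeholders.
import torch
import torch.backends.cuda as cuda
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM

cuda.enable_flash_sdp(True)  # prefer flash scaled-dot-product attention kernels when available

model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-1.4b-deduped",  # assumed checkpoint
    torch_dtype=torch.bfloat16,
).cuda()
model.gradient_checkpointing_enable()  # trade compute for memory at 8k tokens
# note: the checkpoint was pretrained at 2k context; any config change needed for
# 8k positions is assumed to be handled elsewhere in the gist.

optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-5)  # 8-bit optimizer states

def train_step(batch_ids):
    # batch_ids: (B, 8192) LongTensor of token ids on the GPU
    out = model(input_ids=batch_ids, labels=batch_ids)
    out.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return out.loss.item()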
NaxAlpha / gpta.py
Created April 18, 2023 09:42
A custom GPT-like model that is tiny but can also scale to very long contexts
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cuda as cuda
class NewGELU(nn.Module):
    def forward(self, x):
        # GELU, tanh approximation (as used by GPT-2)
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
NaxAlpha / long_gpt.py
Last active July 23, 2024 13:07
Training script for LongGPT; fine-tunes GPT-2 (335M) on The Pile dataset with a context size of 8k tokens (requires >16 GB RAM).
import time
from contextlib import suppress
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cuda as cuda
from torch.utils.data import DataLoader, IterableDataset
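GPT-2's learned absolute position embeddings only cover 1024 positions, so training at 8k requires enlarging that table somehow. One common approach, offered here only as a hedged guess at the kind of change involved and not as the gist's exact method, is to interpolate the pretrained position-embedding matrix to the new length; the checkpoint name and context size below are assumptions.
# Hedged sketch: stretch GPT-2's 1024-position embedding table to 8192 positions by
# linear interpolation. One possible approach, not necessarily the gist's.
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel

def extend_positions(model, new_ctx=8192):
    old = model.transformer.wpe.weight.data            # (1024, d)
    new = F.interpolate(
        old.T.unsqueeze(0),                            # (1, d, 1024)
        size=new_ctx, mode="linear", align_corners=False,
    ).squeeze(0).T.contiguous()                        # (8192, d)
    wpe = torch.nn.Embedding(new_ctx, old.size(1))
    wpe.weight.data.copy_(new)
    model.transformer.wpe = wpe
    model.config.n_positions = new_ctx
    model.config.n_ctx = new_ctx
    return model

model = extend_positions(GPT2LMHeadModel.from_pretrained("gpt2-medium"))  # "medium" used as a placeholder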
NaxAlpha / softformer.py
Last active January 23, 2024 02:28
Softformer - an attention-free, softmax-based transformer for causal language modeling.
import torch
import torch.nn as nn
import torch.nn.functional as F
def cum_softmax(x, dim=1): # <- main novelty
    z = x.exp()
    d = z.cumsum(dim)
    return z / d
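A cumulative softmax normalizes each score against all scores up to that position, which is what makes attention-free causal mixing possible. The sketch below shows one way such a normalizer can drive a running softmax-weighted average of value vectors; the module and its dimensions are illustrative assumptions, not the Softformer's actual block.
# Illustrative causal, attention-free mixer built on the same cumulative normalizer as
# cum_softmax above; shapes and the mixing rule are assumptions for demonstration.
import torch
import torch.nn as nn

class CumSoftmaxMixer(nn.Module):
    def __init__(self, d_model=256):
        super().__init__()
        self.score = nn.Linear(d_model, 1)
        self.value = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: (B, T, C); position t gets a softmax-weighted average of values at positions <= t
        w = self.score(x).exp()   # (B, T, 1); assumes small scores (no running max for stability)
        v = self.value(x)         # (B, T, C)
        num = (w * v).cumsum(dim=1)
        den = w.cumsum(dim=1)
        return num / den          # causal by construction: only past tokens contribute

y = CumSoftmaxMixer()(torch.randn(2, 10, 256))
print(y.shape)  # (2, 10, 256)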
NaxAlpha / hf_pile.py
Last active September 2, 2024 01:39
Efficiently stream "The Pile" dataset directly from the web; requires `pip install zstandard`.
import torch
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase
from pile import ThePile
class ThePileTokenized(IterableDataset):
    def __init__(
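The preview cuts off at the constructor. As a hedged, self-contained illustration of the underlying pattern (streaming a zstandard-compressed JSONL shard over HTTP and yielding tokenized documents), with the URL, tokenizer choice, and field handling as placeholder assumptions rather than the gist's code:
# Hedged sketch of streaming a .jsonl.zst shard over HTTP; the URL is a placeholder
# and the tokenization details are assumptions, not the gist's implementation.
import io
import json

import requests
import zstandard
from torch.utils.data import IterableDataset
from transformers import GPT2Tokenizer

class StreamedJsonlZst(IterableDataset):
    def __init__(self, url, max_length=1024):
        self.url = url
        self.max_length = max_length
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    def __iter__(self):
        resp = requests.get(self.url, stream=True)
        resp.raise_for_status()
        reader = zstandard.ZstdDecompressor().stream_reader(resp.raw)  # decompress on the fly
        for line in io.TextIOWrapper(reader, encoding="utf-8"):
            text = json.loads(line)["text"]  # The Pile stores each document under "text"
            ids = self.tokenizer(text, truncation=True, max_length=self.max_length)["input_ids"]
            yield ids

# ds = StreamedJsonlZst("https://example.com/pile/train/00.jsonl.zst")  # placeholder URL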
NaxAlpha / c4x2.py
Created December 12, 2022 02:33
When sequence lengths are small, it takes some time to fetch from the HuggingFace dataset server. So to keep feeding data to the model, we need to cache already-fetched files in memory and feed one of those each time.
import json
import torch
import random
from time import sleep
from threading import Thread
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, get_worker_info
# stream C4 dataset from Huggingface with GPT-2 Tokenizer for PyTorch Language Model Training
import json
import torch
import random
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, get_worker_info
def cycled(itr):
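A hedged sketch of the caching idea described above: a background thread keeps pulling documents from the streamed dataset into an in-memory buffer, and the Dataset serves random cached samples so training never stalls on the network. The buffer size, dataset id, and field names are placeholder assumptions; tokenization (GPT2Tokenizer in the gist's imports) is omitted here.
# Hedged sketch of the in-memory caching approach; details such as the buffer size
# and dataset config are assumptions, not copied from the gist.
import random
from time import sleep
from threading import Thread, Lock

from datasets import load_dataset
from torch.utils.data import Dataset

class CachedStreamingC4(Dataset):
    def __init__(self, buffer_size=1024, virtual_len=1_000_000):
        self.buffer = []
        self.lock = Lock()
        self.buffer_size = buffer_size
        self.virtual_len = virtual_len
        Thread(target=self._fill, daemon=True).start()

    def _fill(self):
        # keep pulling documents from the streamed dataset into the in-memory buffer
        stream = load_dataset("allenai/c4", "en", split="train", streaming=True)
        for sample in stream:
            with self.lock:
                self.buffer.append(sample["text"])
                if len(self.buffer) > self.buffer_size:
                    self.buffer.pop(0)  # drop the oldest cached document

    def __len__(self):
        return self.virtual_len

    def __getitem__(self, idx):
        while not self.buffer:  # wait until the background thread has fetched something
            sleep(0.1)
        with self.lock:
            return random.choice(self.buffer)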
NaxAlpha / 1.json
Last active September 9, 2021 05:58
Badge Test
{
"message": "healthy",
"label": "ping",
"color": "green"
}