Skip to content

Instantly share code, notes, and snippets.

@daskol
daskol / compress.c
Created February 10, 2024 23:17
Philip Gage's original code for the Byte-Pair Encoding (BPE) compression technique.
/* compress.c */
/* Copyright 1994 by Philip Gage */
#include <stdio.h>
#define BLOCKSIZE 5000 /* Maximum block size */
#define HASHSIZE 4096 /* Size of hash table */
#define MAXCHARS 200 /* Char set per block */
#define THRESHOLD 3 /* Minimum pair count */
@daskol
daskol / main.py
Last active December 11, 2023 21:18
Benchmarking of OpenWebText parallel loader.
from argparse import ArgumentParser, Namespace
from pathlib import Path
from random import shuffle
from tqdm import tqdm
from openwebtext import OpenWebTextLoader, shuffle, take
parser = ArgumentParser()
parser.add_argument('-j', '--jobs', type=int, default=2)
@daskol
daskol / test_dali.py
Created September 20, 2023 11:49
Benchmark DALI against PyTorch
from os import getenv
from pathlib import Path
from typing import Callable
import numpy as np
import nvidia.dali.fn as fn
import nvidia.dali.ops as ops
import torch as T
from nvidia.dali.pipeline import pipeline_def
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy
@daskol
daskol / reset-device.cc
Last active June 30, 2023 20:10
Reset CUDA device if program counter was corrupted.
/**
* reset-device.cc
*
* Simple program which resets a device if a kernel launch has failed. This
* could happen due to an invalid program counter, for example.
*
* $ nvcc -o reset-device reset-device.cc
* $ ./reset-device
* cudaSuccess: no error
*/
@daskol
daskol / peds.py
Last active September 29, 2023 15:35
Py + Deps = <3
#!/usr/bin/env python
"""Py + Deps = <3: Simple script to install only the dependencies of a Python
package which follows PEP-517/PEP-518 guidelines.
"""
from argparse import ArgumentParser, Namespace
from dataclasses import dataclass, field
from pathlib import Path
from subprocess import check_call
from sys import version_info
@daskol
daskol / hf2aria.py
Last active June 28, 2023 13:45
Download a repo from HuggingFace Hub with aria2c.
#!/usr/bin/env python3
"""Little script for generating a download list for fetching model weights and
configuration files of a model from HuggingFace Hub. Once the download list is
ready, you can easily fetch all files, with throttling and suspending or
resuming, using `aria2c`, for example: aria2c -c -i index.txt.
"""
from pathlib import Path
from argparse import ArgumentParser, Namespace
@daskol
daskol / youtube-text.py
Created June 18, 2023 11:05
Fetch and parse auto-generated subtitles from a YouTube video
#!/usr/bin/env python
from argparse import ArgumentParser, Namespace
from datetime import datetime
from json import dump
from os import rename
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from xml.sax import ContentHandler
from xml.sax import parse as parse_xml
@daskol
daskol / llama.cpp-quantize.py
Created March 28, 2023 12:15
Post-Training Quantization for llama.cpp in Python
import numpy as np
def pack(xs, dtype='q4_0'):
assert dtype == 'q4_0', 'Only quantized int4 type is supported.'
assert xs.size % 2 == 0, 'Only arrays of even length.'
# Estimate magnitude of array elements and its inverse.
amax = abs(xs).max()
magnitude = amax / 0b0111
precision = np.float32(1) / magnitude if magnitude else np.float32(0)
@daskol
daskol / cnn.py
Created December 22, 2022 09:23
Non-jittable initialization of model in JAX/FLAX
from typing import Any, Callable
import flax.linen as nn
import jax
import jax.experimental.host_callback
import jax.numpy as jnp
from flax.linen.initializers import delta_orthogonal, variance_scaling
from phase_diagram import phase_boundary
@daskol
daskol / repartitioning.py
Created May 5, 2022 17:29
Repartitioning of LSapp dataset
import pandas as pd
def pick_uniques(df: pd.DataFrame, size: int = 100):
assert a.index.is_monotonic
assert 'uid' in df
index = []
uids = set()
for i, uid in df.uid.items():
if uid in uids: