Skip to content

Instantly share code, notes, and snippets.

View djberenberg's full-sized avatar

Dan Berenberg djberenberg

View GitHub Profile
import json
import gzip
from typing import Iterable
import pandas as pd
from tqdm import tqdm
def jsonl_to_df(filename: str) -> pd.DataFrame:
    """Load a JSON-lines file into a DataFrame, one row per record.

    Files whose name ends in ``.gz`` are read through ``gzip``; anything
    else is opened as plain text. A tqdm progress bar labelled with the
    file name tracks the records as they are parsed.

    Args:
        filename: Path to a ``.jsonl`` or ``.jsonl.gz`` file.

    Returns:
        A DataFrame built from the parsed records with
        ``pd.DataFrame.from_records``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    opener = gzip.open if filename.endswith(".gz") else open
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with opener(filename, "rt", encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at end of file) carry no
        # record and would make json.loads raise, so they are skipped.
        parsed = (json.loads(line) for line in f if line.strip())
        records = list(tqdm(parsed, desc=f"reading {filename}"))
    return pd.DataFrame.from_records(records)
@djberenberg
djberenberg / test_model.py
Created June 17, 2025 18:43
inference harness
"""
Runs inference with a model checkpoint on an input dataset stored as a JSONL file.
Writes the output to a directory as a JSONL file with one output record per input record.
"""
import argparse
import gzip
import json
import os
@djberenberg
djberenberg / minority_upsampler.py
Created December 17, 2024 16:45
minority upsampler
import itertools
import random
from abc import ABCMeta
from typing import Iterable, Sized
from torch.utils.data import Sampler
class SizedIterable(Sized, Iterable, metaclass=ABCMeta):
    """Abstract base for objects that support both ``len()`` and iteration.

    It adds no members of its own: it only combines ``Sized`` and
    ``Iterable``, so concrete subclasses must implement ``__len__`` and
    ``__iter__`` before they can be instantiated.
    """
@djberenberg
djberenberg / doi2bib.py
Created September 16, 2024 11:08
doi2bib
"""
Generate bibtex string from DOI.
Usage: python doi2bib.py DOI_STRING
Example: python doi2bib.py "10.1016/0022-2836(87)90412-8"
"""
import sys
import re
import gzip
exec(''.join(map(chr, map(lambda x: int(x, 2), gzip.decompress(b'\x1f\x8b\x08\x00J\xdf\xece\x02\xff\xcdYIb\xe30\x0c\xfbJ\x9f@\xfc\xffsM#G\x02\x17\xc8\xf2\xd6v\xe60\x9e\xd8\x96\xb8\x80 (\x9b\x99\xe1\xf5\xf7\xcb\xf0\xf3\x8f\xa1]\xa0]\xbc~\xb0\xed\xd6\xeb\xcf\xf6K{\xf8\xfd\xf4\xd7\xcf\xff\xf8\x99~K\xac\x83\xb6\x05\xbfe\xd6\x9f\x01\xd4\x82\xef_\xd8T\x1b\xb7\xe0-\xdc6\xa5u6S\xfb\xee\xce\xaf\xe1\xf2X9\x9aq%,6\xdb\xbd\x07j{\xc6>\x17\xb4i\x0b\x0b\xaf\x8c\xb8\x05\x9b\x11\\v\x0eFO\xc9/\xbdrXg\xacl#\x1a\x07\x16\xfc\xb8\x13\x1f\xe6\x0b#\xdf\xe3\xad\x96\xdc\x93H\xc8\x81\x9axA1\xf4Y\xbe\r\x1b\x1f\xb0\r\xbfr\xc4\xc6\xcad*%%\xb8<\xb2\xb3\x12g\xef{\xbd\xbb\xcc;\x1b\x06a\xa1DK\xcf\xa0K\xa5\x015\x120\xaa \xb0\x84!\x14\x08\xf3Oe\xb3\xcbE{=\x16\xe30\x03\x11\x1b\x1d\xd8\xcfU\xf7\xc6\x90\x08\xf1\xe9.O.\xc8\x0cv\'d\x19a\xc1\x8be\xde\x80\xd0l\xee\xf6\x04 m\xae\xca[\xd0o\xbd\xe3\xd3,:\x1c\x8d\x8eL\x84v#\x82p\x03\x8b\xf6h\x105q\xe9\x01%\xc9d\x07\x1d\xc5eDA\xb2D\xb7\xc7|U\xbe7\xc74,\xfe\x19S|\x98\xaa2\x16\xbeyj\xaaQW4_\xdd\
@djberenberg
djberenberg / word_cooccurences.py
Last active October 7, 2023 15:13
Counting word co-occurrences
import argparse
import gzip
import io
import re
import time
import itertools
from collections import Counter, defaultdict
from multiprocessing import Manager, Process, Queue, current_process
from string import punctuation
from typing import Iterable
@djberenberg
djberenberg / NCCL_errors.md
Last active April 21, 2021 02:56
NCCL errors

To reproduce

cd /mnt/home/dberenberg/projects/metagenomics

module load slurm
source load_env
source huggingface_meta/bin/activate

salloc -N2 -p gpu --gres=gpu:v100-32gb:02
@djberenberg
djberenberg / INSTRUCTIONS.md
Last active April 9, 2021 21:36
tensorboard port forwarding

Monitoring training via tensorboard on the flatiron cluster

To monitor training on a remote server via tensorboard, start a training instance and do the following:

ssh -NfL localhost:13000:localhost:13001 flatiron # on local
ssh flatiron # ssh into gateway
ssh -NfL localhost:13001:localhost:13002 rusty # on gateway
ssh rusty # ssh into rusty
@djberenberg
djberenberg / ae.py
Last active February 9, 2021 21:42
simple cnn ae
# Smallest possible example; written for Python 3.8.
from torch import nn

# Sizes used by the example.
batchsize = 64
dim = 48

# Convolution hyper-parameters.
inchannels = 1
outchannels = 16
kernel = 7

# Single 2-D convolution: inchannels -> outchannels with a square kernel.
conv = nn.Conv2d(inchannels, outchannels, kernel)

# Transposed convolution mapping outchannels -> 2 channels, followed by a
# softmax over dim=1 (the channel axis for batched N x C x H x W input),
# i.e. a two-way softmax classifier.
deconv = nn.Sequential(
    nn.ConvTranspose2d(outchannels, 2, kernel),
    nn.Softmax(dim=1),
)
@djberenberg
djberenberg / config
Created January 14, 2021 20:44
example of ssh aliasing (write to ~/.ssh/config)
# Alias typed on the command line, e.g. `ssh ALIAS_NAME`.
Host ALIAS_NAME
# Real host name (or IP address) that the alias resolves to.
Hostname HOST_DESTINATION
# Port the remote SSH server listens on.
Port YOUR_DESIGNATED_SSH_PORT
# Login name on the remote host.
User YOUR_USERNAME
# Forward X11 so remote graphical programs display locally.
ForwardX11 yes
# Treat remote X11 clients as trusted (full access to the local X display).
ForwardX11Trusted yes
# Control socket used for connection sharing;
# %r = remote user, %h = remote host, %p = remote port.
ControlPath ~/.ssh/.%r@%h:%p
# Reuse an existing master connection if one exists, otherwise create one.
ControlMaster auto