Skip to content

Instantly share code, notes, and snippets.

View jelmervdl's full-sized avatar

Jelmer jelmervdl

View GitHub Profile
import sys
import contextlib
import subprocess
@contextlib.contextmanager
def gzip(path, mode='r'):
"""Like gzip.open(), but using external gzip process which for some reason
is a lot faster on macOS."""
try:
@jelmervdl
jelmervdl / shuffle.py
Created September 29, 2022 15:53
Batch shuffling
#!/usr/bin/env python3
import subprocess
import random
import os
import re
import sys
import numpy as np
from collections import defaultdict
from itertools import accumulate
from bisect import bisect_right
@jelmervdl
jelmervdl / sha256sum.js
Created September 19, 2022 15:31
Get the same output as sha256sum for a URL. Equivalent to `curl -L --compressed $URL | sha256sum -b`
async function hash(url) {
const r = await fetch(url, {
'credentials': 'omit',
'method': 'GET',
'mode': 'cors'
});
const h = await crypto.subtle.digest('sha-256', await r.arrayBuffer());
const a = Array.from(new Uint8Array(h));
#!/usr/bin/env python3
import boto3
import gzip
import sys
import mimetypes
from pprint import pprint
from typing import Dict, Any
mimetypes.add_type('application/wasm', '.wasm')
mimetypes.add_type('text/markdown', '.md')
@jelmervdl
jelmervdl / col.py
Created June 21, 2022 11:44
Apply a subprocess to only a single column of a TSV
#!/usr/bin/env python3
import sys
import os
import signal
from traceback import print_exc
from subprocess import Popen, PIPE
from threading import Thread
from queue import SimpleQueue
from typing import Optional, TypeVar
from functools import wraps
@jelmervdl
jelmervdl / sample.py
Created June 17, 2022 14:52
Head + tail + random sample of file
import random
from math import exp, log, floor
def reservoir_sample(k, it, *, rand: random.Random = random._inst):
sample = []
numbered_it = enumerate(it)
for i, (_, line) in zip(range(k), numbered_it):
@jelmervdl
jelmervdl / dedupe.py
Created June 9, 2022 21:03
Near-duplicate deduplication
#!/usr/bin/env python3
import sys
import gzip
import os
from collections import defaultdict
from xxhash import xxh64
from unicodedata import category as cat
from unidecode import unidecode
from functools import reduce
from tqdm.autonotebook import tqdm
@jelmervdl
jelmervdl / bergamot-translator-node.js
Last active March 24, 2022 18:22
Call the WASM Bergamot translator from Node.js (tested with Node v17.2.0)
const {Blob} = require('buffer');
const fs = require('fs');
const https = require('https');
// Read the compiled WebAssembly binary synchronously; the path is relative to
// the current working directory.
const wasmBinary = fs.readFileSync('./bergamot-translator-worker.wasm');
// Publish the binary and the onRuntimeInitialized callback as a global
// `Module` object BEFORE the worker script below is evaluated.
// NOTE(review): onRuntimeInitialized is not defined in the visible part of the
// file; presumably it is a hoisted function declaration further down - confirm.
// NOTE(review): presumably the worker script is Emscripten-generated and picks
// up this pre-existing global `Module` - confirm.
global.Module = {wasmBinary, onRuntimeInitialized};
// Execute bergamot-translator-worker.js in this scope
const js = fs.readFileSync('./bergamot-translator-worker.js', {encoding:'utf8'});
// Invoking eval through .call() makes it an indirect eval, so the script is
// evaluated in the global scope rather than in this module's local scope.
eval.call(global, js);
@jelmervdl
jelmervdl / parse-model.py
Created March 14, 2022 15:04
Parse marian binary model
#!/usr/bin/env python3
import sys
import struct
from pprint import pprint
import argparse
import mmap
from typing import NamedTuple
class Reader:
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: sentencepiece_model.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)