Skip to content

Instantly share code, notes, and snippets.

@jelmervdl
jelmervdl / oxford-comma.js
Created Jan 25, 2023
Pointless exercise in expressing the difference an Oxford comma makes
View oxford-comma.js
/**
 * Join items into an English enumeration that uses the Oxford (serial) comma,
 * e.g. `['1', '2', '3']` -> `"1, 2, and 3"`.
 *
 * Short lists are handled explicitly, because the serial comma only applies
 * to lists of three or more items:
 *   - no items  -> `""`
 *   - one item  -> the item itself
 *   - two items -> `"a and b"` (no comma)
 *
 * @param {string[]} strs - Items to join.
 * @returns {string} The joined enumeration.
 */
const joinA = (strs) => {
  const count = strs.length;
  if (count === 0) {
    return '';
  }
  if (count === 1) {
    return `${strs[0]}`;
  }
  if (count === 2) {
    return `${strs[0]} and ${strs[1]}`;
  }
  // Prefix the last item with "and" and let join() place a comma before it.
  return [...strs.slice(0, -1), `and ${strs[count - 1]}`].join(', ');
};
/**
 * Join items into an English enumeration WITHOUT an Oxford comma,
 * e.g. `['1', '2', '3']` -> `"1, 2 and 3"`.
 *
 * Empty and single-item lists are handled explicitly so the result never
 * contains a dangling " and " or the text "undefined":
 *   - no items -> `""`
 *   - one item -> the item itself
 *
 * @param {string[]} strs - Items to join.
 * @returns {string} The joined enumeration.
 */
const joinB = (strs) => {
  const count = strs.length;
  if (count === 0) {
    return '';
  }
  if (count === 1) {
    return `${strs[0]}`;
  }
  // Everything but the last item is comma-separated; the last is attached with " and ".
  return `${strs.slice(0, -1).join(', ')} and ${strs[count - 1]}`;
};
// Demo input: the decimal strings "1" through "10".
const strs = Array.from({length: 10}, (_, idx) => String(idx + 1));
// Print the Oxford-comma variant first, then the plain variant, for comparison.
for (const join of [joinA, joinB]) {
  console.log(join(strs));
}
View external_gzip.py
import sys
import contextlib
import subprocess
@contextlib.contextmanager
def gzip(path, mode='r'):
"""Like gzip.open(), but using external gzip process which for some reason
is a lot faster on macOS."""
try:
@jelmervdl
jelmervdl / shuffle.py
Created Sep 29, 2022
Batch shuffling
View shuffle.py
#!/usr/bin/env python3
import subprocess
import random
import os
import re
import sys
import numpy as np
from collections import defaultdict
from itertools import accumulate
from bisect import bisect_right
@jelmervdl
jelmervdl / sha256sum.js
Created Sep 19, 2022
Get the same output as sha256sum for a URL. Equivalent of `curl -L --compressed $URL | sha256sum -b`
View sha256sum.js
async function hash(url) {
const r = await fetch(url, {
'credentials': 'omit',
'method': 'GET',
'mode': 'cors'
});
const h = await crypto.subtle.digest('sha-256', await r.arrayBuffer());
const a = Array.from(new Uint8Array(h));
View patch-s3-bucket.py
#!/usr/bin/env python3
import boto3
import gzip
import sys
import mimetypes
from pprint import pprint
from typing import Dict, Any
# Explicitly map the .wasm and .md extensions to MIME types in the mimetypes
# module's table (presumably because the default table lacks them on some
# systems — confirm).
mimetypes.add_type('application/wasm', '.wasm')
mimetypes.add_type('text/markdown', '.md')
@jelmervdl
jelmervdl / col.py
Created Jun 21, 2022
Apply subprocess to only a single column in a TSV
View col.py
#!/usr/bin/env python3
import sys
import os
import signal
from traceback import print_exc
from subprocess import Popen, PIPE
from threading import Thread
from queue import SimpleQueue
from typing import Optional, TypeVar
from functools import wraps
@jelmervdl
jelmervdl / sample.py
Created Jun 17, 2022
Head + tail + random sample of file
View sample.py
import random
from math import exp, log, floor
def reservoir_sample(k, it, *, rand: random.Random = random._inst):
sample = []
numbered_it = enumerate(it)
for i, (_, line) in zip(range(k), numbered_it):
@jelmervdl
jelmervdl / dedupe.py
Created Jun 9, 2022
Near duplicate deduplication
View dedupe.py
#!/usr/bin/env python3
import sys
import gzip
import os
from collections import defaultdict
from xxhash import xxh64
from unicodedata import category as cat
from unidecode import unidecode
from functools import reduce
from tqdm.autonotebook import tqdm
@jelmervdl
jelmervdl / bergamot-translator-node.js
Last active Mar 24, 2022
Call wasm bergamot translator from node.js (tested with Node v17.2.0)
View bergamot-translator-node.js
const {Blob} = require('buffer');
const fs = require('fs');
const https = require('https');
// Read the compiled WebAssembly binary synchronously; the relative path is
// resolved against the process's current working directory.
const wasmBinary = fs.readFileSync('./bergamot-translator-worker.wasm');
// Publish a global `Module` object carrying the wasm bytes and the
// `onRuntimeInitialized` callback. That callback is not defined in the visible
// part of this file — presumably a hoisted function declaration further down.
// NOTE(review): presumably the worker script below reads this global as its
// configuration (Emscripten-style glue code) — confirm.
global.Module = {wasmBinary, onRuntimeInitialized};
// Execute bergamot-translator-worker.js in the global scope
const js = fs.readFileSync('./bergamot-translator-worker.js', {encoding:'utf8'});
// `eval.call(...)` is an indirect eval: the source is evaluated in the global
// scope rather than this module's local scope, so it can see `global.Module`.
// NOTE(review): this runs the file's contents as code — only safe while the
// worker script on disk is trusted.
eval.call(global, js);
@jelmervdl
jelmervdl / parse-model.py
Created Mar 14, 2022
Parse marian binary model
View parse-model.py
#!/usr/bin/env python3
import sys
import struct
from pprint import pprint
import argparse
import mmap
from typing import NamedTuple
class Reader: