Skip to content

Instantly share code, notes, and snippets.

@dirkgr
dirkgr / mmap_hash.py
Created December 14, 2021 19:28
Hashing a memory-mapped file with Python
import mmap
import xxhash
def _checksum_artifact(path: PathOrStr) -> str:
filepath = Path(path)
if not filepath.is_file():
raise FileNotFoundError(str(filepath))
h = xxhash.xxh128()
with filepath.open("rb") as f:
@dirkgr
dirkgr / gist:25aac9f8dc24c8f3d548ec20fd967002
Created December 14, 2021 19:23
Multiplying a 60000x60000 matrix with itself in torch
float32: 3.71 s ± 2.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
float16: 2.29 s ± 8.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
bfloat16: 2.29 s ± 9.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
torch.backends.cuda.matmul.allow_tf32 = False
float32: 24.4 s ± 41.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@dirkgr
dirkgr / unique.py
Created September 23, 2020 01:36
A wrapper for a Python generator that ensures all returned items are unique
import dill
import mmh3
import typing
import io
def hash_object(o: typing.Any) -> bytes:
    """Return a MurmurHash3 digest of the dill-serialized form of *o*.

    The object is pickled with ``dill`` and the resulting byte string is
    hashed with ``mmh3.hash_bytes`` (x64 variant). Two objects get the same
    digest exactly when their dill serializations are byte-identical.

    Args:
        o: Any object that ``dill`` is able to serialize.

    Returns:
        The raw digest as ``bytes`` (not ``str``), as produced by
        ``mmh3.hash_bytes``.

    Raises:
        Whatever ``dill`` raises when *o* cannot be serialized.
    """
    # NOTE(review): the digest is only as stable as dill's output; presumably
    # it should not be persisted across dill/Python versions -- confirm.
    # dill.dumps is the in-memory equivalent of dumping into a BytesIO buffer
    # and reading the buffer back with getvalue().
    return mmh3.hash_bytes(dill.dumps(o), x64arch=True)
@dirkgr
dirkgr / stopwords.py
Created September 23, 2020 01:33
A better list of stop words in Python
STOPWORDS = {
"i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
"you",
@dirkgr
dirkgr / near_duplicates.py
Created September 23, 2020 01:31
A generator that wraps another generator, but filters out near-duplicate strings as it goes along
import collections
import typing
_T = typing.TypeVar('_T')
def filter_near_duplicates(items: typing.Iterable[_T], key = lambda x: x) -> typing.Generator[_T, None, None]:
"""Filters out items that overlap too much with items we've seen earlier in the sequence."""
trigram_to_sentence_indices = collections.defaultdict(set)
for sentence_index, item in enumerate(items):
sentence = key(item)
trigrams = [sentence[i:i+3] for i in range(len(sentence) - 1)]
@dirkgr
dirkgr / logging_tqdm.py
Last active November 14, 2023 04:54
A drop-in replacement for TQDM which writes log messages instead of progress bars
from typing import *
import time
import logging
def logging_tqdm(
i,
*,
logger: Optional[logging.Logger] = None,
desc: str = "Working",
total: Optional[int] = None,
@dirkgr
dirkgr / distinctIterator.scala
Created November 30, 2018 23:39
distinct iterator
def distinctFromIterator[T](input: Iterator[T]): Iterator[T] = new Iterator[T] {
private val seen: mutable.Set[T] = mutable.Set[T]()
private def findNextItem(): Option[T] = {
if(input.hasNext) {
val n = input.next()
val newItem = seen.add(n)
if(newItem)
Some(n)
else
findNextItem()
@dirkgr
dirkgr / linesFromFile.scala
Created November 15, 2018 23:29
Iterator of lines of text from file
def linesFromFile(filename: String): Iterator[String] = new Iterator[String] {
val bufferedReader = {
val fileInputStream = new FileInputStream(filename)
val decompressedInputStream =
if(filename.endsWith(".gz")) new GZIPInputStream(fileInputStream) else fileInputStream
val reader = new InputStreamReader(decompressedInputStream, "UTF-8")
new BufferedReader(reader)
}
private var nextLine = bufferedReader.readLine()
@dirkgr
dirkgr / mp_map.py
Created October 16, 2018 00:15
A map function that uses multiple processes to map, but does it more efficiently than the `multiprocessing` library by using the magic of Unix forking
from typing import *
import multiprocessing as mp
def mp_map(fn, input_sequence: Iterable) -> Iterable:
input_queue = mp.Queue()
output_queue = mp.Queue()
def process_items():
while True:
item = input_queue.get()
@dirkgr
dirkgr / map_per_process.py
Last active October 19, 2018 23:43
A map function that runs every iteration in a separate process, in parallel
from typing import *
import multiprocessing as mp
import multiprocessing.connection
def map_per_process(fn, input_sequence: Iterable) -> Iterable:
pipeno_to_pipe: Dict[int, multiprocessing.connection.Connection] = {}
pipeno_to_process: Dict[int, mp.Process] = {}
def process_one_item(send_pipe: multiprocessing.connection.Connection, item):
try: