This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mmap | |
import xxhash | |
def _checksum_artifact(path: PathOrStr) -> str: | |
filepath = Path(path) | |
if not filepath.is_file(): | |
raise FileNotFoundError(str(filepath)) | |
h = xxhash.xxh128() | |
with filepath.open("rb") as f: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
float32: 3.71 s ± 2.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) | |
float16: 2.29 s ± 8.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) | |
bfloat16: 2.29 s ± 9.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) | |
torch.backends.cuda.matmul.allow_tf32 = False | |
float32: 24.4 s ± 41.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dill | |
import mmh3 | |
import typing | |
import io | |
def hash_object(o: typing.Any) -> bytes:
    """Return a 128-bit MurmurHash3 digest of an arbitrary Python object.

    The object is serialized with ``dill`` and the resulting byte string is
    hashed with ``mmh3.hash_bytes``.

    Args:
        o: Any object that ``dill`` is able to serialize.

    Returns:
        The digest as a ``bytes`` object (``mmh3.hash_bytes`` returns raw
        bytes, not a ``str``). Call ``.hex()`` on the result if a printable
        form is needed.

    Note:
        The digest is only as stable as the ``dill`` serialization of ``o``;
        two equal objects that pickle differently will hash differently.
    """
    # dill.dumps serializes straight to bytes, so no intermediate buffer
    # object needs to be managed here.
    serialized = dill.dumps(o)
    return mmh3.hash_bytes(serialized, x64arch=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
STOPWORDS = { | |
"i", | |
"me", | |
"my", | |
"myself", | |
"we", | |
"our", | |
"ours", | |
"ourselves", | |
"you", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import typing | |
_T = typing.TypeVar('_T') | |
def filter_near_duplicates(items: typing.Iterable[_T], key = lambda x: x) -> typing.Generator[_T, None, None]: | |
"""Filters out items that overlap too much with items we've seen earlier in the sequence.""" | |
trigram_to_sentence_indices = collections.defaultdict(set) | |
for sentence_index, item in enumerate(items): | |
sentence = key(item) | |
trigrams = [sentence[i:i+3] for i in range(len(sentence) - 1)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import * | |
import time | |
import logging | |
def logging_tqdm( | |
i, | |
*, | |
logger: Optional[logging.Logger] = None, | |
desc: str = "Working", | |
total: Optional[int] = None, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def distinctFromIterator[T](input: Iterator[T]): Iterator[T] = new Iterator[T] { | |
private val seen: mutable.Set[T] = mutable.Set[T]() | |
private def findNextItem(): Option[T] = { | |
if(input.hasNext) { | |
val n = input.next() | |
val newItem = seen.add(n) | |
if(newItem) | |
Some(n) | |
else | |
findNextItem() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def linesFromFile(filename: String): Iterator[String] = new Iterator[String] { | |
val bufferedReader = { | |
val fileInputStream = new FileInputStream(filename) | |
val decompressedInputStream = | |
if(filename.endsWith(".gz")) new GZIPInputStream(fileInputStream) else fileInputStream | |
val reader = new InputStreamReader(decompressedInputStream, "UTF-8") | |
new BufferedReader(reader) | |
} | |
private var nextLine = bufferedReader.readLine() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import * | |
import multiprocessing as mp | |
def mp_map(fn, input_sequence: Iterable) -> Iterable: | |
input_queue = mp.Queue() | |
output_queue = mp.Queue() | |
def process_items(): | |
while True: | |
item = input_queue.get() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import * | |
import multiprocessing as mp | |
import multiprocessing.connection | |
def map_per_process(fn, input_sequence: Iterable) -> Iterable: | |
pipeno_to_pipe: Dict[int, multiprocessing.connection.Connection] = {} | |
pipeno_to_process: Dict[int, mp.Process] = {} | |
def process_one_item(send_pipe: multiprocessing.connection.Connection, item): | |
try: |
NewerOlder