Skip to content

Instantly share code, notes, and snippets.

View cemoody's full-sized avatar
👋

Christopher Erick Moody cemoody

👋
View GitHub Profile
"""Wrapper around BigQuery call."""
from __future__ import annotations
from typing import Any, Iterable
import logging
from google.cloud import bigquery_storage
from google.cloud.bigquery_storage_v1 import exceptions as bqstorage_exceptions
from google.cloud.bigquery_storage_v1 import types, writer
from google.protobuf import descriptor_pb2
from google.protobuf.descriptor import Descriptor
from google.cloud.bigquery_storage import BigQueryReadClient
from google.cloud.bigquery_storage import types
from google.cloud import bigquery_storage
from tqdm import tqdm
import pandas
import os
import dill
project_id = (
import os
import io
import json
import math
import time
import random
import numpy as np
import cachetools.func
import sqlite3
from loguru import logger
✓ Initialized. View app at https://modal.com/apps/ap-lHATR9JHJ7S5eGXYHigc75
✓ Created objects.
├── 🔨 Created sample_fn.
├── 🔨 Mounted /Users/chris/code/search/gumbase/jobs/partition.py at /root
├── 🔨 Mounted /Users/chris/code/gumhouse/gumhouse at /root/gumhouse
├── 🔨 Created sample_job.
├── 🔨 Created fit_top_level_kmeans.
├── 🔨 Created scatter_by_centroids_single.
├── 🔨 Created scatter_by_centroids.
├── 🔨 Created gather_single.
apiVersion: v1
kind: Service
metadata:
name: vespa
labels:
app: vespa
spec:
selector:
app: vespa
type: NodePort
@cemoody
cemoody / file_data_loader.py
Created December 16, 2022 21:00
A multiprocess Parquet DataLoader for PyTorch. Great for loading large sequential-access datasets. Easy to install, modify, and use.
import multiprocessing
import queue
from loguru import logger
import pandas as pd
def chunks(df, chunk_size=1000):
    """Yield consecutive slices of ``df`` holding at most ``chunk_size`` items.

    Args:
        df: Any object that supports ``len()`` and slice indexing
            (presumably a pandas DataFrame, given the name -- confirm
            against callers; lists and strings work as well).
        chunk_size: Maximum number of items per slice. Must be positive.

    Yields:
        ``df[start : start + chunk_size]`` for ``start`` = 0, ``chunk_size``,
        ``2 * chunk_size``, ... while ``start < len(df)``. The final slice
        may be shorter; an empty ``df`` yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces on first iteration, not at call
            time.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently produce no chunks at all (negative).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    for start in range(0, len(df), chunk_size):
        yield df[start : start + chunk_size]
import os
import pickle
import time
import glob
import joblib
import inspect
from loguru import logger
def persist_to_file():
def switch_modal_function(stub=None, use_modal=True, **kwargs):
    """Build a decorator that optionally registers a function through ``stub``.

    Args:
        stub: Object exposing ``function(**kwargs)``, which must return a
            decorator (presumably a Modal stub -- confirm against callers).
            Required when ``use_modal`` is true.
        use_modal: When true, the decorated function is passed through
            ``stub.function(**kwargs)``; when false it is returned untouched.
        **kwargs: Forwarded verbatim to ``stub.function``.

    Returns:
        A decorator taking the function to wrap. It returns either the
        result of ``stub.function(**kwargs)(inner_func)`` or ``inner_func``
        itself.

    Raises:
        AssertionError: When the returned decorator is applied with
            ``use_modal`` true and ``stub`` left as ``None``.
    """

    def wrapper(inner_func):
        if not use_modal:
            return inner_func
        if stub is None:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type and message are
            # kept so existing callers see the same failure.
            raise AssertionError("If using modal, please provide `stub`.")
        return stub.function(**kwargs)(inner_func)

    return wrapper
def check_line_by_line(fn_result, fn_truth):
logger.info(f"Checking line by line in {fn_result} and {fn_truth}")
with open(fn_result, 'r') as results:
res = results.readlines()
with open(fn_truth, 'r') as truth:
tru = truth.readlines()
for i, (line_t, line_r) in enumerate(zip(tru, res)):
features = line_t.split('\x01')
@cemoody
cemoody / typecast.py
Last active September 22, 2022 18:33
from typing import get_type_hints
def cast(argname, value, hints):
""" Only cast arguments if type hints are available for them.
"""
if argname in hints:
expected_type = hints[argname]
if not issubclass(type(value), expected_type):
# Will throw type error if argument cannot be cast