Make sure slackclient v1.3.1 is installed (for apache-airflow 1.10).
pip install -U "apache-airflow[slack,...]"
import random
from metaflow import FlowSpec, step, S3, Flow, Parameter, profile, kubernetes, conda, conda_base
# change columns according to your schema (or remove column list to load all)
COLUMNS = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']
# group parquet files as 1GB batches
def shard_data(src, batch_size=1_000_000_000):
    with S3() as s3:
        objs = s3.list_recursive([src])
import os
from metaflow import S3
def put_dir(local_root, s3root):
    root = os.path.abspath(local_root)
    objs = []
    for p, _, files in os.walk(root):
        for f in files:
            path = os.path.join(p, f)
            key = os.path.relpath(path, start=root)
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import IterableDataset, DataLoader
class DistributedIterableDataset(IterableDataset):
    """
    Example implementation of an IterableDataset that handles both multiprocessing (num_workers > 0)
name: CI
on: [push]
jobs:
  test:
    runs-on: ubuntu-latest
    services:
Make sure slackclient v1.3.1 is installed (for apache-airflow 1.10).
pip install -U "apache-airflow[slack,...]"
This gist started with a collection of resources I was maintaining on stream data processing — also known as distributed logs, data pipelines, event sourcing, CQRS, and other names.
Over time the set of resources grew quite large and I received some interest in a more guided, opinionated path for learning about stream data processing. So I added the reading list.
Please send me feedback!
# -*- coding: utf-8 -*-
"""
@author: goraj
"""
import lightgbm as lgbm
from sklearn.datasets import load_digits
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np
# data I/O
data = open('input.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)