- Install Jupyter Lab
conda install -c conda-forge jupyterlab
- Create certs
openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout mycert.pem -out mycert.pem
# Add a row number indicating the first message per app & microservice.
# This is analogous to the SQL:
#   row_number() over (partition by id, topic order by msg_ts asc)
# NOTE(review): rows are sorted by ['id', 'msg_ts'] then grouped by
# ['id', 'topic'] — assumes the sort leaves each (id, topic) group
# ordered by msg_ts, which pandas' stable sort guarantees.
df['row_num'] = df.sort_values(['id', 'msg_ts'], ascending=True).groupby(['id', 'topic']).cumcount() + 1
# Source: https://towardsdatascience.com/a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one-1977dd701dbc
import pandas as pd

# Count of missing values in each column of `data` (defined elsewhere).
missing = pd.DataFrame(data.isnull().sum()).rename(columns={0: 'total'})
# Fraction of rows missing per column — a 0.0-1.0 ratio, despite the
# 'percent' column name.
missing['percent'] = missing['total'] / len(data)
"""Generate a bunch of fake avro data and upload to S3.

Running in Python 3.7. Installed the following:
    - pip install Faker
    - pip install fastavro
    - pip install boto3
    - pip install graphviz
    - brew install graphviz
"""
import sys

import dask.bag as db
def gt(x):
    """Return True if ``x`` is strictly greater than 3."""
    return x > 3
def even(x):
    """Return True if ``x`` is evenly divisible by 2."""
    return x % 2 == 0
# SQL tip: avoid SELECT * — specify explicit column names (columnar store).
"""
Read accompanying blog post: https://ianwhitestone.work/Zappa-Zip-Callbacks
"""
import os
import re
import shutil
import tarfile
import zipfile

## Great Expectations workflow
import great_expectations as ge

# Build up expectations on a sample dataset and save them.
train = ge.read_csv("data/npi.csv")
train.expect_column_values_to_not_be_null("NPI")
train.save_expectation_suite("npi_csv_expectations.json")

# Load in a new dataset so the saved expectations can be run against it.
test = ge.read_csv("data/npi_new.csv")
"""
Trigger slack notifications
"""
import argparse
import logging
import os

from slack.web.client import WebClient

# Module-level logger named after this module (standard logging convention).
LOGGER = logging.getLogger(__name__)