Ian Whitestone ian-whitestone

## row_num.py
# Add a row number so the first message per app & microservice can be identified.
# This is analogous to the SQL: row_number() over (partition by id, topic order by msg_ts asc)
df['row_num'] = (
    df.sort_values(['id', 'msg_ts'], ascending=True)
    .groupby(['id', 'topic'])
    .cumcount()
    + 1  # cumcount() is zero-based; SQL row_number() starts at 1
)

## jupyter_setup.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                ian-whitestone
                / jupyter_setup.md
            
            
              Created
              August 29, 2018 13:02
            
              
                Setting up Jupyter Lab on an Ubuntu instance
              
          
    Setting Up Jupyter Lab on an EC2 Instance


Install Jupyter Lab

conda install -c conda-forge jupyterlab

Create certs

openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout mycert.pem -out mycert.pem

  
## missing.py
# Source: https://towardsdatascience.com/a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one-1977dd701dbc

import pandas as pd

# Count the null entries in each column; the counts land in a column named 'total'
missing = (
    pd.DataFrame(data.isnull().sum())
    .rename(columns={0: 'total'})
)

# Fraction of rows (0-1, not 0-100) that are missing per column, stored under 'percent'
missing['percent'] = missing['total'].div(len(data))

## fake_data.py
"""Generate a bunch of fake avro data and upload to s3

Running in python 3.7. Installed the following:

- pip install Faker
- pip install fastavro
- pip install boto3
- pip install graphviz
- brew install graphviz
"""

## dask_bag_filter_issue.py
import sys

import dask.bag as db


def gt(x, threshold=3):
    """Return whether ``x`` is strictly greater than ``threshold``.

    Args:
        x: Value to compare; anything supporting ``>`` against ``threshold``.
        threshold: Exclusive lower bound. Defaults to 3, so calling ``gt(x)``
            with a single argument (e.g. as a filter predicate) behaves
            exactly as before the bound became configurable.

    Returns:
        The result of ``x > threshold``.
    """
    return x > threshold

def even(x):
    """Return whether ``x`` is even, i.e. ``x % 2`` equals 0."""
    remainder = x % 2
    return remainder == 0

## notes.md

      
              1 file
            
          
              7 forks
            
          
              1 comment
            
          
              56 stars
            
          
                ian-whitestone
                / notes.md
            
            
              Last active
              March 1, 2023 01:45
            
              
                Best practices for Presto SQL
              
          
    Presto Specific


Don’t SELECT *; specify explicit column names (columnar store)
Avoid large JOINs (filter each table first)

In Presto, tables are joined in the order they are listed!
Join small tables earlier in the plan and leave larger fact tables to the end.
Avoid cross joins or one-to-many joins, as these can degrade performance.


ORDER BY and GROUP BY take time

Only use ORDER BY in subqueries if it is really necessary


When using GROUP BY, order the columns from the highest cardinality (that is, the largest number of unique values) to the lowest.


## resources.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                ian-whitestone
                / resources.md
            
            
              Last active
              February 18, 2019 17:33
            
              
                Plotting & analysis with R
              
          
https://github.com/ian-whitestone/toronto-housing/blob/master/data/analysis/analysis.R
https://github.com/ian-whitestone/khp-analytics/blob/master/analysis/eda.R
https://github.com/ian-whitestone/nba-dfs/blob/master/NBA_eda_report.Rmd


## zappa_package_cleaner.py
"""
Read accompanying blog post: https://ianwhitestone.work/Zappa-Zip-Callbacks
"""
import os
import re
import shutil
import tarfile
import zipfile


## great_expecations_examples.py
## Pandas
import great_expectations as ge

# Build up expectations on a sample dataset and save them
train = ge.read_csv("data/npi.csv")
# Declare the expectation that the "NPI" column contains no null values
train.expect_column_values_to_not_be_null("NPI")
# Persist the expectation suite under the given file name so it can be reused
train.save_expectation_suite("npi_csv_expectations.json")

# Load in a new dataset and test them
# NOTE(review): only the load is visible here; the call that validates `test`
# against the saved suite is not part of this excerpt.
test = ge.read_csv("data/npi_new.csv")

## notify.py
"""
Trigger slack notifications
"""
import argparse
import logging
import os

from slack.web.client import WebClient

# Module-level logger, named after this module via __name__
LOGGER = logging.getLogger(__name__)
	# Let's add a row number to indicate the first message per app & microservice
	# This code is analogous to the SQL: row_number() over (partition by id, topic order by msg_ts asc)
	df['row_num'] = df.sort_values(['id', 'msg_ts'], ascending=True).groupby(['id', 'topic']).cumcount() + 1
	# Source: https://towardsdatascience.com/a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one-1977dd701dbc

	import pandas as pd

	# Number of missing in each column
	missing = pd.DataFrame(data.isnull().sum()).rename(columns = {0: 'total'})

	# Create a percentage missing
	missing['percent'] = missing['total'] / len(data)
	"""Generate a bunch of fake avro data and upload to s3

	Running in python 3.7. Installed the following:

	- pip install Faker
	- pip install fastavro
	- pip install boto3
	- pip install graphviz
	- brew install graphviz
	"""
	import sys

	import dask.bag as db


	def gt(x):
		"""Return whether ``x`` is strictly greater than 3."""
		# The body must be indented one level deeper than its ``def``; at the
		# same level Python raises an IndentationError.
		return x > 3

	def even(x):
		"""Return whether ``x`` is even, i.e. ``x % 2`` equals 0."""
		# The body must be indented one level deeper than its ``def``; at the
		# same level Python raises an IndentationError.
		return x % 2 == 0
	"""
	Read accompanying blog post: https://ianwhitestone.work/Zappa-Zip-Callbacks
	"""
	import os
	import re
	import shutil
	import tarfile
	import zipfile
	## Pandas
	import great_expectations as ge

	# Build up expectations on a sample dataset and save them
	train = ge.read_csv("data/npi.csv")
	train.expect_column_values_to_not_be_null("NPI")
	train.save_expectation_suite("npi_csv_expectations.json")

	# Load in a new dataset and test them
	test = ge.read_csv("data/npi_new.csv")
	"""
	Trigger slack notifications
	"""
	import argparse
	import logging
	import os

	from slack.web.client import WebClient

	LOGGER = logging.getLogger(__name__)