Ian Whitestone ian-whitestone

## zappa_package_cleaner.py
"""
Read accompanying blog post: https://ianwhitestone.work/Zappa-Zip-Callbacks
"""
import os
import re
import shutil
import tarfile
import zipfile


## resources.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                ian-whitestone
                / resources.md
            
            
              Last active
              February 18, 2019 17:33
            
              
                Plotting & analysis with R
              
          
https://github.com/ian-whitestone/toronto-housing/blob/master/data/analysis/analysis.R
https://github.com/ian-whitestone/khp-analytics/blob/master/analysis/eda.R
https://github.com/ian-whitestone/nba-dfs/blob/master/NBA_eda_report.Rmd


## notes.md

      
              1 file
            
          
              7 forks
            
          
              1 comment
            
          
              56 stars
            
          
                ian-whitestone
                / notes.md
            
            
              Last active
              March 1, 2023 01:45
            
              
                Best practices for Presto SQL
              
          
    Presto Specific


Don’t use SELECT *; specify explicit column names (columnar store)
Avoid large JOINs (filter each table first)

In Presto, tables are joined in the order they are listed!
Join small tables earlier in the plan and leave larger fact tables to the end
Avoid cross joins or 1 to many joins as these can degrade performance


ORDER BY and GROUP BY take time

Only use ORDER BY in subqueries if it is really necessary


When using GROUP BY, order the columns by the highest cardinality (that is, most number of unique values) to the lowest.


## dask_bag_filter_issue.py
import sys

import dask.bag as db


def gt(x):
    """Return True when ``x`` is strictly greater than 3."""
    threshold = 3
    return x > threshold

def even(x):
    """Return True when ``x`` leaves no remainder on division by 2."""
    remainder = x % 2
    return remainder == 0

## fake_data.py
"""Generate a bunch of fake avro data and upload to s3

Running in python 3.7. Installed the following:

- pip install Faker
- pip install fastavro
- pip install boto3
- pip install graphviz
- brew install graphviz
"""

## missing.py
# Source: https://towardsdatascience.com/a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one-1977dd701dbc

import pandas as pd

# Count the null entries in each column of `data` (assumed to be a pandas
# DataFrame defined elsewhere) and label the resulting column 'total'.
missing = pd.DataFrame(data.isnull().sum()).rename({0: 'total'}, axis='columns')

# Null count divided by the number of rows. Despite the column name, this is
# a fraction (it is not multiplied by 100).
missing['percent'] = missing['total'].div(len(data))

## jupyter_setup.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                ian-whitestone
                / jupyter_setup.md
            
            
              Created
              August 29, 2018 13:02
            
              
                Setting up Jupyter Lab on an Ubuntu instance
              
          
    Setting Up Jupyter Lab on an EC2


Install Jupyter Lab

conda install -c conda-forge jupyterlab

Create certs

openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout mycert.pem -out mycert.pem

  
## row_num.py
# Add a row number so that row_num == 1 marks the first message per app & microservice.
# This code is analogous to the SQL: row_number() over (partition by id, topic order by msg_ts asc)
df['row_num'] = (
    df.sort_values(by=['id', 'msg_ts'])  # ascending order is the default
    .groupby(['id', 'topic'])
    .cumcount()
    .add(1)  # cumcount() is zero-based; shift so numbering starts at 1
)
	"""
	Read accompanying blog post: https://ianwhitestone.work/Zappa-Zip-Callbacks
	"""
	import os
	import re
	import shutil
	import tarfile
	import zipfile
	import sys

	import dask.bag as db


	def gt(x):
	    """Return True when ``x`` is strictly greater than 3."""
	    return x > 3

	def even(x):
	    """Return True when ``x`` is evenly divisible by 2."""
	    return x % 2 == 0
	"""Generate a bunch of fake avro data and upload to s3

	Running in python 3.7. Installed the following:

	- pip install Faker
	- pip install fastavro
	- pip install boto3
	- pip install graphviz
	- brew install graphviz
	"""
	# Source: https://towardsdatascience.com/a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one-1977dd701dbc

	import pandas as pd

	# Count the null entries in each column of `data` (assumed to be a pandas
	# DataFrame defined elsewhere) and label the resulting column 'total'.
	missing = pd.DataFrame(data.isnull().sum()).rename({0: 'total'}, axis='columns')

	# Null count divided by the number of rows. Despite the column name, this is
	# a fraction (it is not multiplied by 100).
	missing['percent'] = missing['total'].div(len(data))
	# Add a row number so that row_num == 1 marks the first message per app & microservice.
	# This code is analogous to the SQL: row_number() over (partition by id, topic order by msg_ts asc)
	df['row_num'] = (
	    df.sort_values(by=['id', 'msg_ts'])  # ascending order is the default
	    .groupby(['id', 'topic'])
	    .cumcount()
	    .add(1)  # cumcount() is zero-based; shift so numbering starts at 1
	)