Andrew Gross (andrewgross)
@andrewgross
andrewgross / Makefile
Created August 13, 2015 15:22
Git Tagging in Makefile
tag:
	@if [ $$(git rev-list $$(git describe --abbrev=0 --tags)..HEAD --count) -gt 0 ]; then \
		if [ $$(git log -n 1 --oneline $$(git describe --abbrev=0 --tags)..HEAD CHANGELOG.md | wc -l) -gt 0 ]; then \
			git tag $$(python setup.py --version) && git push --tags || echo 'Version already released, update your version!'; \
		else \
			echo "CHANGELOG not updated since last release!"; \
			exit 1; \
		fi; \
	else \
		echo "No commits since last release!"; \
	fi
@andrewgross
andrewgross / travis_webhook_checker.py
Last active November 5, 2020 15:10
Django view to check Travis CI webhook signatures. Requires the Django, python-requests, and pyOpenSSL packages.
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import base64
import json
import logging
from urlparse import parse_qs
import requests
import os
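The preview above stops at the imports. A minimal sketch of the verification flow, assuming Travis publishes its webhook public key at the /config API endpoint and sends a base64 Signature header alongside the payload form field (the view and helper names here are illustrative, not the gist's exact code):

import base64
import json

import requests
from django.http import HttpResponse, HttpResponseBadRequest
from django.views.decorators.csrf import csrf_exempt
from OpenSSL.crypto import FILETYPE_PEM, X509, Error as SignatureError, load_publickey, verify

TRAVIS_CONFIG_URL = "https://api.travis-ci.org/config"  # assumed endpoint for the public key


def _travis_public_key():
    # Fetch the PEM public key Travis advertises for signing webhook payloads.
    config = requests.get(TRAVIS_CONFIG_URL, timeout=10).json()
    return config["config"]["notifications"]["webhook"]["public_key"]


@csrf_exempt
def travis_webhook(request):
    signature = base64.b64decode(request.META.get("HTTP_SIGNATURE", ""))
    payload = request.POST.get("payload", "")

    # pyOpenSSL's verify() expects an X509 object, so wrap the bare public key in one.
    certificate = X509()
    certificate.set_pubkey(load_publickey(FILETYPE_PEM, _travis_public_key()))

    try:
        verify(certificate, signature, payload.encode("utf-8"), "sha1")
    except SignatureError:
        return HttpResponseBadRequest("Bad signature")

    build = json.loads(payload)  # act on the verified build notification here
    return HttpResponse("OK")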
@andrewgross
andrewgross / optimize_filesize.py
Created February 4, 2019 20:46
PySpark code to take a dataframe and repartition it into an optimal number of partitions for generating 300 MB-1 GB Parquet files.
import re
import pyspark.sql.types as T
from math import ceil
def repartition_for_writing(df):
    count = df.count()
    sampled_df = get_sampled_df(df, count=count)
    string_column_sizes = get_string_column_sizes(sampled_df)
    num_files = get_num_files(count, df.schema, string_column_sizes)
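The helper functions referenced above are cut off by the preview. A rough sketch of what they could look like, assuming string columns are sampled to estimate their average width, fixed-width types use nominal sizes, and each file targets roughly 512 MB of raw row data (all names and constants here are guesses, not the gist's code):

from math import ceil

import pyspark.sql.functions as F
import pyspark.sql.types as T

TARGET_FILE_SIZE = 512 * 1024 * 1024  # aim between 300 MB and 1 GB per file
SAMPLE_ROWS = 10000
FIXED_TYPE_SIZES = {T.LongType: 8, T.DoubleType: 8, T.TimestampType: 8,
                    T.IntegerType: 4, T.FloatType: 4, T.DateType: 4, T.BooleanType: 1}


def get_sampled_df(df, count, sample_rows=SAMPLE_ROWS):
    # Sample a small fraction of rows so the size estimate stays cheap.
    fraction = min(1.0, float(sample_rows) / max(count, 1))
    return df.sample(False, fraction)


def get_string_column_sizes(sampled_df):
    # Average observed length of each string column in the sample.
    string_cols = [f.name for f in sampled_df.schema.fields
                   if isinstance(f.dataType, T.StringType)]
    if not string_cols:
        return {}
    averages = sampled_df.select([F.avg(F.length(F.col(c))).alias(c) for c in string_cols]).collect()[0]
    return {c: (averages[c] or 0.0) for c in string_cols}


def get_num_files(count, schema, string_column_sizes):
    # Estimate bytes per row (ignoring Parquet encoding and compression),
    # then divide the estimated total by the target file size.
    row_size = sum(string_column_sizes.get(f.name, FIXED_TYPE_SIZES.get(type(f.dataType), 8))
                   for f in schema.fields)
    return max(1, int(ceil(row_size * count / float(TARGET_FILE_SIZE))))

The final step would presumably call df.repartition(num_files) before writing the Parquet output.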
@andrewgross
andrewgross / s3_inventory.py
Created January 27, 2019 19:45
This script helps set permissions properly so that you can read S3 Inventory data from an account that does not own the bucket. It assumes you have a ROLE_ARN that can assume a role in the main account with read permissions and read/write Object ACL permissions on your S3 bucket.
import datetime
import json
BUCKET_NAME = "<s3_bucket_name>"
INVENTORY_PREFIX = "<prefix_given_to_s3_inventory>" # Should have data/, hive/, and some dated folders inside it
ACCOUNT_CUID = "<your_canonical_user_id_for_cross_account>" # The account that does not own the S3 bucket but is trying to access it; controls ROLE_ARN
ROLE_ARN = "<role_in_cross_account_that_can_assume_to_main_account>"
def role_arn_to_session(role_arn):
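The preview cuts off at role_arn_to_session; one common way to implement it with boto3's STS client (a sketch, not necessarily the gist's version):

import boto3


def role_arn_to_session(role_arn, session_name="s3-inventory-acl"):
    # Trade the role ARN for temporary credentials and wrap them in a boto3 Session.
    creds = boto3.client("sts").assume_role(
        RoleArn=role_arn, RoleSessionName=session_name)["Credentials"]
    return boto3.session.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )


# Example usage: read the inventory manifests through the assumed role.
# s3 = role_arn_to_session(ROLE_ARN).client("s3")
# s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=INVENTORY_PREFIX + "/hive/")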
@andrewgross
andrewgross / upload_memcache_stats.py
Created August 1, 2012 16:02
Python Script to Upload Memcache Metrics to CloudWatch
import sys, time, subprocess, socket, telnetlib
from datetime import datetime
from collections import defaultdict
from boto.ec2.cloudwatch import CloudWatchConnection
MAPPINGS = {
    # Memcached name: (AWS Name, AWS Metric Type, Calculation Method)
    'uptime': ('Uptime', 'Count', 'gauge'),
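The rest of the script is cut off. A sketch of the overall shape — pull the stats over the memcached text protocol, then push the mapped counters with boto's put_metric_data — with the namespace and function names assumed rather than taken from the gist (it also skips the gauge-versus-counter handling hinted at by the mapping's third field):

import telnetlib

from boto.ec2.cloudwatch import CloudWatchConnection

NAMESPACE = "Memcached"  # assumed CloudWatch namespace


def get_memcache_stats(host="127.0.0.1", port=11211):
    # Issue the text-protocol "stats" command and parse the "STAT <name> <value>" lines.
    conn = telnetlib.Telnet(host, port)
    conn.write(b"stats\n")
    raw = conn.read_until(b"END").decode("ascii")
    conn.close()
    return {parts[1]: parts[2]
            for parts in (line.split() for line in raw.splitlines())
            if len(parts) == 3 and parts[0] == "STAT"}


def push_to_cloudwatch(stats, instance_id):
    cw = CloudWatchConnection()
    for memcached_name, (aws_name, aws_unit, _method) in MAPPINGS.items():
        if memcached_name in stats:
            cw.put_metric_data(namespace=NAMESPACE, name=aws_name,
                               value=float(stats[memcached_name]), unit=aws_unit,
                               dimensions={"InstanceId": instance_id})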
@andrewgross
andrewgross / functional.sh
Created August 20, 2012 23:11
Semi-Functional Programming in Bash
my_name_function() {
    # There aren't many use cases where we would really want to do things this
    # way instead of just using a global, unless we are writing libraries in bash
    # (the horror) or have an extremely large script where we are not sure we
    # won't be clobbering variable names (equally terrifying).
    local __assign_my_results_to_this_variable=$1
    local do_some_work=$(echo $ALL_MY_COMMANDS_NAMES | grep -v "bad commands")
    # Assumed completion: hand the result back through the caller-supplied variable name.
    eval $__assign_my_results_to_this_variable="'$do_some_work'"
}
@andrewgross
andrewgross / dynamic_partition.py
Last active May 25, 2018 18:35
PySpark scripts to predict the number of partitions needed to get good output file sizes (100-300 MB for Parquet). Also includes a helper function to determine your average byte array size.
def get_files_per_partition(df, partition_key, file_type="parquet", compression="snappy", byte_array_size=256):
    rows = df.count()
    print("Dataset has {} rows".format(rows))
    schema = df.schema
    num_partitions = 1
    if partition_key is not None:
        num_partitions = df.select([partition_key]).distinct().count()
        print("Dataset has {} distinct partition keys".format(num_partitions))
        _df = df.drop(partition_key)
        schema = _df.schema
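The byte-array helper mentioned in the description is not shown in the preview. One way to approximate it, assuming the average length of the string (byte array) columns in a sample is what feeds the byte_array_size argument above (names are illustrative):

import pyspark.sql.functions as F
import pyspark.sql.types as T


def get_average_byte_array_size(df, sample_fraction=0.01):
    # Average length across all string columns in a sample of the dataframe.
    string_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, T.StringType)]
    if not string_cols:
        return 0.0
    sample = df.sample(False, sample_fraction)
    averages = sample.select([F.avg(F.length(F.col(c))).alias(c) for c in string_cols]).collect()[0]
    sizes = [averages[c] for c in string_cols if averages[c] is not None]
    return sum(sizes) / len(sizes) if sizes else 0.0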
Hey,
This is going to be a bit of an esoteric ticket. I noticed some strange behavior recently when comparing Spectrum and Redshift results on the same dataset.
Redshift Data: fare.txt
Parquet Data: fare.parquet
The Parquet data was generated from fare.txt with PySpark, using convert.py on Spark 2.2.0.
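convert.py is not included here; a minimal sketch of the kind of conversion described, assuming fare.txt is delimited text with a header row (adjust the separator and schema to match the real file):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("convert").getOrCreate()

df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("fare.txt"))

df.write.mode("overwrite").parquet("fare.parquet")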
Redshift Table Schema:

Keybase proof

I hereby claim:

  • I am andrewgross on github.
  • I am andrewwgross (https://keybase.io/andrewwgross) on keybase.
  • I have a public key ASDsj8ie3y_QBUpm4aBzm-ty7Hr9w_Y5PtWIcLZfQlt9JQo

To claim this, I am signing this object: