I hereby claim:
- I am andrewgross on github.
- I am andrewwgross (https://keybase.io/andrewwgross) on keybase.
- I have a public key ASDsj8ie3y_QBUpm4aBzm-ty7Hr9w_Y5PtWIcLZfQlt9JQo
To claim this, I am signing this object:
# Tag a new release only when there are commits since the last tag
# and the CHANGELOG has been updated since that tag.
tag:
	@if [ $$(git rev-list $$(git describe --abbrev=0 --tags)..HEAD --count) -gt 0 ]; then \
		if [ $$(git log -n 1 --oneline $$(git describe --abbrev=0 --tags)..HEAD CHANGELOG.md | wc -l) -gt 0 ]; then \
			git tag $$(python setup.py --version) && git push --tags || echo 'Version already released, update your version!'; \
		else \
			echo "CHANGELOG not updated since last release!"; \
			exit 1; \
		fi; \
	else \
		echo "No commits since last release!"; \
	fi
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import base64
import json
import logging
from urlparse import parse_qs

import requests
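Taken together, these imports suggest a handler that base64-decodes a payload and parses form-encoded fields. A minimal sketch under that assumption (decode_payload and the payload shape are guesses, not the original code):

def decode_payload(raw):
    # Assumption: raw is a base64-encoded, form-urlencoded request body
    decoded = base64.b64decode(raw)
    fields = parse_qs(decoded)
    logging.info(json.dumps({k: v[0] for k, v in fields.items()}))
    return fields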
import os
from urllib.parse import urlparse
from pyspark.sql.functions import desc, asc
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    LongType,
)
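A quick illustration of how these type imports are typically combined into a schema (the field names are placeholders, not from the original snippet):

SCHEMA = StructType([
    StructField("url", StringType(), True),
    StructField("size_bytes", LongType(), True),
])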
import re
import pyspark.sql.types as T
from math import ceil


def repartition_for_writing(df):
    count = df.count()
    sampled_df = get_sampled_df(df, count=count)
    string_column_sizes = get_string_column_sizes(sampled_df)
    num_files = get_num_files(count, df.schema, string_column_sizes)
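The helper functions and the tail of this function are not shown in the excerpt; presumably it finishes by repartitioning to the computed file count before the write, along the lines of:

    return df.repartition(num_files)  # assumed final step, not in the excerpt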
import datetime
import json

BUCKET_NAME = "<s3_bucket_name>"
INVENTORY_PREFIX = "<prefix_given_to_s3_inventory>"  # Should contain data/, hive/, and some dated folders
ACCOUNT_CUID = "<your_canonical_user_id_for_cross_account>"  # The account that does not own the S3 bucket but needs access to it; determines ROLE_ARN
ROLE_ARN = "<role_in_cross_account_that_can_assume_to_main_account>"


def role_arn_to_session(role_arn):
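The body is cut off in this excerpt; a minimal sketch of how such a helper is commonly implemented with boto3's STS client (the session name is arbitrary, and this is an assumption, not the original body):

    import boto3  # assumed dependency; not visible in the excerpt

    client = boto3.client("sts")
    response = client.assume_role(RoleArn=role_arn, RoleSessionName="inventory-reader")
    creds = response["Credentials"]
    return boto3.session.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )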
import sys, time, subprocess, socket, telnetlib
from datetime import datetime
from collections import defaultdict

from boto.ec2.cloudwatch import CloudWatchConnection

MAPPINGS = {
    # Memcached stat name: (AWS Name, AWS Metric Type, Calculation Method)
    'uptime': ('Uptime', 'Count', 'gauge'),
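The mapping table is truncated here. A sketch of how an entry might be consumed, assuming a stats dict already parsed from memcached's `stats` output (push_metric is a hypothetical helper, not part of the original script):

def push_metric(conn, stat_name, stats, namespace="Memcached"):
    aws_name, unit, method = MAPPINGS[stat_name]
    value = float(stats[stat_name])
    if method == 'gauge':
        # Gauges are reported as-is; counters would need a delta calculation
        conn.put_metric_data(namespace, aws_name, value=value, unit=unit)

conn = CloudWatchConnection()
push_metric(conn, 'uptime', {'uptime': '12345'})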
my_name_function() {
    # There aren't many use cases where we would really want to do things this
    # way instead of just using a global, unless we are writing libraries in
    # bash (the horror) or have an extremely large script where we are not sure
    # we won't be clobbering variable names (equally terrifying).
    local __assign_my_results_to_this_variable=$1
    local do_some_work=$(echo $ALL_MY_COMMANDS_NAMES | grep -v "bad commands")
    # Write the result into the caller-supplied variable name; this closing
    # assignment is the standard pattern implied by the variable name above
    eval $__assign_my_results_to_this_variable="'$do_some_work'"
}
def get_files_per_partition(df, partition_key, file_type="parquet", compression="snappy", byte_array_size=256):
    rows = df.count()
    print("Dataset has {} rows".format(rows))
    schema = df.schema
    num_partitions = 1
    if partition_key is not None:
        num_partitions = df.select([partition_key]).distinct().count()
        print("Dataset has {} distinct partition keys".format(num_partitions))
        _df = df.drop(partition_key)
        schema = _df.schema
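The excerpt cuts off before the actual file-count math. One plausible continuation, assuming a fixed target file size (TARGET_FILE_SIZE and estimate_row_size are illustrative, not the original gist's values):

from math import ceil

TARGET_FILE_SIZE = 512 * 1024 * 1024  # assumed 512 MB of raw data per file

def estimate_row_size(schema, byte_array_size):
    # Rough per-row byte count: byte_array_size for strings, 8 bytes otherwise
    return sum(
        byte_array_size if field.dataType.typeName() == "string" else 8
        for field in schema.fields
    )

def files_per_partition(rows, num_partitions, schema, byte_array_size):
    rows_per_file = max(1, TARGET_FILE_SIZE // estimate_row_size(schema, byte_array_size))
    return int(ceil(float(rows) / num_partitions / rows_per_file))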
Hey,

This is going to be a bit of an esoteric ticket. I noticed some strange behavior recently when comparing Spectrum and Redshift results on the same dataset.

Redshift Data: fare.txt
Parquet Data: fare.parquet

The parquet data was generated from fare.txt with PySpark, using convert.py on Spark 2.2.0.

Redshift Table Schema: