Skip to content

Instantly share code, notes, and snippets.

View graham-thomson's full-sized avatar

Graham Thomson graham-thomson

  • United States
View GitHub Profile
create external table audience_sessions (
machine_id string,
domain_name string,
url_hosts string,
url_dirs string,
url_pages string,
patterns string,
actions int,
start_time bigint,
end_time bigint,
@graham-thomson
graham-thomson / remove_non_ascii.py
Created February 17, 2016 23:25
python lambda function to remove non-ascii characters from a string
remove_non_ascii = lambda string: "".join(letter for letter in string if ord(letter) < 128)
--Create a temp table with a sample of recent daily pivot date (Note: TEMP TABLEs only exist for the duration of your connection to the database.)--
CREATE TEMP TABLE sample_daily AS(SELECT * FROM adobe_data.dat_daily_pivot WHERE sample_start_date <= '2016-03-30' ORDER BY sample_start_date DESC LIMIT 100);
--Check initial total_sales sum--
SELECT SUM(total_sales) FROM sample_daily;
-->>1800.2825
--Update <table_name> SET <col_name> = <what you want to update the column to> WHERE <where clause>--
UPDATE sample_daily SET total_sales = total_sales * 4 WHERE sample_start_date <= '2016-03-30';
import psycopg2 as pg
# import pymysql
query = """
SELECT
report_campaign_id,
SUM(impressions)
FROM
adobe_data.dat_daily_pivot
WHERE
@graham-thomson
graham-thomson / create_doubleclick_matchtable_ddl.py
Created June 29, 2016 13:58
Quick script to create DDL statements for DoubleClick Matchtables based on the html tables in their documentation
import bs4
import requests
soup = bs4.BeautifulSoup(requests.get("https://support.google.com/dcm/partner/answer/6005910?hl=en").content, "html.parser")
table_names = []
for a in soup.find_all("a"):
try:
if a.get('class')[0] == 'zippy':
table_names.append(a.get('id'))
@graham-thomson
graham-thomson / date_list.py
Last active August 4, 2016 21:48
Function returns a list of string dates in YYYY-MM-DD format between and including start and end date.
import datetime as dt
def list_of_dates(start_date, end_date):
"""
Function returns a list of string dates in YYYY-MM-DD format between and including start and end date.
:param start_date: Start date string in YYYY-MM-DD format
:param end_date: End date string in YYYY-MM-DD format
:return: List of date strings
"""
from pyspark.ml.linalg import Vectors
def delete_index_from_dv(dense_vector, index):
"""
Takes PySpark dense vector, removes element at index.
:dense_vector: PySpark dense vector to operate on.
:index: Index of element to delete.
:return: Dense vector with element at index removed.
"""
temp_list = list(dense_vector)
@graham-thomson
graham-thomson / make_sure_digits.py
Created October 6, 2016 18:42
two functions to make sure your string only contains digits [0-9]
def make_sure_digits(string):
import re
regex = u"(^\d+$)"
match = re.match(regex, string)
if match:
return match.group()
return ""
def make_sure_int(string):
try:
@graham-thomson
graham-thomson / raid_ebs_aws.sh
Created October 20, 2016 18:58
raid two ebs volumes on ec2
sudo mdadm --create --verbose /dev/md0 --level=0 --name=MY_RAID --raid-devices=2 /dev/xvdb /dev/xvdc
sudo cat /proc/mdstat
sudo mdadm --detail /dev/md0
sudo mkfs.ext4 -L MY_RAID /dev/md0
sudo mkdir -p /data/raid
sudo mount LABEL=MY_RAID /data/raid
@graham-thomson
graham-thomson / get_top_alexa_sites.py
Created March 7, 2017 15:57
Function to get top Alexa.com domains by category
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
def get_top_alexa_sites(category):
category = category.title()
categories = ['Adult','Arts','Business','Computers','Games','Health','Home',
'Kids and Teens','News','Recreation','Reference','Regional','Science',
'Shopping','Society','Sports','World']
assert(category in categories), "Category {} not in category list: {}".format(category, ", ".join(categories))