Graham Thomson graham-thomson

## .sql
create external table audience_sessions (
 machine_id string,
 domain_name string,
 url_hosts string,
 url_dirs string,
 url_pages string,
 patterns string,
 actions int,
 start_time bigint,
 end_time bigint,

## remove_non_ascii.py
remove_non_ascii = lambda string: "".join(letter for letter in string if ord(letter) < 128)

## update_dat_daily_pivot_sales.sql
--Create a temp table with a sample of recent daily pivot date (Note: TEMP TABLEs only exist for the duration of your connection to the database.)--
CREATE TEMP TABLE sample_daily AS(SELECT * FROM adobe_data.dat_daily_pivot WHERE sample_start_date <= '2016-03-30' ORDER BY sample_start_date DESC LIMIT 100);

--Check initial total_sales sum--
SELECT SUM(total_sales) FROM sample_daily;
-->>1800.2825

--Update <table_name> SET <col_name> = <what you want to update the column to> WHERE <where clause>--
UPDATE sample_daily SET total_sales = total_sales * 4 WHERE sample_start_date <= '2016-03-30';

## simple_psycopg2_connect.py
import psycopg2 as pg
# import pymysql

query = """
SELECT
report_campaign_id,
SUM(impressions)
FROM
adobe_data.dat_daily_pivot
WHERE

## create_doubleclick_matchtable_ddl.py
import bs4
import requests

soup = bs4.BeautifulSoup(requests.get("https://support.google.com/dcm/partner/answer/6005910?hl=en").content, "html.parser")
table_names = []

for a in soup.find_all("a"):
    try:
        if a.get('class')[0] == 'zippy':
            table_names.append(a.get('id'))

## date_list.py
import datetime as dt


def list_of_dates(start_date, end_date):
    """
    Function returns a list of string dates in YYYY-MM-DD format between and including start and end date.
    :param start_date: Start date string in YYYY-MM-DD format
    :param end_date: End date string in YYYY-MM-DD format
    :return: List of date strings
    """

## remove_dense_vector_element.py
from pyspark.ml.linalg import Vectors

def delete_index_from_dv(dense_vector, index):
  """
  Takes PySpark dense vector, removes element at index.
  :dense_vector: PySpark dense vector to operate on.
  :index: Index of element to delete.
  :return: Dense vector with element at index removed.
  """
  temp_list = list(dense_vector)

## make_sure_digits.py
def make_sure_digits(string):
    import re
    regex = u"(^\d+$)"
    match = re.match(regex, string)
    if match:
        return match.group()
    return ""

def make_sure_int(string):
    try:

## raid_ebs_aws.sh
sudo mdadm --create --verbose /dev/md0 --level=0 --name=MY_RAID --raid-devices=2 /dev/xvdb /dev/xvdc

sudo cat /proc/mdstat
sudo mdadm --detail /dev/md0

sudo mkfs.ext4 -L MY_RAID /dev/md0
sudo mkdir -p /data/raid
sudo mount LABEL=MY_RAID /data/raid

## get_top_alexa_sites.py
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse

def get_top_alexa_sites(category):
    category = category.title()
    categories = ['Adult','Arts','Business','Computers','Games','Health','Home',
                  'Kids and Teens','News','Recreation','Reference','Regional','Science',
                  'Shopping','Society','Sports','World']
    assert(category in categories), "Category {} not in category list: {}".format(category, ", ".join(categories))
	create external table audience_sessions (
	machine_id string,
	domain_name string,
	url_hosts string,
	url_dirs string,
	url_pages string,
	patterns string,
	actions int,
	start_time bigint,
	end_time bigint,
	--Create a temp table with a sample of recent daily pivot date (Note: TEMP TABLEs only exist for the duration of your connection to the database.)--
	CREATE TEMP TABLE sample_daily AS(SELECT * FROM adobe_data.dat_daily_pivot WHERE sample_start_date <= '2016-03-30' ORDER BY sample_start_date DESC LIMIT 100);

	--Check initial total_sales sum--
	SELECT SUM(total_sales) FROM sample_daily;
	-->>1800.2825

	--Update <table_name> SET <col_name> = <what you want to update the column to> WHERE <where clause>--
	UPDATE sample_daily SET total_sales = total_sales * 4 WHERE sample_start_date <= '2016-03-30';
	import psycopg2 as pg
	# import pymysql

	query = """
	SELECT
	report_campaign_id,
	SUM(impressions)
	FROM
	adobe_data.dat_daily_pivot
	WHERE
	import bs4
	import requests

	soup = bs4.BeautifulSoup(requests.get("https://support.google.com/dcm/partner/answer/6005910?hl=en").content, "html.parser")
	table_names = []

	for a in soup.find_all("a"):
	try:
	if a.get('class')[0] == 'zippy':
	table_names.append(a.get('id'))
	import datetime as dt


	def list_of_dates(start_date, end_date):
	"""
	Function returns a list of string dates in YYYY-MM-DD format between and including start and end date.
	:param start_date: Start date string in YYYY-MM-DD format
	:param end_date: End date string in YYYY-MM-DD format
	:return: List of date strings
	"""
	from pyspark.ml.linalg import Vectors

	def delete_index_from_dv(dense_vector, index):
	"""
	Takes PySpark dense vector, removes element at index.
	:dense_vector: PySpark dense vector to operate on.
	:index: Index of element to delete.
	:return: Dense vector with element at index removed.
	"""
	temp_list = list(dense_vector)
	def make_sure_digits(string):
	import re
	regex = u"(^\d+$)"
	match = re.match(regex, string)
	if match:
	return match.group()
	return ""

	def make_sure_int(string):
	try:
	sudo mdadm --create --verbose /dev/md0 --level=0 --name=MY_RAID --raid-devices=2 /dev/xvdb /dev/xvdc

	sudo cat /proc/mdstat
	sudo mdadm --detail /dev/md0

	sudo mkfs.ext4 -L MY_RAID /dev/md0
	sudo mkdir -p /data/raid
	sudo mount LABEL=MY_RAID /data/raid
	import requests
	from bs4 import BeautifulSoup
	from urlparse import urlparse

	def get_top_alexa_sites(category):
	category = category.title()
	categories = ['Adult','Arts','Business','Computers','Games','Health','Home',
	'Kids and Teens','News','Recreation','Reference','Regional','Science',
	'Shopping','Society','Sports','World']
	assert(category in categories), "Category {} not in category list: {}".format(category, ", ".join(categories))