Skip to content

Instantly share code, notes, and snippets.

View qi-qi's full-sized avatar

Qi Qi qi-qi

View GitHub Profile
#!/bin/bash
set -euxo pipefail
# https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail
test("hello-rdd") {
// sample input
val df = Seq(
("a", 0, 200),
("a", 1000, 2000),
("a", 150, 160),
("b", 0, 2),
("b", 2, 8),
("b", 5, 15),
("c", 5, 15),
@qi-qi
qi-qi / athena-sql-snippet.sql
Last active June 27, 2019 07:30
Athena SQL Snippet
select dt, count(*), count(distinct batch_id), count(distinct request_id)
from data_extract.requests_batch_parquet
group by dt
order by dt;
select dt, count(*), count(distinct batch_id), count(distinct request_id)
from data_extract.requests_ad_parquet
group by dt
order by dt;
@qi-qi
qi-qi / azure-sql-dw-sql-snippet.sql
Last active May 28, 2019 13:17
Azure SQL Datawarehouse
--admin
SELECT *
FROM sys.dm_pdw_exec_requests
-- WHERE status not in ('Completed','Failed','Cancelled')
-- AND session_id <> session_id()
ORDER BY submit_time DESC;
EXEC sp_addrolemember 'xlargerc', 'dbo';
SELECT DP1.name AS DatabaseRoleName,
# Enable werkzeug `ProxyFix` middleware => for AWS ELB + Cognito Forwarding
enable_proxy_fix = True
import re
import subprocess
import json
name = "qi"
aws = "aws route53domains check-domain-availability --region us-east-1 --domain-name {}"
# raw html text scratched from AWS Route 53 Web Page
text1 = '<optgroup label="Popular"><option value=".com">.com - $12.00</option><option value=".net">.net - $11.00</option><option value=".org">.org - $12.00</option><option value=".com.au">.com.au - $15.00</option><option value=".co.uk">.co.uk - $9.00</option><option value=".io">.io - $39.00</option><option value=".info">.info - $12.00</option><option value=".co">.co - $25.00</option><option value=".de">.de - $9.00</option><option value=".ca">.ca - $13.00</option><option value=".me">.me - $17.00</option><option value=".us">.us - $15.00</option><option value=".eu">.eu - $13.00</option></optgroup>'
text2 = '<optgroup label="Other"><option value=".ac">.ac - $48.00</option><option value=".academy">.academy - $32.00</option><option value=".accountants">.accountants - $94.00</option><option value=".adult">.adult - $100.00</
test("hello") {
val dw = spark.read.option("header", "true").csv("/Users/qi/Desktop/old_dw.csv")
val iab = spark.read.option("header", "true").csv("/Users/qi/Desktop/iab.csv")
val podindex = spark.read.option("header", "true").csv("/Users/qi/Desktop/podindex.csv")
val legacyindex = spark.read.option("header", "true").csv("/Users/qi/Desktop/legacyindex.csv")
val realtimeindex = spark.read.option("header", "true").csv("/Users/qi/Desktop/realtimeindex.csv")
val legacyDiff = (($"legacyindex_counting" - $"old_dw_counting") / $"old_dw_counting").as("legacyindex_diff")
val podindexDiff = (($"podindex_counting" - $"old_dw_counting") / $"old_dw_counting").as("podindex_diff")
val iabDiff = (($"iab_counting" - $"old_dw_counting") / $"old_dw_counting").as("iab_diff")
aws s3 sync s3://acast-data-extract-requests/ad_parquet/dt=2019-03-01/ ./dt=2019-03-01/ --exclude="*/*folder*"
object Job {
private lazy val spark = SparkSession.active
import spark.implicits._
def run(srcPath: String, dstPathBatch: String, dstPathAd: String, runDate: LocalDate) {
val batchDf = loadBatchExtraDates(srcPath, runDate)
.transform(Util.filterByEventTimestamp(runDate))
.transform(Util.removeDuplicates)
.transform(Util.makeBatchId)
SELECT *
FROM "data_transform"."session_parquet" t1
JOIN "data_transform"."index_iab_parquet" t2
ON t1.session_id=t2.session_id
AND t1.dt=t2.dt
AND t1.h = t2.h
WHERE t1.dt='2019-02-02'
AND t2.h=0
limit 100;