Skip to content

Instantly share code, notes, and snippets.

View qi-qi's full-sized avatar

Qi Qi qi-qi

View GitHub Profile
@qi-qi
qi-qi / gist:46598454fdbf6acac0bd525be7f5389a
Created June 7, 2018 09:36
Remove all local git branches but keep master
# Delete every local branch except master.
# grep -v "^\*" drops the current-branch line ("* name") so the literal "*"
# never reaches git (the original errored when run from a non-master branch);
# xargs -r skips running git entirely when nothing is left to delete.
git branch | grep -v "^\*" | grep -v "master" | xargs -r git branch -D
@qi-qi
qi-qi / firehose-s3-prefix.txt
Last active April 17, 2019 18:27
AWS Firehose Prefix
# Set Hive-style partitioned S3 prefixes (dt=YYYY-MM-DD/h=HH) on the 'test' stream;
# failed records land under processing-failed/ with the Firehose error type appended.
# NOTE(review): --current-delivery-stream-version-id must match the live version
# (see describe-delivery-stream below) or the call is rejected.
aws firehose update-destination --delivery-stream-name 'test' --current-delivery-stream-version-id '7' --destination-id 'destinationId-000000000001' --extended-s3-destination-update '{"Prefix":"test_hose/dt=!{timestamp:yyyy'-'MM'-'dd}/h=!{timestamp:HH}/","ErrorOutputPrefix":"test_hose/processing-failed/dt=!{timestamp:yyyy'-'MM'-'dd}/h=!{timestamp:HH}/!{firehose:error-output-type}"}'
# Same partitioning for the 'requests-batch-all' stream — note it uses hh= while the test stream above uses h=.
aws firehose update-destination --delivery-stream-name 'requests-batch-all' --current-delivery-stream-version-id '16' --destination-id 'destinationId-000000000001' --extended-s3-destination-update '{"Prefix":"dt=!{timestamp:yyyy'-'MM'-'dd}/hh=!{timestamp:HH}/","ErrorOutputPrefix":"processing-failed/dt=!{timestamp:yyyy'-'MM'-'dd}/hh=!{timestamp:HH}/!{firehose:error-output-type}"}'
# Inspect the stream config — source of the version id and destination id used above.
aws firehose describe-delivery-stream --delivery-stream-name test
https://docs.aws.amazon.com/firehose/latest/dev/s3-prefixes.html
https://aws.amazon.com/about-aws/whats-new/2019/02/amazon-kinesis-data-firehose-announces-support-for-custom-amazon-s3-prefixes/
@qi-qi
qi-qi / airflow-ec2
Last active May 26, 2021 10:06
Airflow-ec2
# Provision an Ubuntu EC2 host for Apache Airflow 1.10.2.
sudo apt-get update && sudo apt-get -y upgrade
# Build/runtime libraries that Airflow's [all] extras compile against.
sudo apt-get install -y curl build-essential autoconf libtool pkg-config git libxml2-dev libxslt1-dev libkrb5-dev libsasl2-dev libssl-dev libffi-dev locales netcat apt-utils libblas-dev libmysqlclient-dev libpq-dev liblapack-dev supervisor net-tools
sudo apt-get install -y python3.6-dev python3-distutils python3-kerberos python3-jenkins
# Bootstrap pip BEFORE using pip3 — the original ran `pip3 install` first,
# before pip had been installed by get-pip.py.
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
sudo python3 get-pip.py
sudo pip3 install --upgrade apache-airflow[all]==1.10.2
@qi-qi
qi-qi / bash.sh
Created February 13, 2019 12:50
Handy bash
# Sort a comma-separated file numerically by its 2nd column.
#   -t ,   field separator is a comma
#   -k 2   sort key is column 2
#   -g     general numeric sort (sorts strings as numbers)
sort -t , -k 2 -g data.file

# Strip all double quotes from a CSV export.
# (Inside single quotes no backslash is needed before the ";
#  the original's \" relied on GNU sed treating it as a literal quote.)
sed 's/"//g' fe88422a-e3b5-4645-9d66-c15f7e1a01b6.csv > file_new.txt
@qi-qi
qi-qi / Main.java
Last active February 15, 2019 08:20
java11 and spark2.4
// java 11 single-file source-code spark
// java -cp ".:/Users/qi/Desktop/spark-2.4.0-bin-without-hadoop/jars/*:/Users/qi/Desktop/hadoop-3.2.0/share/etc/hadoop/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/common/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/common/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/hdfs/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/hdfs/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/hdfs/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/yarn/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/yarn/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/mapreduce/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/mapreduce/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/tools/lib/*" Main.java
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.types.DataTypes;
@qi-qi
qi-qi / pom.xml
Created February 15, 2019 08:22
maven - java 11 and spark 2.4
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.qi</groupId>
<artifactId>testSparkJava11</artifactId>
<version>1.0</version>
@qi-qi
qi-qi / athena-double-unnest.sql
Created February 17, 2019 22:27
athena double unnest
-- Athena/Presto double UNNEST: first flatten the ads array of one request,
-- then flatten each ad's impression-URL array.
with dataset as (
    select
        request_id,
        ad.t, ad.i, ad.p, ad.s, ad.e, ad.v,
        ad.impurls
    from data_raw.batch
    cross join unnest(data_raw.batch.ads) as t(ad)
    where dt = '2019-02-16'
      and h = '00'
      and request_id = '5F4_P7KSUFDdegHlGoFGcfBb2eGSH9NQpW3kuwoW7EFYt0GApaivCQ=='
)
select
    request_id,
    t, i, p, s, e, v,
    impurl
from dataset
cross join unnest(impurls) as t(impurl)
@qi-qi
qi-qi / athena-double-unnest.sql
Last active February 17, 2019 22:59
athena double unnest
-- Double UNNEST over nested arrays (Athena/Presto):
-- dataset explodes batch.ads into one row per ad for the given request,
-- the outer query explodes each ad's impurls into one row per impression URL.
with dataset as (
    select request_id,
           ad.t,
           ad.i,
           ad.p,
           ad.s,
           ad.e,
           ad.v,
           ad.impurls
    from data_raw.batch
    cross join unnest(data_raw.batch.ads) as t(ad)
    where dt = '2019-02-16'
      and h = '00'
      and request_id = '5F4_P7KSUFDdegHlGoFGcfBb2eGSH9NQpW3kuwoW7EFYt0GApaivCQ=='
)
select request_id, t, i, p, s, e, v, impurl
from dataset
cross join unnest(impurls) as t(impurl)
---s3 bucket ---
s3://acast-data-raw-requests/batch/dt=2019-02-16/h=00/requests-batch-all-18-2019-02-16-00-55-00-01872048-c277-4180-b5f6-0dd5358e3476.gz
@qi-qi
qi-qi / empty-impression-urls.sql
Created February 19, 2019 11:00
empty impression urls
-- Count ads whose impression-URL array is empty, per day / impression id / is_rt.
with exploded as (
    -- one row per ad, from 2019-02-10 onward
    select dt, ad.i, is_rt, ad.impurls
    from data_raw.batch
    cross join unnest(data_raw.batch.ads) as t(ad)
    where dt >= '2019-02-10'
)
select
    dt,
    i as impression,
    is_rt,
    count(*) as counting
from exploded
where cardinality(impurls) = 0
group by dt, i, is_rt
order by dt, count(*) desc, i
@qi-qi
qi-qi / join_stats_with_cdn_bytes_served.sql
Created March 4, 2019 10:16
join stats with cdn bytes served
SELECT t1.event_date,
t1.ip,
t1.ua_source,
t1.best_effort_user_id,
t1.episode_url,
t1.method,
t1.status,
t1.range_req,
t1.content_length,
t2.bytes,