Skip to content

Instantly share code, notes, and snippets.

View qi-qi's full-sized avatar

Qi Qi qi-qi

View GitHub Profile
@qi-qi
qi-qi / gist:46598454fdbf6acac0bd525be7f5389a
Created June 7, 2018 09:36
Remove all local git branches but keep master
git branch | grep -v "master" | xargs git branch -D
@qi-qi
qi-qi / bash.sh
Created February 13, 2019 12:50
Handy bash
sort -t , -k 2 -g data.file
=============================
-t separator
-k key/column
-g general numeric sort (sort string treated as number)
sed 's/\"//g' fe88422a-e3b5-4645-9d66-c15f7e1a01b6.csv > file_new.txt
=============================
remove " (double quotes)
@qi-qi
qi-qi / Main.java
Last active February 15, 2019 08:20
java11 and spark2.4
// java 11 single-file source-code spark
// java -cp ".:/Users/qi/Desktop/spark-2.4.0-bin-without-hadoop/jars/*:/Users/qi/Desktop/hadoop-3.2.0/share/etc/hadoop/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/common/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/common/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/hdfs/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/hdfs/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/hdfs/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/yarn/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/yarn/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/mapreduce/lib/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/mapreduce/*:/Users/qi/Desktop/hadoop-3.2.0/share/hadoop/tools/lib/*" Main.java
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.types.DataTypes;
@qi-qi
qi-qi / pom.xml
Created February 15, 2019 08:22
maven - java 11 and spark 2.4
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.qi</groupId>
<artifactId>testSparkJava11</artifactId>
<version>1.0</version>
@qi-qi
qi-qi / athena-double-unnest.sql
Created February 17, 2019 22:27
athena double unnest
with dataset as (
select request_id, ad.t, ad.i, ad.p, ad.s, ad.e, ad.v, ad.impurls
from data_raw.batch, unnest(data_raw.batch.ads) as t(ad)
where dt = '2019-02-16' and h = '00' and request_id = '5F4_P7KSUFDdegHlGoFGcfBb2eGSH9NQpW3kuwoW7EFYt0GApaivCQ==')
select request_id, t, i, p, s, e, v, impurl
from dataset, unnest(impurls) as t(impurl)
@qi-qi
qi-qi / athena-double-unnest.sql
Last active February 17, 2019 22:59
athena double unnest
with dataset as (
select request_id, ad.t, ad.i, ad.p, ad.s, ad.e, ad.v, ad.impurls
from data_raw.batch, unnest(data_raw.batch.ads) as t(ad)
where dt = '2019-02-16' and h = '00' and request_id = '5F4_P7KSUFDdegHlGoFGcfBb2eGSH9NQpW3kuwoW7EFYt0GApaivCQ==')
select request_id, t, i, p, s, e, v, impurl
from dataset, unnest(impurls) as t(impurl)
---s3 bucket ---
s3://acast-data-raw-requests/batch/dt=2019-02-16/h=00/requests-batch-all-18-2019-02-16-00-55-00-01872048-c277-4180-b5f6-0dd5358e3476.gz
@qi-qi
qi-qi / empty-impression-urls.sql
Created February 19, 2019 11:00
empty impression urls
with dataset as (
select ad.i, is_rt, ad.impurls, dt
from data_raw.batch, unnest(data_raw.batch.ads) as t(ad)
where dt >= '2019-02-10')
select dt, i as impression, is_rt, count(*) as counting
from dataset where cardinality(impurls) = 0 group by dt, i, is_rt order by dt, count(*) desc, i
@qi-qi
qi-qi / join_stats_with_cdn_bytes_served.sql
Created March 4, 2019 10:16
join stats with cdn bytes served
SELECT t1.event_date,
t1.ip,
t1.ua_source,
t1.best_effort_user_id,
t1.episode_url,
t1.method,
t1.status,
t1.range_req,
t1.content_length,
t2.bytes,
@qi-qi
qi-qi / athena-join-parquet.sql
Last active March 8, 2019 10:43
athena-join-parquet.sql
CREATE table test.stats_20190210_20190306 WITH (
bucketed_by = ARRAY['event_ts'],
bucket_count = 10,
format = 'orc',
external_location = 's3://acast-data-dev/stats_20190210_20190306/') AS
with t2 as (select request_id, bytes from test.cf_stitch)
SELECT cast(from_iso8601_timestamp(t1.event_date) AS timestamp) AS event_ts,
t1.best_effort_user_id,
t1.ip,
@qi-qi
qi-qi / emr-5.21
Last active March 8, 2019 13:55
emr-5.21
[hadoop@ip-192-168-47-191 ~]$ sudo find / -iname "*amzn*.jar"
/mnt/var/lib/hadoop-httpfs/tomcat-deployment/webapps/webhdfs/WEB-INF/lib/hadoop-hdfs-client-2.8.5-amzn-1.jar
/mnt/var/lib/hadoop-httpfs/tomcat-deployment/webapps/webhdfs/WEB-INF/lib/hadoop-auth-2.8.5-amzn-1.jar
/mnt/var/lib/hadoop-httpfs/tomcat-deployment/webapps/webhdfs/WEB-INF/lib/hadoop-hdfs-2.8.5-amzn-1.jar
/mnt/var/lib/hadoop-httpfs/tomcat-deployment/webapps/webhdfs/WEB-INF/lib/hadoop-annotations-2.8.5-amzn-1.jar
/mnt/var/lib/hadoop-httpfs/tomcat-deployment/webapps/webhdfs/WEB-INF/lib/hadoop-common-2.8.5-amzn-1.jar
/usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core-2.8.5-amzn-1.jar
/usr/lib/hadoop-mapreduce/hadoop-rumen-2.8.5-amzn-1.jar
/usr/lib/hadoop-mapreduce/hadoop-distcp-2.8.5-amzn-1.jar
/usr/lib/hadoop-mapreduce/hadoop-azure-2.8.5-amzn-1.jar