Skip to content

Instantly share code, notes, and snippets.

@agaszmurlo
Last active September 25, 2019 09:25
Show Gist options
  • Save agaszmurlo/995ba53a9fd874495d3635a88db0475f to your computer and use it in GitHub Desktop.
Save agaszmurlo/995ba53a9fd874495d3635a88db0475f to your computer and use it in GitHub Desktop.
oap varia
unset SPARK_HOME
cd /data/local/opt/spark-2.4.3-bin-hadoop2.7
./bin/spark-shell -v --master yarn-client --num-executors 20 --driver-memory 2g --executor-memory 2g \
--conf spark.hadoop.yarn.timeline-service.enabled=false \
--conf spark.hadoop.hive.metastore.uris=thrift://cdh01.cl.ii.pw.edu.pl:9083 \
--conf spark.hadoop.yarn.timeline-service.enabled=false \
--conf spark.driver.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \
--conf spark.yarn.am.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \
--conf spark.hadoop.metastore.catalog.default=hive
sql ("use igap_dev")
sql(
s"""
| CREATE TABLE etl_genotypes_ac_oap (
| `project` string,
| `chr` string,
| `pos` int,
| `posend` int,
| `ref` string,
| `alt` string,
| `gt` string,
| `ad` int,
| `dp` int,
| `gq` int,
| `pl` int,
| `sample_id` string,
| `al_cnt` int)
| USING oap
| OPTIONS (path 'hdfs:///oap-data-dir/')
""".stripMargin)
sql ("insert overwrite table etl_genotypes_ac_oap select * from etg_genotypes_ac_ext")
sql(
s"""
| CREATE TEMPORARY TABLE etl_genotypes_ac_oap_af (
| `chr` string,
| `pos` int,
| `posend` int,
| `ref` string,
| `alt` string,
| `af` double)
| USING oap
| OPTIONS (path 'hdfs:///oap-data-dir/af')
""".stripMargin)
sql (
s"""
|INSERT OVERWRITE TABLE etl_genotypes_ac_oap_af
| SELECT chr, pos, posend, ref, alt, sum(al_cnt)/(SELECT count (distinct sample_id) from etg_genotypes_ac_ext) as af
| FROM etg_genotypes_ac_ext
| GROUP BY chr, pos, posend, ref, alt
""".stripMargin)
// sql(
// s"""
// | SELECT chr, pos, posend, ref, alt, sum(al_cnt)/(SELECT count (distinct sample_id) from etg_genotypes_ac_ext) as af
// | FROM etg_genotypes_ac_ext
// | GROUP BY chr, pos, posend, ref, alt
// """)
sql("select * from etl_genotypes_ac_oap_af").show
sql("create oindex index1 on etl_genotypes_ac_oap_af(chr)")
sql("show oindex from etl_genotypes_ac_oap_af")
spark.time(sql ("select sum(pos) from etl_genotypes_ac_oap_af where chr='1' OR chr='5'").show)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment