Skip to content

Instantly share code, notes, and snippets.

@agaszmurlo
agaszmurlo / pileup.sh
Last active July 30, 2020 11:00
pileup test procedure
# Pileup test procedure: generate two samtools pileups from the same BAM and
# reference, then produce a copy of the second one with escaped quotes.
ref=../../reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta
bam=NA12878.multichrom.md.bam

# Run 1: written via -o. Per the samtools mpileup manual, -A keeps anomalous
# read pairs and -q 0 / -Q 0 set the minimum mapping/base quality to zero.
samtools mpileup \
  --fasta-ref "$ref" \
  -A -q 0 -Q 0 \
  "$bam" \
  -o samtools.pileup2

# Run 2: same filters plus -B (no BAQ) and -x (no overlap detection), captured
# from stdout instead of -o.
samtools mpileup \
  --fasta-ref "$ref" \
  -B -x -A -q 0 -Q 0 \
  "$bam" \
  > samtools_x.pileup

# Put a backslash in front of every double quote that directly follows a TAB.
# $'...' is used so that \t becomes a real TAB before sed sees the script.
sed $'s/\t"/\t\\\\"/g' samtools_x.pileup > samtools_x_esc.pileup
@agaszmurlo
agaszmurlo / haplotype_spark.sh
Last active December 5, 2019 23:44
haplotype_caller spark
# Exercise directory on the host; it is both the working directory and the
# volume mounted into the Picard container as /data.
data_dir=/data/samples/CORRIELL/mbi_cwiczenie3
cd "$data_dir"

### create sequence dictionary
# Picard runs in a throwaway container (--rm); input chr1.fa and output
# chr1.dict are addressed through the /data mount point.
docker run --rm -it \
  -v "${data_dir}:/data" \
  broadinstitute/picard \
  CreateSequenceDictionary \
  R=/data/chr1.fa \
  O=/data/chr1.dict
@agaszmurlo
agaszmurlo / seqtender_alignment.sh
Last active December 5, 2019 23:45
seqtender alignment setup
# Select the Spark 2.4.3 installation for the commands that follow.
SPARK_HOME=/data/local/opt/spark-2.4.3-bin-hadoop2.7
export SPARK_HOME

# Remove the cached bdg-seqtender resolution data and the 0.2-SNAPSHOT jar from
# the local Ivy repository (presumably so the snapshot is fetched again).
ivy_repo=/data/local/cache/ivy2/repository
rm -rf "${ivy_repo}/cache/org.biodatageeks/bdg-seqtender_2.11/"
rm "${ivy_repo}/jars/org.biodatageeks_bdg-seqtender_2.11-0.2-SNAPSHOT.jar"
## master local, defaultFS = HDFS
# Starts spark-shell in verbose mode (-v) on a local master with a 2g driver
# and the in-memory SQL catalog implementation.
# NOTE(review): the last line below ends with a continuation backslash, so the
# command appears to be cut off here — confirm the remaining arguments before
# running it.
./bin/spark-shell -v \
--master local \
--driver-memory 2g \
--conf "spark.sql.catalogImplementation=in-memory" \
@agaszmurlo
agaszmurlo / nexus_push
Last active February 10, 2020 12:32
push to nexus
## push to nexus
# The snapshot version appears in the jar name and twice in the repository
# path, so it is spelled out once here. 'user:pass' is passed to curl verbatim.
oap_version=1.0.0-spark-2.4.3-SNAPSHOT
oap_jar="oap-${oap_version}.jar"
curl -v --user 'user:pass' --upload-file "$oap_jar" \
  "http://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/org/intel/bigdata/oap/${oap_version}/${oap_jar}"

## sbt without tests in assembly
# Overrides `test in assembly` with an empty block, then runs clean + assembly.
sbt 'set test in assembly := {}' clean assembly
# Select the Spark 2.4.3 installation and switch into it.
# There must be no whitespace after '=': with a space, SPARK_HOME is exported
# empty and the path is rejected by `export` as an invalid identifier.
export SPARK_HOME=/data/local/opt/spark-2.4.3-bin-hadoop2.7
cd "$SPARK_HOME"
## scala v 2.11 (!!)
# Starts spark-shell on YARN with 20 executors (2g each, 2g driver), loading
# the bdg-sequila-acc assembly jar from /tmp and setting the SQL extensions,
# YARN timeline-service and Hive metastore URI options shown below.
# NOTE(review): the last line below ends with a continuation backslash, so the
# command appears to be cut off here — confirm the remaining arguments before
# running it.
./bin/spark-shell -v --master yarn-client --num-executors 20 --driver-memory 2g --executor-memory 2g \
--jars /tmp/bdg-sequila-acc_2.11-0.1-spark-2.4.3-SNAPSHOT-assembly.jar \
--conf spark.sql.extensions=org.biodatageeks.sequila.spark.BdgExtensions \
--conf spark.hadoop.yarn.timeline-service.enabled=false \
--conf spark.hadoop.hive.metastore.uris=thrift://cdh01.cl.ii.pw.edu.pl:9083 \
@agaszmurlo
agaszmurlo / oap.scala
Last active September 25, 2019 09:25
oap varia
# Launch spark-shell on YARN in client mode from the Spark 2.4.3 directory.
# SPARK_HOME is cleared first so a value left over from another session is not
# picked up.
unset SPARK_HOME
cd /data/local/opt/spark-2.4.3-bin-hadoop2.7
# `--master yarn --deploy-mode client` replaces the deprecated `yarn-client`
# master value; the timeline-service option is passed once instead of twice.
./bin/spark-shell -v --master yarn --deploy-mode client --num-executors 20 --driver-memory 2g --executor-memory 2g \
  --conf spark.hadoop.hive.metastore.uris=thrift://cdh01.cl.ii.pw.edu.pl:9083 \
  --conf spark.hadoop.yarn.timeline-service.enabled=false \
  --conf spark.driver.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \
  --conf spark.yarn.am.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \
  --conf spark.hadoop.metastore.catalog.default=hive
@agaszmurlo
agaszmurlo / carbon.scala
Last active August 2, 2019 13:39
Carbon data varia
# Launch spark-shell on YARN in client mode with the CarbonData assembly jar
# from /tmp on the classpath (1G driver, 2G executors with 2 cores each).
# `--master yarn --deploy-mode client` replaces the deprecated `yarn-client`
# master value.
./spark-shell -v --master yarn --deploy-mode client --driver-memory 1G --executor-memory 2G --executor-cores 2 \
  --jars /tmp/apache-carbondata-1.6.0-SNAPSHOT-bin-spark2.3.2-hadoop2.7.2.jar \
  --conf spark.hadoop.hive.metastore.uris=thrift://cdh01.cl.ii.pw.edu.pl:9083 \
  --conf spark.hadoop.yarn.timeline-service.enabled=false \
  --conf spark.driver.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \
  --conf spark.yarn.am.extraJavaOptions=-Dhdp.version=3.1.0.0-78 \
  --conf spark.hadoop.metastore.catalog.default=hive
import org.apache.spark.sql.SparkSession
# Fetch the SeQuiLa image, then run its depthOfCoverage command on
# /data/NA12878.slice.bam (host directory mounted at /data), writing
# block-format output to /data/NA12878.cov.bed.
docker pull biodatageeks/bdg-sequila:0.5.5-spark-2.4.2-SNAPSHOT
# USERID/GROUPID hand the caller's uid and gid to the container (presumably so
# output files are owned by the caller). The expansions are quoted so they
# always reach docker as single words.
docker run -it --rm \
  -e USERID="$UID" -e GROUPID="$(id -g)" \
  -v /Users/aga/workplace/data/slice:/data \
  biodatageeks/bdg-sequila:0.5.5-spark-2.4.2-SNAPSHOT \
  depthOfCoverage \
  --master=local --driver-memory=8g \
  -- \
  --reads /data/NA12878.slice.bam --format blocks -o /data/NA12878.cov.bed
# Start spark-shell on YARN (client deploy mode) from the Spark 2.4.0 bin
# directory, resolving bdg-sequila from the two Nexus repositories below into
# the shared Ivy directory.
cd /data/local/opt/spark-2.4.0-bin-hadoop2.7/bin
./spark-shell -v \
  --master=yarn \
  --deploy-mode=client \
  --num-executors=60 \
  --executor-memory=4g \
  --driver-memory=12g \
  --conf spark.sql.catalogImplementation=in-memory \
  --conf spark.jars.ivy=/data/local/cache/ivy2/repository \
  --conf spark.hadoop.yarn.timeline-service.enabled=false \
  --repositories http://zsibio.ii.pw.edu.pl/nexus/repository/maven-releases/,http://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ \
  --packages org.biodatageeks:bdg-sequila_2.11:0.5.5-spark-2.4.2-SNAPSHOT
// Set the SparkContext log level to WARN.
sc.setLogLevel("WARN")
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}
// Create a SequilaSession from `spark` and pass it to SequilaRegister.register.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
@agaszmurlo
agaszmurlo / sequila_sample.scala
Last active May 6, 2019 11:22
sequila sample script
////// RUN
// map volumes according to your data directory
docker run -it --rm \
-e USERID=$UID -e GROUPID=$(id -g) \
-v /Users/aga/workplace/data/slice/:/data \
biodatageeks/bdg-sequila:0.5.5-spark-2.4.2-SNAPSHOT \
spark-shell --driver-memory=4g \
--jars /tmp/bdg-toolset/bdg-sequila-assembly-0.5.5-spark-2.4.2-SNAPSHOT.jar \
--conf spark.sql.warehouse.dir=/home/bdgeek/spark-warehouse