/hpc/users/ahujaa01/spark/spark-1.6.2-bin-hadoop2.6/bin/spark-submit \
  --master yarn-client \
  --executor-memory 18g \
  --driver-memory 12g \
  --num-executors 100 \
  --executor-cores 8 \
  --class org.hammerlab.guacamole.Main \
  --conf spark.default.parallelism=10000 \
  --conf spark.yarn.executor.memoryOverhead=3024 \
  --conf spark.shuffle.service.enabled=true
let seq2hla_from_bam ~path ~sample =
  let open Biokepi.KEDSL in
  let open Biokepi.Pipeline.Construct in
  let reference_build = "b37" in
  let work_dir =
    Biokepi.Machine.work_dir Demeter.machine // "mskcc-bladder-rna" in
  let initial_bam =
pyensembl example
In [3]: transcripts = ensembl.transcripts_at_locus('chr12', 125621282)
In [4]: transcript = transcripts[0]
In [5]: transcript.spliced_offset(125621282)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-59d91508f846> in <module>()
----> 1 transcript.spliced_offset(125621282)
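The ValueError above comes from asking for a spliced coordinate at a position that falls outside every exon of the transcript (e.g. an intronic locus). A minimal sketch of guarding the call, assuming pyensembl's EnsemblRelease API; the release number here is an illustrative assumption:

from pyensembl import EnsemblRelease

ensembl = EnsemblRelease(75)  # release number chosen only for illustration
transcripts = ensembl.transcripts_at_locus("chr12", 125621282)

for transcript in transcripts:
    # spliced_offset raises ValueError for positions outside every exon,
    # so check exon bounds first (or wrap the call in try/except ValueError).
    if any(exon.start <= 125621282 <= exon.end for exon in transcript.exons):
        print(transcript.id, transcript.spliced_offset(125621282))
    else:
        print(transcript.id, "locus is intronic for this transcript")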
import org.apache.spark.{SparkConf, SparkContext}

object SimpleSparkApp {
  def main(args: Array[String]): Unit = {
    // Run locally with four worker threads.
    val config: SparkConf = new SparkConf()
    config.setMaster("local[4]")
    config.setAppName("MySimpleSparkApp")
    val sc = new SparkContext(config)
    sc.stop()
  }
}
#! /bin/sh
SCRIPT_NAME=$(basename "$0")

short_usage()
{
    echo "Usage: $SCRIPT_NAME [--help]"
}

long_usage()
{
chr1:14416-14499,chr1:14510-14596,chr1:14693-14782,chr1:14853-14972,chr1:15033-15128,chr1:15632-15900,chr1:15922-16004,chr1:16562-16857,chr1:16914-17012,chr1:17272-17354,chr1:17424-17499,chr1:17567-17696,chr1:17752-17830,chr1:17883-17965,chr1:17997-18130,chr1:18154-18442,chr1:19051-19242,chr1:20523-20708,chr1:24441-24674,chr1:24698-24779,chr1:24802-24945,chr1:29178-29425,chr1:30249-30543,chr1:35079-35286,chr1:35289-35371,chr1:35677-35856,chr1:62953-63130,chr1:63136-63264,chr1:63352-63431,chr1:63535-63697,chr1:65447-65697,chr1:69036-69193,chr1:69230-69384,chr1:69421-69557,chr1:69576-69647,chr1:69667-69778,chr1:69787-69934,chr1:69941-70018,chr1:112640-112731,chr1:120796-120881,chr1:135148-135289,chr1:135712-135807,chr1:135815-136019,chr1:136084-136180,chr1:137370-137557,chr1:137625-137721,chr1:173694-173919,chr1:228206-228442,chr1:228467-228545,chr1:228615-228727,chr1:258957-259048,chr1:267027-267103,chr1:267117-267202,chr1:326427-326544,chr1:326561-326761,chr1:327136-327210,chr1:327236-327310,chr1:327322-32742
case taskNumberRegionPairs1 :: taskNumberRegionPairs2 :: Nil => {
  val rdd1Records = sc.accumulator(0L, "rdd1.records")
  val rdd2Records = sc.accumulator(0L, "rdd2.records")
  val lociAccum = sc.accumulator(0L, "rdd.task.loci")

  // Cogroup-based implementation: pair the two keyed RDDs with the same
  // partitioner, then shuffle-sort by TaskPosition within each partition.
  val partitioned = taskNumberRegionPairs1.cogroup(taskNumberRegionPairs2, new PartitionByKey(numTasks.toInt))
  val sorted = new ShuffledRDD[TaskPosition, (Iterable[M], Iterable[M]), (Iterable[M], Iterable[M])](
    partitioned,
    new PartitionByKey(numTasks.toInt))
    .setKeyOrdering(implicitly[Ordering[TaskPosition]])
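For reference, a minimal PySpark sketch of the same cogroup-then-sort-within-partitions pattern; the keyed RDDs, keys, and partitioning function below are illustrative assumptions, not Guacamole's actual types:

from pyspark import SparkContext

sc = SparkContext("local[2]", "cogroup-sketch")

# Hypothetical (task, position) keys mapping to reads from two samples.
rdd1 = sc.parallelize([((0, 100), "read_a"), ((0, 200), "read_b")])
rdd2 = sc.parallelize([((0, 100), "read_c"), ((1, 50), "read_d")])

num_tasks = 2

# Cogroup both RDDs into the same partitions, then sort by key within each
# partition so both sides can be walked in position order per task.
grouped = rdd1.cogroup(rdd2, numPartitions=num_tasks)
sorted_groups = grouped.repartitionAndSortWithinPartitions(
    numPartitions=num_tasks,
    partitionFunc=lambda key: key[0] % num_tasks)

for key, (left, right) in sorted_groups.collect():
    print(key, list(left), list(right))

sc.stop()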
package edu.berkeley.cs.amplab.adam.cli

import java.io.File
import java.util.logging.Level

import edu.berkeley.cs.amplab.adam.models.ADAMVariantContext
import edu.berkeley.cs.amplab.adam.rdd.AdamContext._
import edu.berkeley.cs.amplab.adam.rdd.variation.ADAMVariationContext._
import edu.berkeley.cs.amplab.adam.util.ParquetLogger
import org.apache.hadoop.mapreduce.Job
events = LOAD 'data' USING PigStorage() AS (timestamp:int, event_id:int, advertiser_id:int);
advertiser_data = LOAD 'advertiser_data' USING PigStorage() AS (id:int, name:chararray, campaign:chararray);
events_w_metadata = JOIN events BY advertiser_id, advertiser_data BY id;