Aaron Quinlan arq5x

## ggplot2.tcga_and_1kg_cpv.R
library(ggplot2)
library(gridExtra)
cov <- read.table("/Users/arq5x/Documents/Projects/HallLab/TCGA-1KG/ForKeystone/tcga_and_1kg_span_cov.txt",header=TRUE)
span <- qplot(sample, span_cov, data=cov, fill=factor(type_num), geom="bar",
             binwidth=1,
             xlab="Sample",
             ylab="Spanning coverage") +
        opts(axis.ticks = theme_blank(),
             axis.text.x = theme_blank(),
             axis.title.x = theme_text(size = 18, face = "bold"),

## rs-exome-pbs.sh
export BATCH1="1094PC0005 1094PC0009 1094PC0012 1094PC0013 "
export BATCH2="1094PC0016 1094PC0017 1094PC0018 1094PC0019 \
               1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025 "
export BATCH3="1478PC0001B 1478PC0002 1478PC0003 1478PC0004 \
               1478PC0005 1478PC0006B 1478PC0007B 1478PC0008B \
               1478PC0009B 1478PC0010 1478PC0011 1478PC0012 \
               1478PC0013B 1478PC0014B 1478PC0015B 1478PC0016 \
               1478PC0017B 1478PC0018 1478PC0019 1478PC0020 \
               1478PC0021 1478PC0022B 1478PC0023B 1478PC0024B"
export BATCH4="1719PC0001 1719PC0002 1719PC0003 1719PC0004 \

## t1d-exome.hg19.sh


############################################################
# Pair the alignments.
# Keep proper, on-target (i.e. +/- 500 bp of a probe) pairs.
# Require mapping quality >= 20
############################################################
export DIR=/home/arq5x/cphg-home/projects/t1d/t1d-exome-suna/
export STEPNAME=t1d-ex-bwa-par
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa

## dbsnp-to-bed.sh
export GENOME=hg19
export SNPBUILD=131
curl -s http://hgdownload.cse.ucsc.edu/goldenPath/$GENOME/database/snp$SNPBUILD.txt.gz | \
        zcat | \
        cut -f 2,3,4,5,6,7,10,16 > dbsnp.$SNPBUILD.$GENOME.bed

head dbsnp.$SNPBUILD.$GENOME.bed
chr1	10433	10433	rs56289060	0	+	-/C	near-gene-5
chr1	10491	10492	rs55998931	0	+	C/T	near-gene-5
chr1	10518	10519	rs62636508	0	+	C/G	near-gene-5

## transcripts-w-groupBy.sh
# Step 1: Get transcripts from UCSC refGene (hg19) into a BED file.
# Notes:
#      the awk statement reorders the "raw" columns into BED12 format
#      bed12ToBed6 converts the BED12 into discrete BED6 entries for each exon
#           - the -n option is new and in the bedtools repository
$ curl -s http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz | \
      zcat | \
      awk '{OFS="\t"; print $3,$5,$6,$2,$9,$4,$7,$8,"0",$9,$10,$11}' | \
      bed12ToBed6 -n \
      > refGene.bed

## paired-fastq-subset.sh
# Starting FASTQ files
export FQ1=1.fq
export FQ2=2.fq

# The names of the random subsets
export FQ1SUBSET=1.rand.fq
export FQ2SUBSET=2.rand.fq

# How many random pairs do we want?
export N=100

## groupBy-exons-to-cDNA.sh
###################################################################
# Assume we have a file of BED exons for every gene and transcript.
# The exons are listed in genomic order for each gene/transcript
###################################################################
$ head -n 5 exons.bed
chr1	1337462	1337636	MRPL20	exon1	-
chr1	1340996	1341266	MRPL20	exon2	-
chr1	1341188	1341266	MRPL20	exon3	-
chr1	1342288	1342399	MRPL20	exon4	-
chr1	1342510	1342597	MRPL20	exon5	-

## multi_bam_cov.cpp
void MultiCovBam::CollectCoverage()
{
    BamMultiReader reader;

    if ( !reader.Open(_bam_files) )
    {
        cerr << "Could not open input BAM files." << endl; return;
    }
    else
    {

## main-pipeline.sh
############################################################
# Index the position-sorted BAM files.
############################################################
export SAMPLES="T10AA T10AB T10D T10H"
export TUMHOME=/home/arq5x/cphg-home/projects/navin-tumor-heterogeneity/
export STEPNAME="bam-index"
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
      echo "cd $TUMHOME; samtools index bam/$sample.*.bam" | $QSUB

## basic-data-analysis.R
########################################
# 1. Counting the discrete occurrences
#    of a value in each column of a
#    matrix. Store the count for each
#    column in a new vector whose size
#    is the number of columns in the
#    matrix.
########################################
# make a 3x3 matrix with columns
# having 0, 1, and 2 zeros
	library(ggplot2)
	library(gridExtra)
	cov <- read.table("/Users/arq5x/Documents/Projects/HallLab/TCGA-1KG/ForKeystone/tcga_and_1kg_span_cov.txt",header=TRUE)
	span <- qplot(sample, span_cov, data=cov, fill=factor(type_num), geom="bar",
	binwidth=1,
	xlab="Sample",
	ylab="Spanning coverage") +
	opts(axis.ticks = theme_blank(),
	axis.text.x = theme_blank(),
	axis.title.x = theme_text(size = 18, face = "bold"),
	export BATCH1="1094PC0005 1094PC0009 1094PC0012 1094PC0013 "
	export BATCH2="1094PC0016 1094PC0017 1094PC0018 1094PC0019 \
	1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025 "
	export BATCH3="1478PC0001B 1478PC0002 1478PC0003 1478PC0004 \
	1478PC0005 1478PC0006B 1478PC0007B 1478PC0008B \
	1478PC0009B 1478PC0010 1478PC0011 1478PC0012 \
	1478PC0013B 1478PC0014B 1478PC0015B 1478PC0016 \
	1478PC0017B 1478PC0018 1478PC0019 1478PC0020 \
	1478PC0021 1478PC0022B 1478PC0023B 1478PC0024B"
	export BATCH4="1719PC0001 1719PC0002 1719PC0003 1719PC0004 \


	############################################################
	# Pair the alignments.
	# Keep proper, on-target (i.e. +/- 500 bp of a probe) pairs.
	# Require mapping quality >= 20
	############################################################
	export DIR=/home/arq5x/cphg-home/projects/t1d/t1d-exome-suna/
	export STEPNAME=t1d-ex-bwa-par
	export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
	export GENOME=hg19
	export SNPBUILD=131
	curl -s http://hgdownload.cse.ucsc.edu/goldenPath/$GENOME/database/snp$SNPBUILD.txt.gz \| \
	zcat \| \
	cut -f 2,3,4,5,6,7,10,16 > dbsnp.$SNPBUILD.$GENOME.bed

	head dbsnp.$SNPBUILD.$GENOME.bed
	chr1 10433 10433 rs56289060 0 + -/C near-gene-5
	chr1 10491 10492 rs55998931 0 + C/T near-gene-5
	chr1 10518 10519 rs62636508 0 + C/G near-gene-5
	# Step 1: Get transcripts from UCSC refGene (hg19) into a BED file.
	# Notes:
	# the awk statement reorders the "raw" columns into BED12 format
	# bed12ToBed6 converts the BED12 into discrete BED6 entries for each exon
	# - the -n option is new and in the bedtools repository
	$ curl -s http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz \| \
	zcat \| \
	awk '{OFS="\t"; print $3,$5,$6,$2,$9,$4,$7,$8,"0",$9,$10,$11}' \| \
	bed12ToBed6 -n \
	> refGene.bed
	# Starting FASTQ files
	export FQ1=1.fq
	export FQ2=2.fq

	# The names of the random subsets
	export FQ1SUBSET=1.rand.fq
	export FQ2SUBSET=2.rand.fq

	# How many random pairs do we want?
	export N=100
	###################################################################
	# Assume we have a file of BED exons for every gene and transcript.
	# The exons are listed in genomic order for each gene/transcript
	###################################################################
	$ head -n 5 exons.bed
	chr1 1337462 1337636 MRPL20 exon1 -
	chr1 1340996 1341266 MRPL20 exon2 -
	chr1 1341188 1341266 MRPL20 exon3 -
	chr1 1342288 1342399 MRPL20 exon4 -
	chr1 1342510 1342597 MRPL20 exon5 -
	void MultiCovBam::CollectCoverage()
	{
	BamMultiReader reader;

	if ( !reader.Open(_bam_files) )
	{
	cerr << "Could not open input BAM files." << endl; return;
	}
	else
	{
	############################################################
	# Index the position-sorted BAM files.
	############################################################
	export SAMPLES="T10AA T10AB T10D T10H"
	export TUMHOME=/home/arq5x/cphg-home/projects/navin-tumor-heterogeneity/
	export STEPNAME="bam-index"
	for sample in `echo $SAMPLES`
	do
	export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
	echo "cd $TUMHOME; samtools index bam/$sample.*.bam" \| $QSUB
	########################################
	# 1. Counting the discrete occurrences
	# of a value in each column of a
	# matrix. Store the count for each
	# column in a new vector whose size
	# is the number of columns in the
	# matrix.
	########################################
	# make a 3x3 matrix with columns
	# having 0, 1, and 2 zeros