Skip to content

Instantly share code, notes, and snippets.

View arq5x's full-sized avatar

Aaron Quinlan arq5x

View GitHub Profile
@arq5x
arq5x / main-pipeline.sh
Created June 16, 2011 18:39
Navin Main Processing
############################################################
# Index the position-sorted BAM files.
############################################################
export SAMPLES="T10AA T10AB T10D T10H"
export TUMHOME=/home/arq5x/cphg-home/projects/navin-tumor-heterogeneity/
export STEPNAME="bam-index"
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $TUMHOME; samtools index bam/$sample.*.bam" | $QSUB
import sys
class Line(object):
def __init__(self, line):
self.fields = line.split('\t')
self.chrom = self.fields[0]
self.start = int(self.fields[1])
self.end = int(self.fields[2])
self.depth = int(self.fields[3])
@arq5x
arq5x / dbsnp-to-bed.sh
Created March 7, 2011 23:19
Make a BED file from the raw UCSC file
# Build a BED-style file from the raw UCSC dbSNP table dump.
# GENOME and SNPBUILD select which UCSC table is downloaded and are also
# used to name the output file: dbsnp.<SNPBUILD>.<GENOME>.bed
export GENOME=hg19
export SNPBUILD=131
# -f makes curl exit non-zero on an HTTP error instead of silently piping the
# server's error page into zcat; -S keeps the error message visible even
# though -s suppresses the progress meter.
curl -fsS "http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME}/database/snp${SNPBUILD}.txt.gz" | \
zcat | \
cut -f 2,3,4,5,6,7,10,16 > "dbsnp.${SNPBUILD}.${GENOME}.bed"
# Show the first records of the result as a quick sanity check.
head "dbsnp.${SNPBUILD}.${GENOME}.bed"
chr1 10433 10433 rs56289060 0 + -/C near-gene-5
chr1 10491 10492 rs55998931 0 + C/T near-gene-5
chr1 10518 10519 rs62636508 0 + C/G near-gene-5
@arq5x
arq5x / complexity.py
Last active February 6, 2019 21:14
kmer fun with jellyfish
import sys
from itertools import *
"""
compute the complexity of each kmer passed in
given the format of the output of `jellyfish dump -ct`
complexity is measured as the number of runs divided
by the total length of the sequence.
e.g., "AAAAA" would be 1/5
and "ACTGC" would be 5/5
@arq5x
arq5x / example.sh
Last active January 24, 2019 13:02
Natural sort a VCF
# Make the sort script executable, then run it on the example VCF.
chmod a+x vcfsort.sh
# Invoke through ./ because the script lives in the current directory, which
# is normally not on PATH; a bare "vcfsort.sh" would not be found.
./vcfsort.sh trio.trim.vep.vcf.gz
@arq5x
arq5x / grantham-dict.py
Last active November 13, 2018 15:25
Convert Grantham Amino Acid matrix into Python dict.
#!/usr/bin/env python
import sys
import pprint
def make_grantham_dict(grantham_mat_file):
"""
Citation: http://www.ncbi.nlm.nih.gov/pubmed/4843792
Provenance: http://www.genome.jp/dbget-bin/www_bget?aaindex:GRAR740104
@arq5x
arq5x / make-unified-segmentation.sh
Created September 14, 2012 00:55
ENCODE consensus segmentations
# 1. Get the ENCODE segmentations from EBI.
# consensus: fetch the ".combined.bb" file for each cell line, in order.
segmentation_url=http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/awgHub/byDataType/segmentations/jan2011
for cell_line in gm12878 h1hesc helas3 hepg2 huvec k562; do
  wget "${segmentation_url}/${cell_line}.combined.bb"
done
# Segway (ahem; https://twitter.com/michaelhoffman/status/246679147164880897)
@arq5x
arq5x / diff-file.sh
Created April 13, 2012 18:17
VCF->TPED-PLINK-BED
# Create SNPs-only subset of the 1000G callset VCF
$ curl -s http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20120316_phase1_integrated_release_version2/ALL.chr22.phase1_release_v2.20101123.snps_indels_svs.vcf.gz | \
zcat | \
awk '$0 ~ /^#/ || $0 ~ "VT=SNP"' | \
gzip > ALL.chr22.phase1_release_v2.20101123.snps_indels_svs.snpsonly.vcf.gz
# Use VCFTools to convert to TPED
$ vcftools
VCFtools (v0.1.8)
@arq5x
arq5x / paired-fastq-subset.sh
Created March 17, 2011 14:27
Grab random subset of FASTQ pairs
# Starting FASTQ files
export FQ1=1.fq
export FQ2=2.fq
# The names of the random subsets
export FQ1SUBSET=1.rand.fq
export FQ2SUBSET=2.rand.fq
# How many random pairs do we want?
export N=100
@arq5x
arq5x / methods.sh
Last active March 14, 2018 20:05
breast-cancer-evolution-cnv-segmentation
# bedtools --version
# bedtools v2.24.0-14-gaa11ef9
########################################################
# Create a BED file of 5kb windows with 2.5kb overlap
# tiling build 37 (hg19) of the human genome
########################################################
bedtools makewindows -g hg19.txt -w 5000 -s 2500 > hg19.w5k.s2.5k.bedg
########################################################