Skip to content

Instantly share code, notes, and snippets.

@arq5x
Last active March 14, 2018 20:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save arq5x/657a4c36bc6195b041b0 to your computer and use it in GitHub Desktop.
Save arq5x/657a4c36bc6195b041b0 to your computer and use it in GitHub Desktop.
breast-cancer-evolution-cnv-segmentation
# bedtools --version
# bedtools v2.24.0-14-gaa11ef9
########################################################
# Create a BED file of 5kb windows with 2.5kb overlap
# tiling build 37 (hg19) of the human genome
########################################################
bedtools makewindows -g hg19.txt -w 5000 -s 2500 > hg19.w5k.s2.5k.bedg
########################################################
# Count the number of reads aligned to each overlapping
# 5kb genomic window for each sample.
########################################################
bedtools intersect -a hg19.w5k.s2.5k.bedg \
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B0.bam \
-sorted -c -g hg19.txt \
> B0.w5k.s2.5k.conc.bedg
bedtools intersect -a hg19.w5k.s2.5k.bedg \
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B1.bam \
-sorted -c -g hg19.txt \
> B1.w5k.s2.5k.conc.bedg
bedtools intersect -a hg19.w5k.s2.5k.bedg \
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B2.bam \
-sorted -c -g hg19.txt \
> B2.w5k.s2.5k.conc.bedg
bedtools intersect -a hg19.w5k.s2.5k.bedg \
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B3.bam \
-sorted -c -g hg19.txt \
> B3.w5k.s2.5k.conc.bedg
bedtools intersect -a hg19.w5k.s2.5k.bedg \
-b /scratch/ucgd/lustre/Projects/WashU/15-05-21_Marth_Quinlan_Breast_Cancer/alignment/B4.bam \
-sorted -c -g hg19.txt \
> B4.w5k.s2.5k.conc.bedg
########################################################
# restrict depth calls to chrom 1 through X
########################################################
awk 'length($1) <=2' B0.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B0.w5k.s2.5k.conc.chr1-chrX.bedg
awk 'length($1) <=2' B1.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B1.w5k.s2.5k.conc.chr1-chrX.bedg
awk 'length($1) <=2' B2.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B2.w5k.s2.5k.conc.chr1-chrX.bedg
awk 'length($1) <=2' B3.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B3.w5k.s2.5k.conc.chr1-chrX.bedg
awk 'length($1) <=2' B4.w5k.s2.5k.conc.bedg | grep -v "^MT" | grep -v "^Y" > B4.w5k.s2.5k.conc.chr1-chrX.bedg
########################################################
# Prefix chromosomes with chr
########################################################
awk '{print "chr"$0}' B3.w5k.s2.5k.conc.chr1-chrX.bedg > B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg
awk '{print "chr"$0}' B4.w5k.s2.5k.conc.chr1-chrX.bedg > B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg
awk '{print "chr"$0}' B2.w5k.s2.5k.conc.chr1-chrX.bedg > B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg
awk '{print "chr"$0}' B1.w5k.s2.5k.conc.chr1-chrX.bedg > B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg
awk '{print "chr"$0}' B0.w5k.s2.5k.conc.chr1-chrX.bedg > B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg
########################################################
# Normalize each samples coverage by GC content
# Uses the codachrom package:
# https://github.com/arq5x/codachrom
########################################################
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa
python codachrom/windowed_coverage.py -w 5000 -s 2500 -c B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg -f hg19.fa
########################################################
# Compute log2 ratios of one sample's (-1) normalized
# coverage versus another's (-2).
########################################################
python codachrom/log_ratio.py -1 B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B1vB0.log2.bedg &
python codachrom/log_ratio.py -1 B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B2vB0.log2.bedg &
python codachrom/log_ratio.py -1 B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B3vB0.log2.bedg &
python codachrom/log_ratio.py -1 B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B0.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B4vB0.log2.bedg &
python codachrom/log_ratio.py -1 B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B1.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B2vB1.log2.bedg &
python codachrom/log_ratio.py -1 B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B2.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B3vB2.log2.bedg &
python codachrom/log_ratio.py -1 B4.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt -2 B3.w5k.s2.5k.conc.chr1-chrX.chromlabels.bedg.depth.txt > B4vB3.log2.bedg &
# segment the log2 ratios using DNAcopy
# see details in seg.R script
########################################################
# convert DNAcopy output to bedgraph
########################################################
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b1b0.txt > segs.b1b0.bedg
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b2b0.txt > segs.b2b0.bedg
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b3b0.txt > segs.b3b0.bedg
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b4b0.txt > segs.b4b0.bedg
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b2b1.txt > segs.b2b1.bedg
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b3b2.txt > segs.b3b2.bedg
awk '{OFS="\t"; print $3,$4,$5,$7,$6,$1}' segs.b4b3.txt > segs.b4b3.bedg
########################################################
# Identify segments with a mean log2 ratio of > 0.4
# or less than -0.4 and that are at least 15kb in size
########################################################
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b1b0.bedg > segs.b1b0.cnvs.bedg
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b2b0.bedg > segs.b2b0.cnvs.bedg
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b3b0.bedg > segs.b3b0.cnvs.bedg
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b4b0.bedg > segs.b4b0.cnvs.bedg
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b2b1.bedg > segs.b2b1.cnvs.bedg
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b3b2.bedg > segs.b3b2.cnvs.bedg
awk '($4>=0.4 || $4<=-0.4) && ($5 >=5)' segs.b4b3.bedg > segs.b4b3.cnvs.bedg
########################################################
# merge segments that are within 250Kb of one another.
########################################################
sort -k1,1 -k2,2n segs.b3b2.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b3b2.cnvs.merged.bedg
sort -k1,1 -k2,2n segs.b2b1.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b2b1.cnvs.merged.bedg
sort -k1,1 -k2,2n segs.b1b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b1b0.cnvs.merged.bedg
sort -k1,1 -k2,2n segs.b2b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b2b0.cnvs.merged.bedg
sort -k1,1 -k2,2n segs.b3b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b3b0.cnvs.merged.bedg
sort -k1,1 -k2,2n segs.b4b0.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b4b0.cnvs.merged.bedg
sort -k1,1 -k2,2n segs.b4b3.cnvs.bedg | bedtools merge -i - -d 250000 -c 4,5 -o mean,sum > segs.b4b3.cnvs.merged.bedg
########################################################
# get protein coding, known GENCODE genes in
# BED format (downloaded from UCSC)
########################################################
grep protein_coding gencode.bed | grep KNOWN | sort -k1,1 -k2,2n > gencode.prot.known.sorted.bed
########################################################
# collect a gene list for each putative CNV
########################################################
bedtools intersect -a segs.b2b1.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b2b1.cnvs.merged.genelist.bedgraph
bedtools intersect -a segs.b1b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b1b0.cnvs.merged.genelist.bedgraph
bedtools intersect -a segs.b2b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b2b0.cnvs.merged.genelist.bedgraph
bedtools intersect -a segs.b3b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b3b0.cnvs.merged.genelist.bedgraph
bedtools intersect -a segs.b4b0.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b4b0.cnvs.merged.genelist.bedgraph
bedtools intersect -a segs.b3b2.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b3b2.cnvs.merged.genelist.bedgraph
bedtools intersect -a segs.b4b3.cnvs.merged.bedg -b gencode.prot.known.sorted.bed -wa -wb | bedtools groupby -i - -g 1-5 -c 9 -o distinct > segs.b4b3.cnvs.merged.genelist.bedgraph
########################################################
# bgzip and tabix for IGV
########################################################
bgzip segs.b1b0.cnvs.merged.genelist.bedgraph
bgzip segs.b2b1.cnvs.merged.genelist.bedgraph
bgzip segs.b3b2.cnvs.merged.genelist.bedgraph
bgzip segs.b4b3.cnvs.merged.genelist.bedgraph
tabix -p bed segs.b1b0.cnvs.merged.genelist.bedgraph.gz
tabix -p bed segs.b2b1.cnvs.merged.genelist.bedgraph.gz
tabix -p bed segs.b3b2.cnvs.merged.genelist.bedgraph.gz
tabix -p bed segs.b4b3.cnvs.merged.genelist.bedgraph.gz
library("DNAcopy")
b1b0 <- read.table('B1vB0.log2.bedg')
b2b0 <- read.table('B2vB0.log2.bedg')
b3b0 <- read.table('B3vB0.log2.bedg')
b4b0 <- read.table('B4vB0.log2.bedg')
b2b1 <- read.table('B2vB1.log2.bedg')
b3b2 <- read.table('B3vB2.log2.bedg')
b4b3 <- read.table('B4vB3.log2.bedg')
###########################################
# Load into DNAcopy objects and smooth
###########################################
CNA.object.b1b0 <- CNA( genomdat = b1b0[,4], chrom = b1b0[,1], maploc = b1b0[,2]+(b1b0[,3]-b1b0[,2])/2, data.type = 'logratio', sampleid="b1b0")
CNA.object.b2b0 <- CNA( genomdat = b2b0[,4], chrom = b2b0[,1], maploc = b2b0[,2]+(b2b0[,3]-b2b0[,2])/2, data.type = 'logratio', sampleid="b2b0")
CNA.object.b3b0 <- CNA( genomdat = b3b0[,4], chrom = b3b0[,1], maploc = b3b0[,2]+(b3b0[,3]-b3b0[,2])/2, data.type = 'logratio', sampleid="b3b0")
CNA.object.b4b0 <- CNA( genomdat = b4b0[,4], chrom = b4b0[,1], maploc = b4b0[,2]+(b4b0[,3]-b4b0[,2])/2, data.type = 'logratio', sampleid="b4b0")
CNA.object.b2b1 <- CNA( genomdat = b2b1[,4], chrom = b2b1[,1], maploc = b2b1[,2]+(b2b1[,3]-b2b1[,2])/2, data.type = 'logratio', sampleid="b2b1")
CNA.object.b3b2 <- CNA( genomdat = b3b2[,4], chrom = b3b2[,1], maploc = b3b2[,2]+(b3b2[,3]-b3b2[,2])/2, data.type = 'logratio', sampleid="b3b2")
CNA.object.b4b3 <- CNA( genomdat = b4b3[,4], chrom = b4b3[,1], maploc = b4b3[,2]+(b4b3[,3]-b4b3[,2])/2, data.type = 'logratio', sampleid="b4b3")
CNA.smoothed.b1b0 <- smooth.CNA(CNA.object.b1b0)
CNA.smoothed.b2b0 <- smooth.CNA(CNA.object.b2b0)
CNA.smoothed.b3b0 <- smooth.CNA(CNA.object.b3b0)
CNA.smoothed.b4b0 <- smooth.CNA(CNA.object.b4b0)
CNA.smoothed.b2b1 <- smooth.CNA(CNA.object.b2b1)
CNA.smoothed.b3b2 <- smooth.CNA(CNA.object.b3b2)
CNA.smoothed.b4b3 <- smooth.CNA(CNA.object.b4b3)
###########################################
# Segment
###########################################
segs.b1b0 <- segment(CNA.smoothed.b1b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
segs.b2b0 <- segment(CNA.smoothed.b2b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
segs.b3b0 <- segment(CNA.smoothed.b3b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
segs.b4b0 <- segment(CNA.smoothed.b4b0, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
segs.b2b1 <- segment(CNA.smoothed.b2b1, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
segs.b3b2 <- segment(CNA.smoothed.b3b2, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
segs.b4b3 <- segment(CNA.smoothed.b4b3, undo.splits = "sdundo", undo.SD = 1.5, verbose = 1, min.width=5)
###########################################
# Make individual plots for each chrom and sample
###########################################
chroms <- c('chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',
'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
'chr9', 'chrX')
for (chrom in chroms)
{
title <- paste("b1b0", ".", chrom, ".pdf", sep="")
pdf(file=title)
plotSample(segs.b1b0, chromlist = chrom, ylim=c(-3,3), main=title)
dev.off()
title <- paste("b2b1", ".", chrom, ".pdf", sep="")
pdf(file=title)
plotSample(segs.b2b1, chromlist = chrom, ylim=c(-3,3), main=title)
dev.off()
title <- paste("b3b2", ".", chrom, ".pdf", sep="")
pdf(file=title)
plotSample(segs.b3b2, chromlist = chrom, ylim=c(-3,3), main=title)
dev.off()
title <- paste("b4b3", ".", chrom, ".pdf", sep="")
pdf(file=title)
plotSample(segs.b4b3, chromlist = chrom, ylim=c(-3,3), main=title)
dev.off()
}
###########################################
# Write the CN segments to a tab-delimited file
###########################################
write.table(segs.b1b0$output, file="segs.b1b0.txt", row.names=T, col.names=F, quote=F, sep="\t")
write.table(segs.b2b0$output, file="segs.b2b0.txt", row.names=T, col.names=F, quote=F, sep="\t")
write.table(segs.b3b0$output, file="segs.b3b0.txt", row.names=T, col.names=F, quote=F, sep="\t")
write.table(segs.b4b0$output, file="segs.b4b0.txt", row.names=T, col.names=F, quote=F, sep="\t")
write.table(segs.b2b1$output, file="segs.b2b1.txt", row.names=T, col.names=F, quote=F, sep="\t")
write.table(segs.b3b2$output, file="segs.b3b2.txt", row.names=T, col.names=F, quote=F, sep="\t")
write.table(segs.b4b3$output, file="segs.b4b3.txt", row.names=T, col.names=F, quote=F, sep="\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment