Calculate Depth of Coverage and Breadth of Coverage from a bam file. This function calculates by chromsome and for the entire genome. Additionally, if the mtchr (Mitochondrial chromosome name) is provided, nuclear coverage and the ratio of mtDNA:nuclear DNA is calculated. #bam #stats
# | |
# This script calculates the depth of coverage and breadth of coverage for a given bam. | |
# Outputs a dictionary containing the contig/chromosome names and the depth and breadth of coverage for each | |
# and for the entire genome. | |
# | |
# If you optionally specify the name of the mitochondrial chromosome (e.g. mtDNA, chrM, chrMT) | |
# The script will also generate breadth and depth of coverage for the nuclear genome AND the ratio | |
# of mtDNA:nuclearDNA; which can act as a proxy in some cases for mitochondrial count within an individual. | |
# | |
# Author: Daniel E. Cook | |
# Website: Danielecook.com | |
# | |
import os | |
import re | |
from subprocess import Popen, PIPE | |
def get_contigs(bam): | |
header, err = Popen(["samtools","view","-H",bam], stdout=PIPE, stderr=PIPE).communicate() | |
if err != "": | |
raise Exception(err) | |
# Extract contigs from header and convert contigs to integers | |
contigs = {} | |
for x in re.findall("@SQ\WSN:(?P<chrom>[A-Za-z0-9_]*)\WLN:(?P<length>[0-9]+)", header): | |
contigs[x[0]] = int(x[1]) | |
return contigs | |
def coverage(bam, mtchr = None): | |
# Check to see if file exists | |
if os.path.isfile(bam) == False: | |
raise Exception("Bam file does not exist") | |
contigs = get_contigs(bam) | |
# Guess mitochondrial chromosome | |
mtchr = [x for x in contigs if x.lower().find("m") == 0] | |
if len(mtchr) != 1: | |
mtchr = None | |
else: | |
mtchr = mtchr[0] | |
coverage_dict = {} | |
for c in contigs.keys(): | |
command = "samtools depth -r %s %s | awk '{sum+=$3;cnt++}END{print cnt \"\t\" sum}'" % (c, bam) | |
coverage_dict[c] = {} | |
coverage_dict[c]["Bases Mapped"], coverage_dict[c]["Sum of Depths"] = map(int,Popen(command, stdout=PIPE, shell = True).communicate()[0].strip().split("\t")) | |
coverage_dict[c]["Breadth of Coverage"] = coverage_dict[c]["Bases Mapped"] / float(contigs[c]) | |
coverage_dict[c]["Depth of Coverage"] = coverage_dict[c]["Sum of Depths"] / float(contigs[c]) | |
coverage_dict[c]["Length"] = int(contigs[c]) | |
# Calculate Genome Wide Breadth of Coverage and Depth of Coverage | |
genome_length = float(sum(contigs.values())) | |
coverage_dict["genome"] = {} | |
coverage_dict["genome"]["Length"] = int(genome_length) | |
coverage_dict["genome"]["Bases Mapped"] = sum([x["Bases Mapped"] for k, x in coverage_dict.iteritems() if k != "genome"]) | |
coverage_dict["genome"]["Sum of Depths"] = sum([x["Sum of Depths"] for k, x in coverage_dict.iteritems() if k != "genome"]) | |
coverage_dict["genome"]["Breadth of Coverage"] = sum([x["Bases Mapped"] for k, x in coverage_dict.iteritems() if k != "genome"]) / float(genome_length) | |
coverage_dict["genome"]["Depth of Coverage"] = sum([x["Sum of Depths"] for k, x in coverage_dict.iteritems() if k != "genome"]) / float(genome_length) | |
if mtchr != None: | |
# Calculate nuclear breadth of coverage and depth of coverage | |
ignore_contigs = [mtchr, "genome", "nuclear"] | |
coverage_dict["nuclear"] = {} | |
coverage_dict["nuclear"]["Length"] = sum([x["Length"] for k,x in coverage_dict.iteritems() if k not in ignore_contigs ]) | |
coverage_dict["nuclear"]["Bases Mapped"] = sum([x["Bases Mapped"] for k, x in coverage_dict.iteritems() if k not in ignore_contigs]) | |
coverage_dict["nuclear"]["Sum of Depths"] = sum([x["Sum of Depths"] for k, x in coverage_dict.iteritems() if k not in ignore_contigs]) | |
coverage_dict["nuclear"]["Breadth of Coverage"] = sum([x["Bases Mapped"] for k, x in coverage_dict.iteritems() if k not in ignore_contigs]) / float(coverage_dict["nuclear"]["Length"]) | |
coverage_dict["nuclear"]["Depth of Coverage"] = sum([x["Sum of Depths"] for k, x in coverage_dict.iteritems() if k not in ignore_contigs]) / float(coverage_dict["nuclear"]["Length"]) | |
# Calculate the ratio of mtDNA depth to nuclear depth | |
coverage_dict["genome"]["mt_ratio"] = coverage_dict[mtchr]["Depth of Coverage"] / float(coverage_dict["nuclear"]["Depth of Coverage"]) | |
# Flatten Dictionary | |
coverage = [] | |
for k,v in coverage_dict.items(): | |
for x in v.items(): | |
coverage += [(k,x[0], x[1])] | |
return coverage |
This comment has been minimized.
This comment has been minimized.
how is working? looks really interesting |
This comment has been minimized.
This comment has been minimized.
@danielecook, that's a very interesting script! But I cannot run it, maybe I'm doing something bad, I don't know. Could you help me? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
What is syntac to run this script?