Skip to content

Instantly share code, notes, and snippets.

@suqingdong
Created October 14, 2016 14:40
Show Gist options
  • Save suqingdong/085b4677bed6f0749b9c14f6829fc82e to your computer and use it in GitHub Desktop.
Save suqingdong/085b4677bed6f0749b9c14f6829fc82e to your computer and use it in GitHub Desktop.
depth and coverage statistics by each gene
#!/usr/bin/env python
import re
# One gene may exist in different chromsomes
# One position may belong diffrent genes
# geneList format:
# {'genename1': {
# "chr1": [(start1,stop1), (start2, stop2)],
# "chr2": [(start1,stop1), (start2, stop2)]
# },
# ...
# }
def getGeneList(infile):
geneList = {}
with open(infile) as f:
for eachline in f:
if eachline.startswith('chr'):
chrom = eachline.split()[0]
start = int(eachline.split()[1])
stop = int(eachline.split()[2])
genenames = re.findall(r'entg\|(.*?),', eachline)
for genename in genenames:
if genename not in geneList:
geneList[genename] = { chrom: [(start, stop)] }
elif chrom not in geneList[genename]:
geneList[genename][chrom] = [(start, stop)]
else:
geneList[genename][chrom].append((start, stop))
return geneList
# Get bed format from geneList
def getGeneBed(geneList):
with open('all_gene.bed', 'w') as out:
for genename in geneList:
for chrom, value in geneList[genename].items():
for start,stop in value:
out.write('{}\t{}\t{}\n'.format(chrom, start, stop))
# posDepthFile comes from result of samtools depth commands
# Output 7 columns: GeneName Chrom TotalLength TotalDepth MeanDepth CoveredBases ProCoveredBases
def depthStatByGene(geneList, posDepthFile):
# get each base's depth
with open(posDepthFile) as f:
posDepthDict = {}
for eachline in f:
pos = int(eachline.strip().split()[1])
depth = int(eachline.strip().split()[2])
posDepthDict[pos] = depth
# statistic each gene's depth
with open('coverage.bygene.xls', 'w') as out:
out.write('GeneName\tChrom\tTotalLength\tTotalDepth\tMeanDepth\tCoveredBases\tProCoveredBases\n')
for genename in geneList:
total_length = 0
total_depth = 0
covered_bases = 0
for chrom, value in geneList[genename].items():
for start, stop in value:
bed_length = stop - start + 1
bed_bases = 0
bed_depth = 0
for i in xrange(start, stop+1):
if i in posDepthDict:
bed_bases += 1
bed_depth += posDepthDict[i]
total_length += bed_length
total_depth += bed_depth
covered_bases += bed_bases
print total_length, total_depth, covered_bases
mean_depth = '%.2f' % (float(total_depth) / total_length)
pro_covered_bases = '%.3f' % (float(covered_bases) / total_length)
out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(genename, chrom, total_length, total_depth, mean_depth, covered_bases, pro_covered_bases))
if __name__ == '__main__':
import sys
import os
if len(sys.argv) < 2:
print "Usage python %s <regrions.bed> <nodup.bam>" % sys.argv[0]
exit(1)
geneList = getGeneList(sys.argv[1])
getGeneBed(geneList)
command = 'samtools depth -b all_gene.bed %s > posDepth.xls' % sys.argv[2]
os.system(command)
print "samtools depth done~"
depthStatByGene(geneList, 'posDepth.xls')
#with open('one_gene_in_different_crhoms.txt', 'w') as out:
# for gene, value in geneList.items():
# if len(value.keys()) > 1:
# out.write('{}\t{}\n'.format(gene, value.keys()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment