Created
October 14, 2016 14:40
-
-
Save suqingdong/085b4677bed6f0749b9c14f6829fc82e to your computer and use it in GitHub Desktop.
depth and coverage statistics by each gene
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
# One gene may exist in different chromsomes | |
# One position may belong diffrent genes | |
# geneList format: | |
# {'genename1': { | |
# "chr1": [(start1,stop1), (start2, stop2)], | |
# "chr2": [(start1,stop1), (start2, stop2)] | |
# }, | |
# ... | |
# } | |
def getGeneList(infile): | |
geneList = {} | |
with open(infile) as f: | |
for eachline in f: | |
if eachline.startswith('chr'): | |
chrom = eachline.split()[0] | |
start = int(eachline.split()[1]) | |
stop = int(eachline.split()[2]) | |
genenames = re.findall(r'entg\|(.*?),', eachline) | |
for genename in genenames: | |
if genename not in geneList: | |
geneList[genename] = { chrom: [(start, stop)] } | |
elif chrom not in geneList[genename]: | |
geneList[genename][chrom] = [(start, stop)] | |
else: | |
geneList[genename][chrom].append((start, stop)) | |
return geneList | |
# Get bed format from geneList | |
def getGeneBed(geneList): | |
with open('all_gene.bed', 'w') as out: | |
for genename in geneList: | |
for chrom, value in geneList[genename].items(): | |
for start,stop in value: | |
out.write('{}\t{}\t{}\n'.format(chrom, start, stop)) | |
# posDepthFile comes from result of samtools depth commands | |
# Output 7 columns: GeneName Chrom TotalLength TotalDepth MeanDepth CoveredBases ProCoveredBases | |
def depthStatByGene(geneList, posDepthFile): | |
# get each base's depth | |
with open(posDepthFile) as f: | |
posDepthDict = {} | |
for eachline in f: | |
pos = int(eachline.strip().split()[1]) | |
depth = int(eachline.strip().split()[2]) | |
posDepthDict[pos] = depth | |
# statistic each gene's depth | |
with open('coverage.bygene.xls', 'w') as out: | |
out.write('GeneName\tChrom\tTotalLength\tTotalDepth\tMeanDepth\tCoveredBases\tProCoveredBases\n') | |
for genename in geneList: | |
total_length = 0 | |
total_depth = 0 | |
covered_bases = 0 | |
for chrom, value in geneList[genename].items(): | |
for start, stop in value: | |
bed_length = stop - start + 1 | |
bed_bases = 0 | |
bed_depth = 0 | |
for i in xrange(start, stop+1): | |
if i in posDepthDict: | |
bed_bases += 1 | |
bed_depth += posDepthDict[i] | |
total_length += bed_length | |
total_depth += bed_depth | |
covered_bases += bed_bases | |
print total_length, total_depth, covered_bases | |
mean_depth = '%.2f' % (float(total_depth) / total_length) | |
pro_covered_bases = '%.3f' % (float(covered_bases) / total_length) | |
out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(genename, chrom, total_length, total_depth, mean_depth, covered_bases, pro_covered_bases)) | |
if __name__ == '__main__': | |
import sys | |
import os | |
if len(sys.argv) < 2: | |
print "Usage python %s <regrions.bed> <nodup.bam>" % sys.argv[0] | |
exit(1) | |
geneList = getGeneList(sys.argv[1]) | |
getGeneBed(geneList) | |
command = 'samtools depth -b all_gene.bed %s > posDepth.xls' % sys.argv[2] | |
os.system(command) | |
print "samtools depth done~" | |
depthStatByGene(geneList, 'posDepth.xls') | |
#with open('one_gene_in_different_crhoms.txt', 'w') as out: | |
# for gene, value in geneList.items(): | |
# if len(value.keys()) > 1: | |
# out.write('{}\t{}\n'.format(gene, value.keys())) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment