Created
October 12, 2016 05:22
-
-
Save suqingdong/ca56c5e17b920812d4d774b85465c325 to your computer and use it in GitHub Desktop.
Get all gene's bed from origin bed file according to the result snp result file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#====================== | |
# Date: 2016-10-10 | |
# Author: suqingdong | |
# Introductions: get gene list from annovar annotation result file, then generate a bed file from origin bed. | |
# Usage: python getGeneBed.py <genelist> <originbed> [outbed] | |
#====================== | |
import re | |
# Get a gene list from snpannovar file | |
def getGeneList(snpannovarfile): | |
geneList = [] | |
with open(snpannovarfile) as f: | |
f.readline() | |
for eachline in f: | |
genename = eachline.split('\t')[7] | |
if ',' in genename: # may more than one gene | |
for genename in genename.split(','): | |
if genename != '.' and genename not in geneList: | |
geneList.append(genename) | |
else: | |
if genename != '.' and genename not in geneList: | |
geneList.append(genename) | |
return geneList | |
# Get gene bed from origin bed file | |
def getGeneBed(originbedfile): | |
geneBedDict = {} | |
with open(originbedfile) as f: | |
for eachline in f: | |
if eachline.startswith('chr'): | |
chr, start, stop = eachline.split('\t')[:3] | |
genenames = re.findall(r'entg\|(.*?),', eachline) | |
if not genenames : | |
continue | |
# the same genename might exist in different chromsomes, | |
# and one bed region might exist in diffrent genes | |
for genename in genenames: | |
if genename not in geneBedDict: | |
geneBedDict[genename] = [{'chr':chr,'start':start, 'stop':stop}] | |
elif geneBedDict[genename][-1]['chr'] == chr: | |
geneBedDict[genename][-1]['stop'] = stop | |
else: | |
geneBedDict[genename].append({'chr':chr,'start':start, 'stop':stop}) | |
return geneBedDict | |
# Generate a new bed file according to the gene list. | |
def saveGeneBed(geneList, geneBedDict, outputfile): | |
with open(outputfile, 'w') as out: | |
with open(re.sub('.bed', '_with_gene.bed', outputfile), 'w') as out2: | |
for genename in geneList: | |
if genename in geneBedDict: | |
for geneDict in geneBedDict[genename]: | |
line = '{}\t{}\t{}'.format(geneDict['chr'], geneDict['start'], geneDict['stop']) | |
out.write(line+'\n') | |
out2.write(line+'\t'+genename+'\n') | |
def main(): | |
snpannovarfile = 'NS0_DNA.samtools.snp.annovar.mm9_multianno.xls' | |
originbedfile = 'S0276129_Regions.bed' | |
geneList = getGeneList(snpannovarfile) | |
geneBedDict = getGeneBed(originbedfile) | |
saveGeneBed(geneList, geneBedDict, 'all_gene.bed') | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment