Skip to content

Instantly share code, notes, and snippets.

@suqingdong
Created October 12, 2016 05:22
Show Gist options
  • Save suqingdong/ca56c5e17b920812d4d774b85465c325 to your computer and use it in GitHub Desktop.
Save suqingdong/ca56c5e17b920812d4d774b85465c325 to your computer and use it in GitHub Desktop.
Get all gene's bed from origin bed file according to the result snp result file
#!/usr/bin/env python
#======================
# Date: 2016-10-10
# Author: suqingdong
# Introductions: get gene list from annovar annotation result file, then generate a bed file from origin bed.
# Usage: python getGeneBed.py <genelist> <originbed> [outbed]
#======================
import re
# Get a gene list from snpannovar file
def getGeneList(snpannovarfile):
geneList = []
with open(snpannovarfile) as f:
f.readline()
for eachline in f:
genename = eachline.split('\t')[7]
if ',' in genename: # may more than one gene
for genename in genename.split(','):
if genename != '.' and genename not in geneList:
geneList.append(genename)
else:
if genename != '.' and genename not in geneList:
geneList.append(genename)
return geneList
# Get gene bed from origin bed file
def getGeneBed(originbedfile):
geneBedDict = {}
with open(originbedfile) as f:
for eachline in f:
if eachline.startswith('chr'):
chr, start, stop = eachline.split('\t')[:3]
genenames = re.findall(r'entg\|(.*?),', eachline)
if not genenames :
continue
# the same genename might exist in different chromsomes,
# and one bed region might exist in diffrent genes
for genename in genenames:
if genename not in geneBedDict:
geneBedDict[genename] = [{'chr':chr,'start':start, 'stop':stop}]
elif geneBedDict[genename][-1]['chr'] == chr:
geneBedDict[genename][-1]['stop'] = stop
else:
geneBedDict[genename].append({'chr':chr,'start':start, 'stop':stop})
return geneBedDict
# Generate a new bed file according to the gene list.
def saveGeneBed(geneList, geneBedDict, outputfile):
with open(outputfile, 'w') as out:
with open(re.sub('.bed', '_with_gene.bed', outputfile), 'w') as out2:
for genename in geneList:
if genename in geneBedDict:
for geneDict in geneBedDict[genename]:
line = '{}\t{}\t{}'.format(geneDict['chr'], geneDict['start'], geneDict['stop'])
out.write(line+'\n')
out2.write(line+'\t'+genename+'\n')
def main():
snpannovarfile = 'NS0_DNA.samtools.snp.annovar.mm9_multianno.xls'
originbedfile = 'S0276129_Regions.bed'
geneList = getGeneList(snpannovarfile)
geneBedDict = getGeneBed(originbedfile)
saveGeneBed(geneList, geneBedDict, 'all_gene.bed')
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment