Created
February 25, 2017 15:18
-
-
Save wdecoster/8204dba7e504725e5bb249ca77bb2788 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Takes in a tsv file containing minimally chromosome begin end to calculate the GC content of the targets | |
# Second argument is the name of the field to take as identifier | |
# Alternative input is a fasta file | |
# wdecoster | |
import sys, os, time | |
from Bio import Entrez, SeqIO | |
Entrez.email = "example@email.com" | |
def getGC(seq): | |
return(str(int(round((sum([1.0 for nucl in seq if nucl in ['G', 'C']]) / len(seq)) * 100)))) | |
def getFas(chrID, start, stop): | |
record = str(SeqIO.read(Entrez.efetch(db="nucleotide", id=chrID, | |
rettype="fasta", strand="1", seq_start=start, seq_stop=stop), "fasta").seq) | |
return([getGC(record), str(len(record)), str(record)]) | |
def processFasta(fas): | |
print("identifier\tGCcontent\tlength") | |
for record in SeqIO.parse(fas, "fasta"): | |
print("{}\t{}\t{}".format(record.id, getGC(str(record.seq)), str(len(record.seq)))) | |
def processBed(bedstream, namefield): | |
hg19_gidict = {'1': "224589800", '2': "224589811", '3': "224589815", '4': "224589816", '5': "224589817", '6': "224589818", | |
'7': "224589819", '8': "224589820", '9': "224589821", '10': "224589801", '11': "224589802", '12': "224589803", '13': "224589804", | |
'14': "224589805", '15': "224589806", '16': "224589807", '17': "224589808", '18': "224589809", '19': "224589810", '20': "224589812", | |
'21': "224589813", '22': "224589814", 'X': "224589822", 'Y': "224589823", 'M': "17981852"} | |
print("identifier\tGCcontent\tlength\tsequence") | |
for line in bedstream: | |
linelist = line.strip().split() | |
print("{}\t{}".format(linelist[namefield], "\t".join(getFas(hg19_gidict[linelist[0].replace('chr','')], linelist[1], linelist[2])))) | |
time.sleep(1) | |
def main(): | |
if not len(sys.argv) in [2,3]: | |
sys.exit("ERROR: Incorrect number of arguments\nEither supply a bed file with header and name of identifier to use, or a fasta file.") | |
with open(sys.argv[1], 'r') as input: | |
for line in input: | |
if line.startswith('#'): | |
continue | |
else: | |
if line.startswith("chromosome\tbegin\tend"): | |
try: | |
namefield = line.strip().split().index(sys.argv[2]) #Find the index of the field to be used as identifier | |
processBed(input, namefield) | |
break | |
except ValueError: | |
sys.exit("Could not find {} in the header".format(sys.argv[2])) | |
elif line.startswith(">"): | |
processFasta(sys.argv[1]) | |
break | |
else: | |
sys.exit("ERROR:\nRequired header not found, expecting minimally 'chromosome' 'begin' 'end' '<identifier>\nAlternatively use a fasta file as input.'") | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment