Created
November 1, 2017 21:07
-
-
Save zkamvar/fd2b61adb6e8da1d3bfdfe9b826403af to your computer and use it in GitHub Desktop.
Find chromosome regions for given SSR GenBank IDs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
# This script will BLAST a series of GenBank IDs representing SSR sequences from | |
# Sirjusingh, C., Kohn, L.M., 2001. Characterization of microsatellites in the | |
# fungal plant pathogen, Sclerotinia sclerotiorum. Molecular Ecology 1, 267e269. | |
# | |
# This uses biopython to download the XML, save it to a file in the working | |
# directory, and then looks for matches that have Sclerotinia chromosome in the | |
# title and has a score of > 100. If there's a better way to do this, I would | |
# like to know it. Currently, it can download a few results at a time and then | |
# it just hangs. | |
from Bio.Blast import NCBIWWW | |
from Bio.Blast import NCBIXML | |
import os | |
import re | |
import sys | |
# Each of these sequences represents the conventional name and GenBank ID for | |
# the SSR genotypes defined in Sirjusingh and Kohn, 2001 | |
seqs = { | |
"41-1" : "AF377899", | |
"5-2" : "AF377900", | |
"6-2" : "AF377901", | |
"7-2" : "AF377902", | |
"9-2" : "AF377903", | |
"11-2" : "AF377905", | |
"12-2" : "AF377906", | |
"13-2" : "AF377907", | |
"5-3" : "AF377908", | |
"7-3" : "AF377909", | |
"8-3" : "AF377910", | |
"17-3" : "AF377911", | |
"20-3" : "AF377912", | |
"23-4" : "AF377913", | |
"36-4" : "AF377914", | |
"42-4" : "AF377916", | |
"50-4" : "AF377917", | |
"55-4" : "AF377918", | |
"92-4" : "AF377919", | |
"99-4" : "AF377926", | |
"106-4" : "AF377921", | |
"110-4" : "AF377922", | |
"114-4" : "AF377923", | |
"117-4" : "AF377924", | |
"119-4" : "AF377925" | |
} | |
for s in seqs.keys(): | |
print("Processing " + str(seqs[s]), file = sys.stderr) | |
xml = str(seqs[s]) + ".xml" | |
# Check if the xml file has already been downloaded. | |
# If it hasn't, download it with qblast() | |
if not os.path.isfile(xml): | |
print("Downloading...", file = sys.stderr) | |
result_handle = NCBIWWW.qblast("blastn", "nt", str(seqs[s])) | |
print("Done", file = sys.stderr) | |
with open(xml, "w") as out_handle: | |
out_handle.write(result_handle.read()) | |
# Read in the file and grab the blast records | |
result_handle = open(xml) | |
blast_records = NCBIXML.read(result_handle) | |
for alignment in blast_records.alignments: | |
HSP = alignment.hsps[0] | |
# Determine if the alignment comes from Ssc genome. We don't care about | |
# any other sequence | |
if re.search(r'Sclerotinia.+?chromosome', alignment.title) and HSP.score > 100: | |
# grab the title | |
res = s + ";" + str(seqs[s]) + ";" + alignment.title + ";" | |
# grab the positions on chromosome | |
res = res + str(HSP.sbjct_start) + ";" | |
res = res + str(HSP.sbjct_end) | |
# print to stdout | |
print(res) | |
else: | |
next | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Okay, after running this for an ungodly amount of time, I finally got a csv file (yes, I'm storing this in Box... I'm lazy) that worked and, after processing in R (hey, that's where I'm comfortable), I found this: