zkamvar/find_chromosome_region.py

## find_chromosome_region.py
#!/usr/bin/env python3
#
# This script will BLAST a series of GenBank IDs representing SSR sequences from
# Sirjusingh, C., Kohn, L.M., 2001. Characterization of microsatellites in the
# fungal plant pathogen, Sclerotinia sclerotiorum. Molecular Ecology 1, 267e269.
#
# This uses biopython to download the XML, save it to a file in the working
# directory, and then looks for matches that have Sclerotinia chromosome in the
# title and has a score of > 100. If there's a better way to do this, I would
# like to know it. Currently, it can download a few results at a time and then
# it just hangs.

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import os
import re
import sys


# Each of these sequences represents the conventional name and GenBank ID for
# the SSR genotypes defined in Sirjusingh and Kohn, 2001
seqs = {
"41-1" : "AF377899",
"5-2" : "AF377900",
"6-2" : "AF377901",
"7-2" : "AF377902",
"9-2" : "AF377903",
"11-2" : "AF377905",
"12-2" : "AF377906",
"13-2" : "AF377907",
"5-3" : "AF377908",
"7-3" : "AF377909",
"8-3" : "AF377910",
"17-3" : "AF377911",
"20-3" : "AF377912",
"23-4" : "AF377913",
"36-4" : "AF377914",
"42-4" : "AF377916",
"50-4" : "AF377917",
"55-4" : "AF377918",
"92-4" : "AF377919",
"99-4" : "AF377926",
"106-4" : "AF377921",
"110-4" : "AF377922",
"114-4" : "AF377923",
"117-4" : "AF377924",
"119-4" : "AF377925"
}

for s in seqs.keys():
	print("Processing " + str(seqs[s]), file = sys.stderr)
	xml = str(seqs[s]) + ".xml"
	# Check if the xml file has already been downloaded.
	# If it hasn't, download it with qblast()
	if not os.path.isfile(xml):
		print("Downloading...", file = sys.stderr)
		result_handle = NCBIWWW.qblast("blastn", "nt", str(seqs[s]))
		print("Done", file = sys.stderr)
		with open(xml, "w") as out_handle:
				out_handle.write(result_handle.read())
	# Read in the file and grab the blast records
	result_handle = open(xml)
	blast_records = NCBIXML.read(result_handle)
	for alignment in blast_records.alignments:
		HSP = alignment.hsps[0]
		# Determine if the alignment comes from Ssc genome. We don't care about
		# any other sequence
		if re.search(r'Sclerotinia.+?chromosome', alignment.title) and HSP.score > 100:
			# grab the title
			res = s + ";" + str(seqs[s]) + ";" + alignment.title + ";"
			# grab the positions on chromosome
			res = res + str(HSP.sbjct_start) + ";"
			res = res + str(HSP.sbjct_end)
			# print to stdout
			print(res)
		else:
			next
	#!/usr/bin/env python3
	#
	# This script will BLAST a series of GenBank IDs representing SSR sequences from
	# Sirjusingh, C., Kohn, L.M., 2001. Characterization of microsatellites in the
	# fungal plant pathogen, Sclerotinia sclerotiorum. Molecular Ecology 1, 267e269.
	#
	# This uses biopython to download the XML, save it to a file in the working
	# directory, and then looks for matches that have Sclerotinia chromosome in the
	# title and has a score of > 100. If there's a better way to do this, I would
	# like to know it. Currently, it can download a few results at a time and then
	# it just hangs.

	from Bio.Blast import NCBIWWW
	from Bio.Blast import NCBIXML
	import os
	import re
	import sys


	# Each of these sequences represents the conventional name and GenBank ID for
	# the SSR genotypes defined in Sirjusingh and Kohn, 2001
	seqs = {
	"41-1" : "AF377899",
	"5-2" : "AF377900",
	"6-2" : "AF377901",
	"7-2" : "AF377902",
	"9-2" : "AF377903",
	"11-2" : "AF377905",
	"12-2" : "AF377906",
	"13-2" : "AF377907",
	"5-3" : "AF377908",
	"7-3" : "AF377909",
	"8-3" : "AF377910",
	"17-3" : "AF377911",
	"20-3" : "AF377912",
	"23-4" : "AF377913",
	"36-4" : "AF377914",
	"42-4" : "AF377916",
	"50-4" : "AF377917",
	"55-4" : "AF377918",
	"92-4" : "AF377919",
	"99-4" : "AF377926",
	"106-4" : "AF377921",
	"110-4" : "AF377922",
	"114-4" : "AF377923",
	"117-4" : "AF377924",
	"119-4" : "AF377925"
	}

	for s in seqs.keys():
	print("Processing " + str(seqs[s]), file = sys.stderr)
	xml = str(seqs[s]) + ".xml"
	# Check if the xml file has already been downloaded.
	# If it hasn't, download it with qblast()
	if not os.path.isfile(xml):
	print("Downloading...", file = sys.stderr)
	result_handle = NCBIWWW.qblast("blastn", "nt", str(seqs[s]))
	print("Done", file = sys.stderr)
	with open(xml, "w") as out_handle:
	out_handle.write(result_handle.read())
	# Read in the file and grab the blast records
	result_handle = open(xml)
	blast_records = NCBIXML.read(result_handle)
	for alignment in blast_records.alignments:
	HSP = alignment.hsps[0]
	# Determine if the alignment comes from Ssc genome. We don't care about
	# any other sequence
	if re.search(r'Sclerotinia.+?chromosome', alignment.title) and HSP.score > 100:
	# grab the title
	res = s + ";" + str(seqs[s]) + ";" + alignment.title + ";"
	# grab the positions on chromosome
	res = res + str(HSP.sbjct_start) + ";"
	res = res + str(HSP.sbjct_end)
	# print to stdout
	print(res)
	else:
	next