cathalgarvey/seqio_answers.py

## seqio_answers.py
from Bio import SeqIO
from Bio.Seq import Seq

sequence_generator = SeqIO.parse("br_sequences.fasta", "fasta")
all_sequences = list(sequence_generator)

# * How many records are in the file?
print("Number of records:", len(all_sequences))

# * How many records have a sequence of length 249?
sequences_at_249 = []
for item in all_sequences:
    if len(item.seq) == 249:
        sequences_at_249.append(item)
print("Number with length of 249:", sequences_at_249)

# * What is the title for the record with the shortest sequence?
shortest = all_sequences[0]
for record in all_sequences:
    if len(record.seq) < len(shortest.seq):
        shortest = record
# * Is there more than one record with that length?
allshortest = []
for record in all_sequences:
    if len(record.seq) == len(shortest.seq):
        allshortest.append(record)
num_shortest = len(allshortest)

# * What is the title for the record with the longest sequence?
longest = all_sequences[0]
for record in all_sequences:
    if len(record.seq) > len(longest.seq):
        longest = record
# * Is there more than one record with that length?
alllongest = []
for record in all_sequences:
    if len(record.seq) == len(longest.seq):
        alllongest.append(record)
num_longest = len(longest)

# * How long is the sequence for the GenBank identifier gi|114812?
for record in all_sequence:
    if "gi|114812" in record.description: # Easiest way for fasta!
        print("Length of gi|114812", len(record.seq))

# * Which records have 3D structures submitted to the PDB?
#  (Give me the 4 character PDB identifier.)
# FUN! On PDB entries!
#
# As I suspected in the latter third of the tutorial, dbxrefs is
# where you'll find PDB data for any sequences graced with protein
# databank references. dbxrefs is not, as I assumed, a dictionary,
# but is rather a list of strings!
# According to the scant information on the Biopython tutorial:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec79
# ..PDB entries will be of form: ['PDB; 1ifl ; 1-52;']
# I'm assuming the middle portion is the PDB accession number.
#
# Parsing the accession from this will ask you to know a bit about
# the string-methods str.strip(), which strips all characters passed
# as arguments from the start and end of strings, and str.split(),
# which splits a string by the substring passed as argument.
# So, for the example here: ['PDB; 1ifl ; 1-52;'], there is one dbxref,
# and you get the accession in the middle ("1ifl") using:
#
#> ref = whatever.dbxrefs[0]   # Get first item
#> ref = ref.replace(";", " ") # Those semicolons are annoying! Replace with
#                              # a space character, to make blocks of space
#                              # continuous, which split()/strip() will eat.
#> ref = ref.strip()           # In case there's leading or trailing whitespace
#> ref = ref.split()           # Split by blocks of whitespace, the default.
# refbits is a list that, in this case, looks like: ['PDB', '1ifl', '1-52']
# The accession is assumed to be the one after "PDB", rather than the "second",
# because it's possible there'll be some other crud in front of "PDB". So, find
# "PDB" and get whatever comes after it:
#> pdbid = refbits[ refbits.index("PDB") ] # index is a list method to find the
#                                          # numeric index of something *if it's present*.
#
# You were only asked to get the pdbids for all entries with PDB entries.
# But, if you needed to work further, there is, a PDB module in Biopython just for
# getting and working with PDB data:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec156
# So without further ado, a function that, for any given record, will return
# any and all PDB accessions in the record, or an empty list (for consistency)
# if none are found:
def get_pdb_accessions(record):
    # Remember that empty lists or dictionaries are equivalent to false,
    # so if record.dbxrefs is empty, the function immediately returns [].
    if not record.dbxrefs:
        return []
    # Otherwise, it looks at each dbxrefs entry, and if "PDB" is in the
    # string, it assumes it's a PDB entry and returns the accession.
    # In reality, this may be a little naive, but let's assume it's OK.
    else:
        pdb_xrefs = []
        for item in record.dbxref:
            if "PDB" in item:
                item = item.replace(";", " ") # Kill all semicolons with fire
                item = item.strip()           # Remove all flanking whitespace
                bits = item.split()           # Split by each block of whitespace
                accession = bits[ bits.index("PDB") + 1 ] # Get the bit AFTER "PDB"
                pdb_xrefs.append(accession)
        # Right, that's all the apparent PDB accessions (could be none), return 'em!
        return pdb_xrefs

# Now, what was the question again?
# * Which records have 3D structures submitted to the PDB?
#  (Give me the 4 character PDB identifier.)
for record in all_sequences:
    pdb_xrefs = get_pdb_accessions(record)
    if pdb_xrefs:
        # Remember: empty lists evaluate false, so this only applies to results!
        print("Found PBD xrefs in record", record.id, "-", pdb_xrefs)


# * Some records contain identical sequences. How many unique sequences are present?
# * Give the titles for two different records which have identical sequences
#   (any two will do)
# If only the first question were asked, I'd have done something hackish involving the
#  lesser-known and little-loved datatype "set()" which only stores unique stuff.
# Instead, I'm going to suggest something even *more* hackish.
# Create a dictionary keyed by the entire sequence of the records (Inefficient!!),
#  with each value being a list of the records with that sequence! This achieves
#  both goals, because the dictionaries will be keyed by sequence, making the number
#  of keys equal to the number of unique sequences, and each key will map to the
#  records that carry the sequence..so you can quickly select some that have more than
#  one entry, and report those as duplicates.
# The "efficient" way to do this would be to use a concept called "hash functions" to
#  reduce each sequence to a small, but unique, string for use as a key. You *don't*
#  need to understand hash functions for this exercise, but they are pretty nifty. :)
records_by_seq = {}
for record in all_sequences:
    if record.seq not in records_by_seq:
        # Using "this in that" on dicts checks the dict *keys* not values.
        # So we're asking if record.seq is not being used as a dict key already.
        records_by_seq[record.seq].append(record)
    else:
        # Not already in the dict, so add a new list to contain this and any future
        # records with this sequence.
        records_by_seq[record.seq] = [ record ]
print("Number of unique sequence records:", len(records_by_seq))
print("Some records with same sequence:")
for key in records_by_seq:
    if len(records_by_seq[key]) > 1: # For duplicates only
        for record in records_by_seq[key]:
            print("\t", record)      # "\t" is a tab character.
	from Bio import SeqIO
	from Bio.Seq import Seq

	sequence_generator = SeqIO.parse("br_sequences.fasta", "fasta")
	all_sequences = list(sequence_generator)

	# * How many records are in the file?
	print("Number of records:", len(all_sequences))

	# * How many records have a sequence of length 249?
	sequences_at_249 = []
	for item in all_sequences:
	if len(item.seq) == 249:
	sequences_at_249.append(item)
	print("Number with length of 249:", sequences_at_249)

	# * What is the title for the record with the shortest sequence?
	shortest = all_sequences[0]
	for record in all_sequences:
	if len(record.seq) < len(shortest.seq):
	shortest = record
	# * Is there more than one record with that length?
	allshortest = []
	for record in all_sequences:
	if len(record.seq) == len(shortest.seq):
	allshortest.append(record)
	num_shortest = len(allshortest)

	# * What is the title for the record with the longest sequence?
	longest = all_sequences[0]
	for record in all_sequences:
	if len(record.seq) > len(longest.seq):
	longest = record
	# * Is there more than one record with that length?
	alllongest = []
	for record in all_sequences:
	if len(record.seq) == len(longest.seq):
	alllongest.append(record)
	num_longest = len(longest)

	# * How long is the sequence for the GenBank identifier gi\|114812?
	for record in all_sequence:
	if "gi\|114812" in record.description: # Easiest way for fasta!
	print("Length of gi\|114812", len(record.seq))

	# * Which records have 3D structures submitted to the PDB?
	# (Give me the 4 character PDB identifier.)
	# FUN! On PDB entries!
	#
	# As I suspected in the latter third of the tutorial, dbxrefs is
	# where you'll find PDB data for any sequences graced with protein
	# databank references. dbxrefs is not, as I assumed, a dictionary,
	# but is rather a list of strings!
	# According to the scant information on the Biopython tutorial:
	# http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec79
	# ..PDB entries will be of form: ['PDB; 1ifl ; 1-52;']
	# I'm assuming the middle portion is the PDB accession number.
	#
	# Parsing the accession from this will ask you to know a bit about
	# the string-methods str.strip(), which strips all characters passed
	# as arguments from the start and end of strings, and str.split(),
	# which splits a string by the substring passed as argument.
	# So, for the example here: ['PDB; 1ifl ; 1-52;'], there is one dbxref,
	# and you get the accession in the middle ("1ifl") using:
	#
	#> ref = whatever.dbxrefs[0] # Get first item
	#> ref = ref.replace(";", " ") # Those semicolons are annoying! Replace with
	# # a space character, to make blocks of space
	# # continuous, which split()/strip() will eat.
	#> ref = ref.strip() # In case there's leading or trailing whitespace
	#> ref = ref.split() # Split by blocks of whitespace, the default.
	# refbits is a list that, in this case, looks like: ['PDB', '1ifl', '1-52']
	# The accession is assumed to be the one after "PDB", rather than the "second",
	# because it's possible there'll be some other crud in front of "PDB". So, find
	# "PDB" and get whatever comes after it:
	#> pdbid = refbits[ refbits.index("PDB") ] # index is a list method to find the
	# # numeric index of something if it's present.
	#
	# You were only asked to get the pdbids for all entries with PDB entries.
	# But, if you needed to work further, there is, a PDB module in Biopython just for
	# getting and working with PDB data:
	# http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec156
	# So without further ado, a function that, for any given record, will return
	# any and all PDB accessions in the record, or an empty list (for consistency)
	# if none are found:
	def get_pdb_accessions(record):
	# Remember that empty lists or dictionaries are equivalent to false,
	# so if record.dbxrefs is empty, the function immediately returns [].
	if not record.dbxrefs:
	return []
	# Otherwise, it looks at each dbxrefs entry, and if "PDB" is in the
	# string, it assumes it's a PDB entry and returns the accession.
	# In reality, this may be a little naive, but let's assume it's OK.
	else:
	pdb_xrefs = []
	for item in record.dbxref:
	if "PDB" in item:
	item = item.replace(";", " ") # Kill all semicolons with fire
	item = item.strip() # Remove all flanking whitespace
	bits = item.split() # Split by each block of whitespace
	accession = bits[ bits.index("PDB") + 1 ] # Get the bit AFTER "PDB"
	pdb_xrefs.append(accession)
	# Right, that's all the apparent PDB accessions (could be none), return 'em!
	return pdb_xrefs

	# Now, what was the question again?
	# * Which records have 3D structures submitted to the PDB?
	# (Give me the 4 character PDB identifier.)
	for record in all_sequences:
	pdb_xrefs = get_pdb_accessions(record)
	if pdb_xrefs:
	# Remember: empty lists evaluate false, so this only applies to results!
	print("Found PBD xrefs in record", record.id, "-", pdb_xrefs)


	# * Some records contain identical sequences. How many unique sequences are present?
	# * Give the titles for two different records which have identical sequences
	# (any two will do)
	# If only the first question were asked, I'd have done something hackish involving the
	# lesser-known and little-loved datatype "set()" which only stores unique stuff.
	# Instead, I'm going to suggest something even more hackish.
	# Create a dictionary keyed by the entire sequence of the records (Inefficient!!),
	# with each value being a list of the records with that sequence! This achieves
	# both goals, because the dictionaries will be keyed by sequence, making the number
	# of keys equal to the number of unique sequences, and each key will map to the
	# records that carry the sequence..so you can quickly select some that have more than
	# one entry, and report those as duplicates.
	# The "efficient" way to do this would be to use a concept called "hash functions" to
	# reduce each sequence to a small, but unique, string for use as a key. You don't
	# need to understand hash functions for this exercise, but they are pretty nifty. :)
	records_by_seq = {}
	for record in all_sequences:
	if record.seq not in records_by_seq:
	# Using "this in that" on dicts checks the dict keys not values.
	# So we're asking if record.seq is not being used as a dict key already.
	records_by_seq[record.seq].append(record)
	else:
	# Not already in the dict, so add a new list to contain this and any future
	# records with this sequence.
	records_by_seq[record.seq] = [ record ]
	print("Number of unique sequence records:", len(records_by_seq))
	print("Some records with same sequence:")
	for key in records_by_seq:
	if len(records_by_seq[key]) > 1: # For duplicates only
	for record in records_by_seq[key]:
	print("\t", record) # "\t" is a tab character.