Created
March 31, 2014 19:02
-
-
Save cathalgarvey/9899686 to your computer and use it in GitHub Desktop.
Suggested Solutions to SeqIO Exercises
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from Bio import SeqIO | |
from Bio.Seq import Seq | |
sequence_generator = SeqIO.parse("br_sequences.fasta", "fasta") | |
all_sequences = list(sequence_generator) | |
# * How many records are in the file? | |
print("Number of records:", len(all_sequences)) | |
# * How many records have a sequence of length 249? | |
sequences_at_249 = [] | |
for item in all_sequences: | |
if len(item.seq) == 249: | |
sequences_at_249.append(item) | |
print("Number with length of 249:", sequences_at_249) | |
# * What is the title for the record with the shortest sequence? | |
shortest = all_sequences[0] | |
for record in all_sequences: | |
if len(record.seq) < len(shortest.seq): | |
shortest = record | |
# * Is there more than one record with that length? | |
allshortest = [] | |
for record in all_sequences: | |
if len(record.seq) == len(shortest.seq): | |
allshortest.append(record) | |
num_shortest = len(allshortest) | |
# * What is the title for the record with the longest sequence? | |
longest = all_sequences[0] | |
for record in all_sequences: | |
if len(record.seq) > len(longest.seq): | |
longest = record | |
# * Is there more than one record with that length? | |
alllongest = [] | |
for record in all_sequences: | |
if len(record.seq) == len(longest.seq): | |
alllongest.append(record) | |
num_longest = len(longest) | |
# * How long is the sequence for the GenBank identifier gi|114812? | |
for record in all_sequence: | |
if "gi|114812" in record.description: # Easiest way for fasta! | |
print("Length of gi|114812", len(record.seq)) | |
# * Which records have 3D structures submitted to the PDB? | |
# (Give me the 4 character PDB identifier.) | |
# FUN! On PDB entries! | |
# | |
# As I suspected in the latter third of the tutorial, dbxrefs is | |
# where you'll find PDB data for any sequences graced with protein | |
# databank references. dbxrefs is not, as I assumed, a dictionary, | |
# but is rather a list of strings! | |
# According to the scant information on the Biopython tutorial: | |
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec79 | |
# ..PDB entries will be of form: ['PDB; 1ifl ; 1-52;'] | |
# I'm assuming the middle portion is the PDB accession number. | |
# | |
# Parsing the accession from this will ask you to know a bit about | |
# the string-methods str.strip(), which strips all characters passed | |
# as arguments from the start and end of strings, and str.split(), | |
# which splits a string by the substring passed as argument. | |
# So, for the example here: ['PDB; 1ifl ; 1-52;'], there is one dbxref, | |
# and you get the accession in the middle ("1ifl") using: | |
# | |
#> ref = whatever.dbxrefs[0] # Get first item | |
#> ref = ref.replace(";", " ") # Those semicolons are annoying! Replace with | |
# # a space character, to make blocks of space | |
# # continuous, which split()/strip() will eat. | |
#> ref = ref.strip() # In case there's leading or trailing whitespace | |
#> ref = ref.split() # Split by blocks of whitespace, the default. | |
# refbits is a list that, in this case, looks like: ['PDB', '1ifl', '1-52'] | |
# The accession is assumed to be the one after "PDB", rather than the "second", | |
# because it's possible there'll be some other crud in front of "PDB". So, find | |
# "PDB" and get whatever comes after it: | |
#> pdbid = refbits[ refbits.index("PDB") ] # index is a list method to find the | |
# # numeric index of something *if it's present*. | |
# | |
# You were only asked to get the pdbids for all entries with PDB entries. | |
# But, if you needed to work further, there is, a PDB module in Biopython just for | |
# getting and working with PDB data: | |
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec156 | |
# So without further ado, a function that, for any given record, will return | |
# any and all PDB accessions in the record, or an empty list (for consistency) | |
# if none are found: | |
def get_pdb_accessions(record): | |
# Remember that empty lists or dictionaries are equivalent to false, | |
# so if record.dbxrefs is empty, the function immediately returns []. | |
if not record.dbxrefs: | |
return [] | |
# Otherwise, it looks at each dbxrefs entry, and if "PDB" is in the | |
# string, it assumes it's a PDB entry and returns the accession. | |
# In reality, this may be a little naive, but let's assume it's OK. | |
else: | |
pdb_xrefs = [] | |
for item in record.dbxref: | |
if "PDB" in item: | |
item = item.replace(";", " ") # Kill all semicolons with fire | |
item = item.strip() # Remove all flanking whitespace | |
bits = item.split() # Split by each block of whitespace | |
accession = bits[ bits.index("PDB") + 1 ] # Get the bit AFTER "PDB" | |
pdb_xrefs.append(accession) | |
# Right, that's all the apparent PDB accessions (could be none), return 'em! | |
return pdb_xrefs | |
# Now, what was the question again? | |
# * Which records have 3D structures submitted to the PDB? | |
# (Give me the 4 character PDB identifier.) | |
for record in all_sequences: | |
pdb_xrefs = get_pdb_accessions(record) | |
if pdb_xrefs: | |
# Remember: empty lists evaluate false, so this only applies to results! | |
print("Found PBD xrefs in record", record.id, "-", pdb_xrefs) | |
# * Some records contain identical sequences. How many unique sequences are present? | |
# * Give the titles for two different records which have identical sequences | |
# (any two will do) | |
# If only the first question were asked, I'd have done something hackish involving the | |
# lesser-known and little-loved datatype "set()" which only stores unique stuff. | |
# Instead, I'm going to suggest something even *more* hackish. | |
# Create a dictionary keyed by the entire sequence of the records (Inefficient!!), | |
# with each value being a list of the records with that sequence! This achieves | |
# both goals, because the dictionaries will be keyed by sequence, making the number | |
# of keys equal to the number of unique sequences, and each key will map to the | |
# records that carry the sequence..so you can quickly select some that have more than | |
# one entry, and report those as duplicates. | |
# The "efficient" way to do this would be to use a concept called "hash functions" to | |
# reduce each sequence to a small, but unique, string for use as a key. You *don't* | |
# need to understand hash functions for this exercise, but they are pretty nifty. :) | |
records_by_seq = {} | |
for record in all_sequences: | |
if record.seq not in records_by_seq: | |
# Using "this in that" on dicts checks the dict *keys* not values. | |
# So we're asking if record.seq is not being used as a dict key already. | |
records_by_seq[record.seq].append(record) | |
else: | |
# Not already in the dict, so add a new list to contain this and any future | |
# records with this sequence. | |
records_by_seq[record.seq] = [ record ] | |
print("Number of unique sequence records:", len(records_by_seq)) | |
print("Some records with same sequence:") | |
for key in records_by_seq: | |
if len(records_by_seq[key]) > 1: # For duplicates only | |
for record in records_by_seq[key]: | |
print("\t", record) # "\t" is a tab character. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment