Skip to content

Instantly share code, notes, and snippets.

@clarle
Created July 13, 2012 03:34
Show Gist options
  • Save clarle/3102541 to your computer and use it in GitHub Desktop.
Save clarle/3102541 to your computer and use it in GitHub Desktop.
genet - Python screen scraper for the Cystic Fibrosis Mutation Database
"""
genet - Python screen scraper for the Cystic Fibrosis Mutation Database
Installation
-------------
$ pip install requests pyquery
$ python
> import genet
License
-------
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2012 Clarence Leung
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.
"""
import re
import requests
from pyquery import PyQuery as pq
def set_payload(options):
    """
    Set and return the form data, in dictionary form, for the HTTP request.

    Keyword arguments:
    options -- Form options to mix-in or override the request

    Returns a new dict each call; neither the defaults nor `options`
    are mutated.
    """
    # How I managed to figure this out: magic
    default_payload = {
        'formids': 'searchCriteria,cdnaname_ad,proteinname_ad,mutationName_ad,ncchange_ad,consequence_ad,original_node_ad,phenotype_ad,exonChooser,For_0,intronChooser,For_29,mutationChooser,For_59,institute_ad,contributor_ad,fromYearChooser,toYearChooser',
        'seedids': 'ZH4sIAAAAAAAAAFvzloG1vI6hRqc4tagsMzlVxUCnIDEdRCXn5xbk56XmlYDZeSWJmXmpRUB2cWpxcWZ+HohVACSc8otSwOLBGak5OWCBlEogFQA0xSczLxvIdMxKrAguSSwpLQZyglLzgOqhitzyi3JVDADDTn+1hAAAAA==',
        'submitmode': '',
        'submitname': '',
        'For_0': ['VSExon 1', 'VSExon 2', 'VSExon 3', 'VSExon 4', 'VSExon 5', 'VSExon 6', 'VSExon 7', 'VSExon 8', 'VSExon 9', 'VSExon 10', 'VSExon 11', 'VSExon 12', 'VSExon 13', 'VSExon 14', 'VSExon 15', 'VSExon 16', 'VSExon 17', 'VSExon 18', 'VSExon 19', 'VSExon 20', 'VSExon 21', 'VSExon 22', 'VSExon 23', 'VSExon 24', 'VSExon 25', 'VSExon 26', 'VSExon 27'],
        'For_29': ['VSPromoter', 'VSIntron 1', 'VSIntron 2', 'VSIntron 3', 'VSIntron 4', 'VSIntron 5', 'VSIntron 6', 'VSIntron 7', 'VSIntron 8', 'VSIntron 9', 'VSIntron 10', 'VSIntron 11', 'VSIntron 12', 'VSIntron 13', 'VSIntron 14', 'VSIntron 15', 'VSIntron 16', 'VSIntron 17', 'VSIntron 18', 'VSIntron 19', 'VSIntron 20', 'VSIntron 21', 'VSIntron 22', 'VSIntron 23', 'VSIntron 24', 'VSIntron 25', 'VSIntron 26', 'VS3\'UTR'],
        'For_59': ['VSSplicing', 'VSIn frame', 'VSFrame Shift', 'VSSequence Variation', 'VSLarge In/del', 'VSMissense', 'VSNonsense', 'VSPromoter'],
        'searchCriteria': '1',
        'cdnaname_ad': '',
        'proteinname_ad': '',
        'mutationName_ad': '',
        'ncchange_ad': '',
        'consequence_ad': '',
        'original_node_ad': '',
        'phenotype_ad': '',
        'institute_ad': '',
        'contributor_ad': '',
        'fromYearChooser': 0,
        'toYearChooser': 0
    }
    # copy + update works on both Python 2 and 3; the original
    # `dict(a.items() + b.items())` breaks on Python 3, where dict views
    # do not support `+`
    payload = dict(default_payload)
    payload.update(options)
    return payload
def genet_request(options):
    """
    Send the HTTP request to the Genet site and return the HTML data.

    Keyword arguments:
    options -- Form options to mix-in or override the request
    """
    # Merge the caller's options over the default form payload
    form_data = set_payload(options)
    # POST the advanced-search form and hand back the raw response
    search_url = 'http://www.genet.sickkids.on.ca/AdvancedSearchPage,$Form.direct'
    return requests.post(search_url, data=form_data)
def extract_aa_range(aa_data):
    """
    Given an amino acid range in slash-separated form, return the
    corresponding array of integers.

    Keyword arguments:
    aa_data -- String of amino acid ranges, e.g. "139/140-194/195"

    Tokens that are neither a "low-high" span nor a bare number are
    silently skipped.
    """
    values = []
    for token in aa_data.split('/'):
        if '-' in token:
            # "low-high" spans are expanded inclusively on both ends
            bounds = token.split('-')
            values.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        elif token.isdigit():
            # A lone number contributes just itself
            values.append(int(token))
    return values
def extract_aa_values(loop_string):
    """
    Given a loop in a format similar to:
    `Cyto loop1 (56 aa) 139/140-194/195`
    Extract the target range of amino acid consequences wanted.

    Keyword arguments:
    loop_string -- Formatted string representing a loop
    """
    # The amino acid spec is always the last whitespace-separated field;
    # hand it straight to the range expander
    return extract_aa_range(loop_string.split()[-1])
def search_by_consequence(amino_acids):
    """
    Return an array of HTML pages to be parsed, one per amino acid ID.

    Keyword arguments:
    amino_acids -- Array of amino acid IDs to search, or a single ID
    """
    # Normalize a single ID into a list so both cases are handled uniformly
    if isinstance(amino_acids, int):
        search_targets = [amino_acids]
    else:
        search_targets = list(amino_acids)
    # Start searching them all!
    results = []
    for target in search_targets:
        # genet_request() already mixes the options into the default payload,
        # so pass the raw option dict — previously set_payload() was applied
        # twice (here and again inside genet_request), which was redundant
        response = genet_request({'consequence_ad': target})
        results.append(response.text)
    # Return an array of HTML pages
    return results
def extract_row_data(data_row, aa_id):
    """
    Extract the data from one HTML row and return a parsed cDNA dict.

    Keyword arguments:
    data_row -- HTML fragment representing a single row of cDNA information
    aa_id -- Amino acid ID that the row's protein name must match

    Returns the cDNA dict when the first number in the protein name equals
    aa_id, otherwise None.
    """
    cdna_result = {}
    # PyQuery selector
    select = pq(data_row)
    # Get the name first; html() returns None when the first cell has no
    # anchor, which previously crashed on .strip()
    cdna_name = select('td a').html()
    cdna_result["cdna_name"] = cdna_name.strip() if cdna_name else ''
    # Get the rest of the data
    data_columns = select('td')[1:]
    # Headers for the rest of the data
    data_headers = ["protein_name", "legacy_name", "region",
                    "description", "consequence"]
    # Pair headers with columns (zip also tolerates short rows, where the
    # old index loop raised IndexError); range/zip work on Python 2 and 3,
    # unlike the original xrange
    for header, column in zip(data_headers, data_columns):
        cdna_result[header] = (column.text or '').strip()
    # Pull the first run of digits out of the protein name; the original
    # pattern ([\d]*\d+) matches exactly the same strings as \d+
    aa_check = re.findall(r"\d+", cdna_result.get("protein_name", ""))
    if aa_check and int(aa_check[0]) == aa_id:
        return cdna_result
    return None
def parse_result_data(html_results, amino_acids):
    """
    Return an array of parsed cDNA object data.

    Keyword arguments:
    html_results -- An array of HTML pages, or a single HTML page
    amino_acids -- Amino acid IDs paired positionally with the pages,
                   or a single ID

    Pages and IDs are matched up by position, so the two inputs should
    have the same length.
    """
    # Normalize single-item inputs into lists
    if isinstance(html_results, basestring):  # If we only have one HTML page
        html_pages = [html_results]
    else:
        html_pages = list(html_results)
    if isinstance(amino_acids, int):
        amino_ids = [amino_acids]
    else:
        amino_ids = list(amino_acids)
    results = []
    for page, amino_id in zip(html_pages, amino_ids):
        # Set our PyQuery selector
        select = pq(page)
        # Select our data rows but ignore the headers
        data_rows = select("#Any").find('tr')[1:]
        # Extract data from each row
        for row in data_rows:
            row_data = extract_row_data(row, amino_id)
            # extract_row_data returns None for rows whose protein name
            # doesn't match the ID; previously those Nones were appended
            # to the results, polluting the output
            if row_data is not None:
                results.append(row_data)
    return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment