Created
July 13, 2012 03:34
-
-
Save clarle/3102541 to your computer and use it in GitHub Desktop.
genet - Python screen scraper for the Cystic Fibrosis Mutation Database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
genet - Python screen scraper for the Cystic Fibrosis Mutation Database | |
Installation | |
------------- | |
$ pip install requests pyquery | |
$ python | |
> import genet | |
License | |
------- | |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
Version 2, December 2004 | |
Copyright (C) 2012 Clarence Leung | |
Everyone is permitted to copy and distribute verbatim or modified | |
copies of this license document, and changing it is allowed as long | |
as the name is changed. | |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | |
0. You just DO WHAT THE FUCK YOU WANT TO. | |
""" | |
import re

import requests
from pyquery import PyQuery as pq
def set_payload(options):
    """
    Build and return the form data for the HTTP request as a dictionary.

    Keyword arguments:
    options -- Form options to mix in or override in the default payload
    """
    # How I managed to figure this out: magic
    default_payload = {
        'formids': 'searchCriteria,cdnaname_ad,proteinname_ad,mutationName_ad,ncchange_ad,consequence_ad,original_node_ad,phenotype_ad,exonChooser,For_0,intronChooser,For_29,mutationChooser,For_59,institute_ad,contributor_ad,fromYearChooser,toYearChooser',
        'seedids': 'ZH4sIAAAAAAAAAFvzloG1vI6hRqc4tagsMzlVxUCnIDEdRCXn5xbk56XmlYDZeSWJmXmpRUB2cWpxcWZ+HohVACSc8otSwOLBGak5OWCBlEogFQA0xSczLxvIdMxKrAguSSwpLQZyglLzgOqhitzyi3JVDADDTn+1hAAAAA==',
        'submitmode': '',
        'submitname': '',
        'For_0': ['VSExon 1', 'VSExon 2', 'VSExon 3', 'VSExon 4', 'VSExon 5', 'VSExon 6', 'VSExon 7', 'VSExon 8', 'VSExon 9', 'VSExon 10', 'VSExon 11', 'VSExon 12', 'VSExon 13', 'VSExon 14', 'VSExon 15', 'VSExon 16', 'VSExon 17', 'VSExon 18', 'VSExon 19', 'VSExon 20', 'VSExon 21', 'VSExon 22', 'VSExon 23', 'VSExon 24', 'VSExon 25', 'VSExon 26', 'VSExon 27'],
        'For_29': ['VSPromoter', 'VSIntron 1', 'VSIntron 2', 'VSIntron 3', 'VSIntron 4', 'VSIntron 5', 'VSIntron 6', 'VSIntron 7', 'VSIntron 8', 'VSIntron 9', 'VSIntron 10', 'VSIntron 11', 'VSIntron 12', 'VSIntron 13', 'VSIntron 14', 'VSIntron 15', 'VSIntron 16', 'VSIntron 17', 'VSIntron 18', 'VSIntron 19', 'VSIntron 20', 'VSIntron 21', 'VSIntron 22', 'VSIntron 23', 'VSIntron 24', 'VSIntron 25', 'VSIntron 26', 'VS3\'UTR'],
        'For_59': ['VSSplicing', 'VSIn frame', 'VSFrame Shift', 'VSSequence Variation', 'VSLarge In/del', 'VSMissense', 'VSNonsense', 'VSPromoter'],
        'searchCriteria': '1',
        'cdnaname_ad': '',
        'proteinname_ad': '',
        'mutationName_ad': '',
        'ncchange_ad': '',
        'consequence_ad': '',
        'original_node_ad': '',
        'phenotype_ad': '',
        'institute_ad': '',
        'contributor_ad': '',
        'fromYearChooser': 0,
        'toYearChooser': 0
    }
    # Bug fix: dict(a.items() + b.items()) only works on Python 2, where
    # items() returns lists.  Copy-then-update works on Python 2 and 3
    # alike, with the same override semantics (options win over defaults).
    payload = dict(default_payload)
    payload.update(options)
    return payload
def genet_request(options):
    """
    Send the search form to the Genet site and return the HTTP response.

    Keyword arguments:
    options -- Form options to mix in or override in the default payload
    """
    # Merge the caller's options into the default form payload
    form_data = set_payload(options)
    # POST the assembled form to the advanced search endpoint
    url = 'http://www.genet.sickkids.on.ca/AdvancedSearchPage,$Form.direct'
    return requests.post(url, data=form_data)
def extract_aa_range(aa_data):
    """
    Convert a slash-separated amino acid specification into a list of
    integer positions.

    Keyword arguments:
    aa_data -- String of amino acid ranges, e.g. "139/140-194/195"
    """
    positions = []
    for token in aa_data.split('/'):
        if '-' in token:
            # "lo-hi" denotes an inclusive span of positions
            bounds = token.split('-')
            positions.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        elif token.isdigit():
            # A lone number contributes a single position
            positions.append(int(token))
    # Tokens that are neither ranges nor digits are silently skipped
    return positions
def extract_aa_values(loop_string):
    """
    Parse a loop description such as:
        `Cyto loop1 (56 aa) 139/140-194/195`
    and return the list of amino acid positions it covers.

    Keyword arguments:
    loop_string -- Formatted string representing a loop
    """
    # The amino acid specification is the final whitespace-separated field
    aa_spec = loop_string.split()[-1]
    # Expand the slash/dash notation into explicit integer positions
    return extract_aa_range(aa_spec)
def search_by_consequence(amino_acids):
    """
    Run one Genet search per amino acid ID and return the HTML pages.

    Keyword arguments:
    amino_acids -- Array of amino acid IDs to search, or a single ID

    Returns a list of HTML page bodies, one per searched ID.
    """
    # Normalize a single ID to a one-element list
    if isinstance(amino_acids, int):
        search_targets = [amino_acids]
    else:
        search_targets = list(amino_acids)
    results = []
    # Start searching them all!
    for target in search_targets:
        # Bug fix: the original called set_payload() here and then passed
        # the fully-merged payload to genet_request(), which runs
        # set_payload() on it a second time.  Passing only the override
        # avoids the redundant double merge; the resulting form data is
        # identical.
        response = genet_request({'consequence_ad': target})
        results.append(response.text)
    # Return an array of HTML pages
    return results
def extract_row_data(data_row, aa_id):
    """
    Extract the data from one HTML table row into a parsed cDNA dict.

    Keyword arguments:
    data_row -- HTML fragment representing a single row of cDNA information
    aa_id -- Amino acid position the row's protein name must match

    Returns the cDNA dict when the first number embedded in the protein
    name equals aa_id, otherwise None.
    """
    cdna_result = {}
    # PyQuery selector
    select = pq(data_row)
    # First cell holds the cDNA name inside a link
    cdna_name = select('td a').html()
    cdna_result["cdna_name"] = cdna_name.strip()
    # Remaining cells, in fixed column order
    data_columns = select('td')[1:]
    data_headers = ["protein_name", "legacy_name", "region",
                    "description", "consequence"]
    # Bug fix: range() instead of Python-2-only xrange(); works on both
    for i in range(5):
        # lxml's .text is None for empty cells -- treat those as ''
        # instead of raising AttributeError on .strip()
        data = (data_columns[i].text or '').strip()
        cdna_result[data_headers[i]] = data
    # Pull the first run of digits out of the protein name
    aa_check = re.findall(r"([\d]*\d+)", cdna_result["protein_name"])
    if aa_check and int(aa_check[0]) == aa_id:
        return cdna_result
    # Implicitly returns None for rows that do not match aa_id
def parse_result_data(html_results, amino_acids):
    """
    Parse HTML result pages into a list of cDNA dicts.

    Keyword arguments:
    html_results -- An array of HTML pages, or a single HTML page
    amino_acids -- Matching array of amino acid IDs, or a single ID
    """
    # Bug fix: `basestring` only exists on Python 2; fall back to `str`
    # so the isinstance check also works on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    # Normalize single values to one-element lists
    if isinstance(html_results, string_types):
        html_pages = [html_results]
    else:
        html_pages = list(html_results)
    if isinstance(amino_acids, int):
        amino_ids = [amino_acids]
    else:
        amino_ids = list(amino_acids)
    results = []
    # Pair each page with its amino acid ID (replaces the range(len(...))
    # index loop; also avoids an IndexError if the lists ever mismatch)
    for page, amino_id in zip(html_pages, amino_ids):
        # Set our PyQuery selector
        select = pq(page)
        # Select our data rows but ignore the header row
        data_rows = select("#Any").find('tr')[1:]
        for row in data_rows:
            row_data = extract_row_data(row, amino_id)
            # Bug fix: extract_row_data() returns None for rows whose
            # amino acid position does not match -- skip those instead of
            # polluting the results with None entries.
            if row_data is not None:
                results.append(row_data)
    return results
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment