Created
July 13, 2012 03:34
-
-
Save clarle/3102541 to your computer and use it in GitHub Desktop.
genet - Python screen scraper for the Cystic Fibrosis Mutation Database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
genet - Python screen scraper for the Cystic Fibrosis Mutation Database | |
Installation | |
------------- | |
$ pip install requests pyquery | |
$ python | |
> import genet | |
License | |
------- | |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
Version 2, December 2004 | |
Copyright (C) 2012 Clarence Leung | |
Everyone is permitted to copy and distribute verbatim or modified | |
copies of this license document, and changing it is allowed as long | |
as the name is changed. | |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | |
0. You just DO WHAT THE FUCK YOU WANT TO. | |
""" | |
import re

import requests
from pyquery import PyQuery as pq
def set_payload(options):
    """
    Build and return the form data for the HTTP request as a dictionary.

    Keyword arguments:
    options -- Form options to mix in or override in the default payload
    """
    # How I managed to figure this out: magic
    default_payload = {
        'formids': 'searchCriteria,cdnaname_ad,proteinname_ad,mutationName_ad,ncchange_ad,consequence_ad,original_node_ad,phenotype_ad,exonChooser,For_0,intronChooser,For_29,mutationChooser,For_59,institute_ad,contributor_ad,fromYearChooser,toYearChooser',
        'seedids': 'ZH4sIAAAAAAAAAFvzloG1vI6hRqc4tagsMzlVxUCnIDEdRCXn5xbk56XmlYDZeSWJmXmpRUB2cWpxcWZ+HohVACSc8otSwOLBGak5OWCBlEogFQA0xSczLxvIdMxKrAguSSwpLQZyglLzgOqhitzyi3JVDADDTn+1hAAAAA==',
        'submitmode': '',
        'submitname': '',
        'For_0': ['VSExon 1', 'VSExon 2', 'VSExon 3', 'VSExon 4', 'VSExon 5', 'VSExon 6', 'VSExon 7', 'VSExon 8', 'VSExon 9', 'VSExon 10', 'VSExon 11', 'VSExon 12', 'VSExon 13', 'VSExon 14', 'VSExon 15', 'VSExon 16', 'VSExon 17', 'VSExon 18', 'VSExon 19', 'VSExon 20', 'VSExon 21', 'VSExon 22', 'VSExon 23', 'VSExon 24', 'VSExon 25', 'VSExon 26', 'VSExon 27'],
        'For_29': ['VSPromoter', 'VSIntron 1', 'VSIntron 2', 'VSIntron 3', 'VSIntron 4', 'VSIntron 5', 'VSIntron 6', 'VSIntron 7', 'VSIntron 8', 'VSIntron 9', 'VSIntron 10', 'VSIntron 11', 'VSIntron 12', 'VSIntron 13', 'VSIntron 14', 'VSIntron 15', 'VSIntron 16', 'VSIntron 17', 'VSIntron 18', 'VSIntron 19', 'VSIntron 20', 'VSIntron 21', 'VSIntron 22', 'VSIntron 23', 'VSIntron 24', 'VSIntron 25', 'VSIntron 26', 'VS3\'UTR'],
        'For_59': ['VSSplicing', 'VSIn frame', 'VSFrame Shift', 'VSSequence Variation', 'VSLarge In/del', 'VSMissense', 'VSNonsense', 'VSPromoter'],
        'searchCriteria': '1',
        'cdnaname_ad': '',
        'proteinname_ad': '',
        'mutationName_ad': '',
        'ncchange_ad': '',
        'consequence_ad': '',
        'original_node_ad': '',
        'phenotype_ad': '',
        'institute_ad': '',
        'contributor_ad': '',
        'fromYearChooser': 0,
        'toYearChooser': 0
    }
    # Bug fix: dict(a.items() + b.items()) only works on Python 2, where
    # items() returns lists.  Copy-then-update works on Python 2 and 3
    # alike, with the same override semantics (options win over defaults).
    payload = dict(default_payload)
    payload.update(options)
    return payload
def genet_request(options):
    """
    Send the search form to the Genet site and return the HTTP response.

    Keyword arguments:
    options -- Form options to mix in or override in the default payload
    """
    # Merge the caller's options into the default form payload
    form_data = set_payload(options)
    # POST the assembled form to the advanced search endpoint
    url = 'http://www.genet.sickkids.on.ca/AdvancedSearchPage,$Form.direct'
    return requests.post(url, data=form_data)
def extract_aa_range(aa_data):
    """
    Convert a slash-separated amino acid specification into a list of
    integer positions.

    Keyword arguments:
    aa_data -- String of amino acid ranges, e.g. "139/140-194/195"
    """
    positions = []
    for token in aa_data.split('/'):
        if '-' in token:
            # "lo-hi" denotes an inclusive span of positions
            bounds = token.split('-')
            positions.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        elif token.isdigit():
            # A lone number contributes a single position
            positions.append(int(token))
    # Tokens that are neither ranges nor digits are silently skipped
    return positions
def extract_aa_values(loop_string):
    """
    Parse a loop description such as:
        `Cyto loop1 (56 aa) 139/140-194/195`
    and return the list of amino acid positions it covers.

    Keyword arguments:
    loop_string -- Formatted string representing a loop
    """
    # The amino acid specification is the final whitespace-separated field
    aa_spec = loop_string.split()[-1]
    # Expand the slash/dash notation into explicit integer positions
    return extract_aa_range(aa_spec)
def search_by_consequence(amino_acids):
    """
    Run one Genet search per amino acid ID and return the HTML pages.

    Keyword arguments:
    amino_acids -- Array of amino acid IDs to search, or a single ID

    Returns a list of HTML page bodies, one per searched ID.
    """
    # Normalize a single ID to a one-element list
    if isinstance(amino_acids, int):
        search_targets = [amino_acids]
    else:
        search_targets = list(amino_acids)
    results = []
    # Start searching them all!
    for target in search_targets:
        # Bug fix: the original called set_payload() here and then passed
        # the fully-merged payload to genet_request(), which runs
        # set_payload() on it a second time.  Passing only the override
        # avoids the redundant double merge; the resulting form data is
        # identical.
        response = genet_request({'consequence_ad': target})
        results.append(response.text)
    # Return an array of HTML pages
    return results
def extract_row_data(data_row, aa_id):
    """
    Extract the data from one HTML table row into a parsed cDNA dict.

    Keyword arguments:
    data_row -- HTML fragment representing a single row of cDNA information
    aa_id -- Amino acid position the row's protein name must match

    Returns the cDNA dict when the first number embedded in the protein
    name equals aa_id, otherwise None.
    """
    cdna_result = {}
    # PyQuery selector
    select = pq(data_row)
    # First cell holds the cDNA name inside a link
    cdna_name = select('td a').html()
    cdna_result["cdna_name"] = cdna_name.strip()
    # Remaining cells, in fixed column order
    data_columns = select('td')[1:]
    data_headers = ["protein_name", "legacy_name", "region",
                    "description", "consequence"]
    # Bug fix: range() instead of Python-2-only xrange(); works on both
    for i in range(5):
        # lxml's .text is None for empty cells -- treat those as ''
        # instead of raising AttributeError on .strip()
        data = (data_columns[i].text or '').strip()
        cdna_result[data_headers[i]] = data
    # Pull the first run of digits out of the protein name
    aa_check = re.findall(r"([\d]*\d+)", cdna_result["protein_name"])
    if aa_check and int(aa_check[0]) == aa_id:
        return cdna_result
    # Implicitly returns None for rows that do not match aa_id
def parse_result_data(html_results, amino_acids):
    """
    Parse HTML result pages into a list of cDNA dicts.

    Keyword arguments:
    html_results -- An array of HTML pages, or a single HTML page
    amino_acids -- Matching array of amino acid IDs, or a single ID
    """
    # Bug fix: `basestring` only exists on Python 2; fall back to `str`
    # so the isinstance check also works on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    # Normalize single values to one-element lists
    if isinstance(html_results, string_types):
        html_pages = [html_results]
    else:
        html_pages = list(html_results)
    if isinstance(amino_acids, int):
        amino_ids = [amino_acids]
    else:
        amino_ids = list(amino_acids)
    results = []
    # Pair each page with its amino acid ID (replaces the range(len(...))
    # index loop; also avoids an IndexError if the lists ever mismatch)
    for page, amino_id in zip(html_pages, amino_ids):
        # Set our PyQuery selector
        select = pq(page)
        # Select our data rows but ignore the header row
        data_rows = select("#Any").find('tr')[1:]
        for row in data_rows:
            row_data = extract_row_data(row, amino_id)
            # Bug fix: extract_row_data() returns None for rows whose
            # amino acid position does not match -- skip those instead of
            # polluting the results with None entries.
            if row_data is not None:
                results.append(row_data)
    return results
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment