Last active
September 2, 2016 06:56
-
-
Save nvictus/58af80eab1bc7f4e72dd to your computer and use it in GitHub Desktop.
UCSC genome browser to matplotlib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Request snapshots from the UCSC Genome Browser. | |
""" | |
from __future__ import division, print_function, unicode_literals | |
import sys | |
import re | |
if sys.version_info[0] >= 3: | |
from io import BytesIO | |
else: | |
try: | |
from cStringIO import StringIO as BytesIO | |
except ImportError: | |
from StringIO import StringIO as BytesIO | |
from matplotlib.image import imread | |
import numpy as np | |
import requests | |
import bs4 | |
# European mirror is http://genome-euro.ucsc.edu/ | |
BASE_URL = "https://genome.ucsc.edu" | |
def _scrape_for_pdf(resp, ideogram=False): | |
# Scrape the pdf download page | |
soup = bs4.BeautifulSoup(resp.text) | |
if ideogram: | |
name = 'hgtIdeo_genome' | |
else: | |
name = 'hgt_genome' | |
a_tags = soup.find_all('a') | |
for a in a_tags: | |
if name in a.attrs['href']: | |
pdf_url = BASE_URL + a.attrs['href'][2:] | |
break | |
else: | |
raise ValueError('Link to PDF not found') | |
r_pdf = requests.get(pdf_url) | |
return r_pdf.content | |
def _scrape_for_images(resp): | |
# Scrape the UCSC browser page | |
soup = bs4.BeautifulSoup(resp.text) | |
# Image of the main panel | |
img_tag1 = soup.find(id="img_data_ruler") | |
r_data = requests.get(BASE_URL + img_tag1.attrs['src'][2:]) | |
r_data.raise_for_status() | |
im_data = imread(BytesIO(r_data.content)) | |
# Image of the side panel | |
img_tag2 = soup.find(id="img_side_ruler") | |
r_side = requests.get(BASE_URL + img_tag2.attrs['src'][2:]) | |
r_side.raise_for_status() | |
im_side = imread(BytesIO(r_side.content)) | |
# Get the width of the side panel | |
css = img_tag1.attrs['style'] | |
margin_txt = css.split(';')[0].split(':')[1] | |
margin_px = int(next(re.finditer(r'\d+', margin_txt)).group()) | |
return im_side[:, :margin_px, :], im_data[:, margin_px:, :] | |
defaults = { | |
'intronEst': 'hide', | |
'knownGene': 'hide', | |
'mrna': 'hide', | |
'pubs': 'hide', | |
'refGene': 'hide', | |
'snp142Common': 'hide', | |
'cons100way': 'hide', | |
'wgEncodeReg': 'hide', | |
'rmsk': 'hide', | |
} | |
def _process_params(params, region, db, session_id): | |
params['position'] = region | |
params['db'] = db | |
if session_id is not None: | |
params['hgsid'] = session_id | |
else: | |
# hide all default tracks not explicitly turned on | |
for k, v in defaults.items(): | |
params.setdefault(k, v) | |
def fetch(region, db='hg19', session_id=None, params=None): | |
""" | |
Download snapshot from UCSC Genome Browser as RGB array. | |
Input | |
----- | |
region: str | |
genomic coordinate query string | |
db: str | |
name of genome assembly, default is hg19 | |
session_id: str, optional | |
genome browser session ID (hgsid) | |
params: dict, optional | |
url parameters, e.g. names of tracks | |
Notes | |
----- | |
UCSC track display modes: | |
'hide', 'dense', 'squish', 'pack', 'full' | |
Returns | |
------- | |
Two RGB image arrays (M x N x 3) | |
im_side: image of the side panel, variable width | |
im_data: image of the main panel, spans the genomic interval exactly | |
Example | |
------- | |
>>> side, data = fetch('chrX:15,000,000-16,000,000', params={'knownGene': 'full'}) | |
""" | |
if params is None: | |
params = {} | |
_process_params(params, region, db, session_id) | |
r = requests.get(BASE_URL + "/cgi-bin/hgTracks", params=params) | |
r.raise_for_status() | |
return _scrape_for_images(r) | |
def fetch_pdf(filepath, region, db='hg19', session_id=None, params=None): | |
""" | |
Download snapshot from UCSC Genome Browser as PDF file. | |
""" | |
if params is None: | |
params = {} | |
_process_params(params, region, db, session_id) | |
params['hgt.psOutput'] = 'on' | |
r = requests.get(BASE_URL + "/cgi-bin/hgTracks", params=params) | |
r.raise_for_status() | |
content = _scrape_for_pdf(r) | |
with open(filepath, 'wb') as f: | |
f.write(content) | |
if __name__ == '__main__': | |
import matplotlib.pyplot as plt | |
im0, im1 = fetch('chr16:52,800,000-56,000,000', params={'knownGene': 'full'}) | |
plt.imshow(im1) | |
plt.show() |
Hey Ilya! Noticed your post just now for some reason! Are you sure you copied the session ID correctly from your url, or that your session didn't expire? It always works for me. If not, could you check what HTML the server sends back in r.text
?
On that note, it should test if the scraper failed by checking that img_tag1
is not None.
As for your second issue, specifying ENCODE or other data is a mystery wrapped in an enigma wrapped in a twinkie. You can add things and inspect the session file on UCSC to see what random parameters get included. I haven't been able to find any method in the madness.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is really cool! But I can't get it to work with a custom session_id, which would be very convenient :(
As a workaround it would be OK to just manually specify everything each time, but I couldn't figure out how to specify parameters for different ENCODE data, for example...
Do you have any ideas?