Skip to content

Instantly share code, notes, and snippets.

@nvictus
Last active September 2, 2016 06:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nvictus/58af80eab1bc7f4e72dd to your computer and use it in GitHub Desktop.
Save nvictus/58af80eab1bc7f4e72dd to your computer and use it in GitHub Desktop.
UCSC genome browser to matplotlib
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# -*- coding: utf-8 -*-
"""
Request snapshots from the UCSC Genome Browser.
"""
from __future__ import division, print_function, unicode_literals
import sys
import re
if sys.version_info[0] >= 3:
from io import BytesIO
else:
try:
from cStringIO import StringIO as BytesIO
except ImportError:
from StringIO import StringIO as BytesIO
from matplotlib.image import imread
import numpy as np
import requests
import bs4
# European mirror is http://genome-euro.ucsc.edu/
BASE_URL = "https://genome.ucsc.edu"
def _scrape_for_pdf(resp, ideogram=False):
# Scrape the pdf download page
soup = bs4.BeautifulSoup(resp.text)
if ideogram:
name = 'hgtIdeo_genome'
else:
name = 'hgt_genome'
a_tags = soup.find_all('a')
for a in a_tags:
if name in a.attrs['href']:
pdf_url = BASE_URL + a.attrs['href'][2:]
break
else:
raise ValueError('Link to PDF not found')
r_pdf = requests.get(pdf_url)
return r_pdf.content
def _scrape_for_images(resp):
# Scrape the UCSC browser page
soup = bs4.BeautifulSoup(resp.text)
# Image of the main panel
img_tag1 = soup.find(id="img_data_ruler")
r_data = requests.get(BASE_URL + img_tag1.attrs['src'][2:])
r_data.raise_for_status()
im_data = imread(BytesIO(r_data.content))
# Image of the side panel
img_tag2 = soup.find(id="img_side_ruler")
r_side = requests.get(BASE_URL + img_tag2.attrs['src'][2:])
r_side.raise_for_status()
im_side = imread(BytesIO(r_side.content))
# Get the width of the side panel
css = img_tag1.attrs['style']
margin_txt = css.split(';')[0].split(':')[1]
margin_px = int(next(re.finditer(r'\d+', margin_txt)).group())
return im_side[:, :margin_px, :], im_data[:, margin_px:, :]
defaults = {
'intronEst': 'hide',
'knownGene': 'hide',
'mrna': 'hide',
'pubs': 'hide',
'refGene': 'hide',
'snp142Common': 'hide',
'cons100way': 'hide',
'wgEncodeReg': 'hide',
'rmsk': 'hide',
}
def _process_params(params, region, db, session_id):
params['position'] = region
params['db'] = db
if session_id is not None:
params['hgsid'] = session_id
else:
# hide all default tracks not explicitly turned on
for k, v in defaults.items():
params.setdefault(k, v)
def fetch(region, db='hg19', session_id=None, params=None):
"""
Download snapshot from UCSC Genome Browser as RGB array.
Input
-----
region: str
genomic coordinate query string
db: str
name of genome assembly, default is hg19
session_id: str, optional
genome browser session ID (hgsid)
params: dict, optional
url parameters, e.g. names of tracks
Notes
-----
UCSC track display modes:
'hide', 'dense', 'squish', 'pack', 'full'
Returns
-------
Two RGB image arrays (M x N x 3)
im_side: image of the side panel, variable width
im_data: image of the main panel, spans the genomic interval exactly
Example
-------
>>> side, data = fetch('chrX:15,000,000-16,000,000', params={'knownGene': 'full'})
"""
if params is None:
params = {}
_process_params(params, region, db, session_id)
r = requests.get(BASE_URL + "/cgi-bin/hgTracks", params=params)
r.raise_for_status()
return _scrape_for_images(r)
def fetch_pdf(filepath, region, db='hg19', session_id=None, params=None):
"""
Download snapshot from UCSC Genome Browser as PDF file.
"""
if params is None:
params = {}
_process_params(params, region, db, session_id)
params['hgt.psOutput'] = 'on'
r = requests.get(BASE_URL + "/cgi-bin/hgTracks", params=params)
r.raise_for_status()
content = _scrape_for_pdf(r)
with open(filepath, 'wb') as f:
f.write(content)
if __name__ == '__main__':
import matplotlib.pyplot as plt
im0, im1 = fetch('chr16:52,800,000-56,000,000', params={'knownGene': 'full'})
plt.imshow(im1)
plt.show()
@Phlya
Copy link

Phlya commented Jul 29, 2016

This is really cool! But I can't get it to work with a custom session_id, which would be very convenient :(


im0, im1 = ucsc.fetch('chr16:{}-{}'.format(5280000,5600000), session_id='504899467_paH0Cg1JuKHXqprclfvhTYAqoPgO')
plt.imshow(im1)
plt.show()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-27-7f8eb01f9d99> in <module>()
----> 1 im0, im1 = ucsc.fetch('chr16:{}-{}'.format(5280000,5600000), session_id='504899467_paH0Cg1JuKHXqprclfvhTYAqoPgO')
      2 plt.imshow(im1)
      3 plt.show()

/home/s1529682/Documents/PhD/ucsc.py in fetch(region, db, session_id, params)
    136     r = requests.get(BASE_URL + "/cgi-bin/hgTracks", params=params)
    137     r.raise_for_status()
--> 138     return _scrape_for_images(r)
    139 
    140 

/home/s1529682/Documents/PhD/ucsc.py in _scrape_for_images(resp)
     58     # Image of the main panel
     59     img_tag1 = soup.find(id="img_data_ruler")
---> 60     r_data = requests.get(BASE_URL + img_tag1.attrs['src'][2:])
     61     r_data.raise_for_status()
     62     im_data = imread(BytesIO(r_data.content))

AttributeError: 'NoneType' object has no attribute 'attrs'

As a workaround it would be OK to just manually specify everything each time, but I couldn't figure out how to specify parameters for different ENCODE data, for example...
Do you have any ideas?

@nvictus
Copy link
Author

nvictus commented Sep 2, 2016

Hey Ilya! Noticed your post just now for some reason! Are you sure you copied the session ID correctly from your url, or that your session didn't expire? It always works for me. If not, could you check what HTML the server sends back in r.text?

On that note, it should test if the scraper failed by checking that img_tag1 is not None.

As for your second issue, specifying ENCODE or other data is a mystery wrapped in an enigma wrapped in a twinkie. You can add things and inspect the session file on UCSC to see what random parameters get included. I haven't been able to find any method in the madness.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment