Skip to content

Instantly share code, notes, and snippets.

@daler
Created September 19, 2016 21:30
Show Gist options
  • Save daler/a71d7c952875e47744c904dc65c77bd5 to your computer and use it in GitHub Desktop.
Save daler/a71d7c952875e47744c904dc65c77bd5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# Install into new environment and activate it before running this script:
#
# conda create -n biostars-212519 python=3 requests numpy pandas
# source activate biostars-212519
# python identify_encode_controls.py
import pandas
import numpy as np
import requests
def accession_metadata(acc):
    """
    Returns the metadata for ENCODE accession `acc` (e.g., ENCSR000BJN)

    The experiment page is requested with `frame=object` and an
    `accept: application/json` header. The raw, unparsed response body
    (`response.content`) is returned.

    Raises `requests.HTTPError` if the server answers with a 4xx/5xx status,
    and the usual `requests` exceptions on connection failure or timeout.
    """
    HEADERS = {'accept': 'application/json'}
    URL = (
        'https://www.encodeproject.org/experiments/{0}/?frame=object'
        .format(acc)
    )
    # Without a timeout a single stalled connection would block the script
    # indefinitely; 60 seconds is a generous upper bound per request.
    response = requests.get(URL, headers=HEADERS, timeout=60)
    # Fail here on an HTTP error rather than handing an error body back to
    # the caller as if it were metadata.
    response.raise_for_status()
    return response.content
# The metadata URL below can be obtained interactively on encodeproject.org:
# subset your query, click the "download" button, and take the first line of
# the downloaded file. This example is all HepG2 ChIP-seq data.
metadata_url = (
    "https://www.encodeproject.org/metadata/type=Experiment&"
    "biosample_term_name=HepG2&assay_title=ChIP-seq&limit=all/metadata.tsv"
)

# Load the metadata table, keeping just the first 100 rows for this example.
df = pandas.read_table(metadata_url).head(100)
def find_controls(acc):
    """
    Returns a list of control accessions for ENCODE accession `acc`.

    The metadata for an accession contains a "possible controls" field. I'm
    taking that to mean there can be multiple controls, so to be safe I'm
    returning a list of them. If the field is absent or empty, an empty list
    is returned.
    """
    # Local import so this function does not depend on edits elsewhere in
    # the file.
    import json

    raw = accession_metadata(acc)
    # The body arrives as bytes; decode explicitly so parsing also works on
    # Python versions whose json.loads only accepts str.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    # Plain JSON parsing is enough here: only one key is read, so there is
    # no need to build a pandas Series from the payload.
    m = json.loads(raw)
    c = m.get('possible_controls') or []
    # Each entry is presumably a path like "/experiments/<accession>/";
    # splitting on "/" then puts the accession at index 2 -- TODO confirm
    # against the ENCODE API if the path layout changes.
    return [i.split('/')[2] for i in c]
# The metadata table has several rows per accession. Looking up controls once
# per unique accession (instead of once per row) is dramatically faster; the
# results are joined back onto the full dataframe afterwards.
ds = []
unique_accessions = df['Experiment accession'].unique()
for acc in unique_accessions:
    print('getting metadata for accession:', acc)
    record = {'Experiment accession': acc, 'controls': find_controls(acc)}
    ds.append(record)

# One row per accession, indexed by accession so it can be joined on.
controls = pandas.DataFrame(ds)
controls = controls.set_index('Experiment accession')

# Attach the list of controls to every row of the full metadata.
df = df.join(controls, on='Experiment accession')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment