Skip to content

Instantly share code, notes, and snippets.

@indraniel
Created December 30, 2015 23:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save indraniel/d1562ed555a9bb9159dd to your computer and use it in GitHub Desktop.
Save indraniel/d1562ed555a9bb9159dd to your computer and use it in GitHub Desktop.
use lxml to fetch NA12878 Coriell Sample Info
#!/usr/bin/env python
from __future__ import print_function, division
import sys, os, pprint
from glob import glob
import requests
from lxml import html
from toolz.curried import *
# Shared pretty-printer for ad-hoc debugging; in the visible part of the file
# it is only referenced by a commented-out call in the script entry point.
pp = pprint.PrettyPrinter(indent=4)
# Sample-detail endpoint of the Coriell catalog. fetch_coriell_page() sends the
# sample identifier to it as the `Ref` query parameter.
coriell_base_url = 'https://catalog.coriell.org/0/Sections/Search/Sample_Detail.aspx'
def convert_to_coriell_name(sample_name):
    """Return the Coriell catalog identifier for *sample_name*.

    The first two characters of the sample name are replaced with ``GM``,
    so ``NA12878`` becomes ``GM12878``. Names shorter than two characters
    simply yield ``GM``.
    """
    return 'GM' + sample_name[2:]
def fetch_coriell_page(coriell_name, timeout=60):
    """Download the Coriell sample-detail page for *coriell_name*.

    Args:
        coriell_name: Identifier sent as the ``Ref`` query parameter of
            ``coriell_base_url``.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            script forever.

    Returns:
        The raw response body (``r.content``).

    Terminates the program through ``sys.exit`` with an explanatory message
    when the server answers with anything other than HTTP 200. Exceptions
    raised by ``requests.get`` itself (connection errors, timeouts) are not
    caught here and propagate to the caller.
    """
    payload = {'Ref': coriell_name}
    r = requests.get(coriell_base_url, params=payload, timeout=timeout)
    if r.status_code != 200:
        msg = "Got a {} code on coriell name: {}".format(
            r.status_code, coriell_name
        )
        sys.exit(msg)
    return r.content
def _span_text(tree, span_id):
    """Return the first text node of ``<span id=span_id>``, or '' if none."""
    texts = tree.xpath('//span[@id="{}"]/text()'.format(span_id))
    # An empty or absent span yields an empty result list; indexing it
    # unconditionally would abort the whole run with a bare IndexError.
    return texts[0] if texts else ''


def get_sample_attrs(sample_name):
    """Scrape the Coriell catalog attributes for *sample_name*.

    The sample name is converted to its Coriell identifier, the matching
    sample-detail page is downloaded and parsed, and the text of a fixed set
    of ``<span>`` elements is collected.

    Returns:
        A dict with the keys ``gender``, ``race``, ``relation``,
        ``ethnicity``, ``common_name``, ``genus/species`` and
        ``tissue_type``. A field whose span is missing or has no text is
        reported as an empty string.
    """
    coriell_name = convert_to_coriell_name(sample_name)
    page = fetch_coriell_page(coriell_name)
    tree = html.fromstring(page)

    genus = _span_text(tree, 'lblGenus')
    species = _span_text(tree, 'lblSpecies')

    data = {
        'gender': _span_text(tree, 'lblGender'),
        'race': _span_text(tree, 'lblRace'),
        'relation': _span_text(tree, 'lblRelprob'),
        'ethnicity': _span_text(tree, 'lblEthnicity'),
        'common_name': _span_text(tree, 'lblCommon_Name'),
        # Joined with a single space; empty halves are dropped so a missing
        # genus or species does not leave stray whitespace.
        'genus/species': ' '.join(part for part in (genus, species) if part),
        'tissue_type': _span_text(tree, 'lblCell_Subtype'),
    }
    return data
def _read_rows(datafile):
    """Parse *datafile* into a list of (errname, sample, fastq_pattern)."""
    rows = []
    with open(datafile, 'r') as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split("\t")
            errname, read_name, fastq_pattern = fields[0], fields[1], fields[3]
            # Rows whose second column contains "_2_" are dropped
            # (presumably the second mate of a read pair -- confirm).
            if "_2_" in read_name:
                continue
            # The sample name is the text before the first underscore.
            rows.append((errname, read_name.split('_')[0], fastq_pattern))
    return rows


def process(datafile):
    """Build per-sample metadata from a tab-separated listing.

    Every non-blank line of *datafile* must have at least four tab-separated
    columns: column 0 is stored as ``errname``, column 1 supplies the sample
    name (its text before the first ``_``), and column 3 is a glob pattern
    that is expanded to the sample's files. Lines whose column 1 contains
    ``_2_`` are ignored.

    Returns:
        A dict mapping each sample name to the attributes scraped by
        ``get_sample_attrs`` plus ``errname``, ``individual_name``,
        ``dna_type``, ``Sample Nomenclature``, ``Individual Nomenclature``,
        ``Files`` (comma-joined paths) and ``file_sizes (GB)`` (comma-joined
        sizes with two decimals). When a sample occurs on several lines the
        values from the last line win.

    Raises:
        IndexError: if a non-blank line has fewer than four columns.
    """
    data = {}
    for errname, sample, fastq_pattern in _read_rows(datafile):
        if sample not in data:
            # One catalog lookup per distinct sample is enough.
            data[sample] = get_sample_attrs(sample)

        files = glob(fastq_pattern)
        sizes_gb = [
            "{:.2f}".format(os.stat(path).st_size / 1024.0 / 1024.0 / 1024.0)
            for path in files
        ]

        entry = data[sample]
        entry['errname'] = errname
        entry['individual_name'] = sample
        entry['dna_type'] = 'genomic dna'
        entry['Sample Nomenclature'] = 'MGI'
        entry['Individual Nomenclature'] = 'Coriell'
        entry['Files'] = ','.join(files)
        entry['file_sizes (GB)'] = ','.join(sizes_gb)
    return data
def printer(data, skip_samples=('NA12878',)):
    """Print *data* to stdout as a tab-separated table.

    Args:
        data: Mapping of sample name to an attribute dict. Each attribute
            dict must contain ``errname`` and every column listed below,
            all as strings.
        skip_samples: Sample names to leave out of the table. Defaults to
            ``('NA12878',)``, the one sample the original script excluded.

    The first line is a header starting with ``SampleName``. Each following
    line starts with ``<sample>-<errname>`` and rows are ordered by sample
    name.
    """
    attributes = (
        'Sample Nomenclature',
        'individual_name',
        'Individual Nomenclature',
        'ethnicity',
        'gender',
        'genus/species',
        'race',
        'relation',
        'dna_type',
        'Files',
        'file_sizes (GB)',
    )

    # Header
    print("{}\t{}".format('SampleName', "\t".join(attributes)))

    # Data
    for sample in sorted(data):
        if sample in skip_samples:
            continue
        attrs = data[sample]
        row_label = '-'.join([sample, attrs['errname']])
        elems = [attrs[k] for k in attributes]
        print("{}\t{}".format(row_label, "\t".join(elems)))
if __name__ == '__main__':
    # Default input listing; a different file may be supplied as the first
    # command-line argument so the script is usable outside this filesystem.
    default_srcfile = os.path.join(
        '/gscmnt/gc2801/analytics/idas/jira/BIO-1670',
        'data/external',
        '20151111-NA12878-pedigree-list.tsv'
    )
    srcfile = sys.argv[1] if len(sys.argv) > 1 else default_srcfile
    data = process(srcfile)
    # Uncomment to dump the raw dictionary when debugging:
    # pp.pprint(data)
    printer(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment