Created
December 30, 2015 23:12
-
-
Save indraniel/d1562ed555a9bb9159dd to your computer and use it in GitHub Desktop.
use lxml to fetch NA12878 Coriell Sample Info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
from __future__ import print_function, division

import os
import pprint
import sys
from glob import glob

import requests
from lxml import html
from toolz.curried import *
# Pretty-printer kept around for ad-hoc debugging dumps of the collected data.
pp = pprint.PrettyPrinter(indent=4)

# Coriell catalog sample-detail page; fetch_coriell_page() selects the sample
# through the 'Ref' query parameter.
coriell_base_url = 'https://catalog.coriell.org/0/Sections/Search/Sample_Detail.aspx'
def convert_to_coriell_name(sample_name):
    """Return the Coriell catalog name for *sample_name*.

    The first two characters of the sample name are replaced by 'GM'
    (for example 'NA12878' becomes 'GM12878').
    """
    suffix = sample_name[2:]
    return 'GM' + suffix
def fetch_coriell_page(coriell_name, timeout=30):
    """Download the Coriell catalog detail page for *coriell_name*.

    Args:
        coriell_name: catalog identifier, sent as the 'Ref' query parameter.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The raw response body (``r.content``).

    The program is terminated through ``sys.exit`` with an explanatory
    message when the request cannot be completed or the server answers with
    anything other than HTTP 200.
    """
    payload = {'Ref': coriell_name}
    try:
        # Without an explicit timeout requests waits indefinitely on an
        # unresponsive server, which would hang the whole run.
        r = requests.get(coriell_base_url, params=payload, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # Report transport failures the same way as bad status codes below.
        sys.exit("Request failed on coriell name: {} ({})".format(
            coriell_name, err
        ))
    if r.status_code != 200:
        msg = "Got a {} code on coriell name: {}".format(
            r.status_code, coriell_name
        )
        sys.exit(msg)
    return r.content
def _first_span_text(tree, span_id):
    """Return the first text node of the <span> whose id is *span_id*, or ''."""
    nodes = tree.xpath('//span[@id="{}"]/text()'.format(span_id))
    # An absent or empty span yields no text nodes; report it as an empty
    # field instead of failing with a bare IndexError.
    return nodes[0] if nodes else ''


def get_sample_attrs(sample_name):
    """Scrape the Coriell catalog attributes for *sample_name*.

    The sample name is converted to its Coriell name, the matching catalog
    page is fetched and parsed, and the text of a fixed set of labelled
    <span> elements is collected.

    Returns:
        A dict with the keys 'gender', 'race', 'relation', 'ethnicity',
        'common_name', 'genus/species' and 'tissue_type'.  A field whose
        span is missing from the page is returned as an empty string.
    """
    coriell_name = convert_to_coriell_name(sample_name)
    tree = html.fromstring(fetch_coriell_page(coriell_name))

    genus = _first_span_text(tree, 'lblGenus')
    species = _first_span_text(tree, 'lblSpecies')

    return {
        'gender': _first_span_text(tree, 'lblGender'),
        'race': _first_span_text(tree, 'lblRace'),
        'relation': _first_span_text(tree, 'lblRelprob'),
        'ethnicity': _first_span_text(tree, 'lblEthnicity'),
        'common_name': _first_span_text(tree, 'lblCommon_Name'),
        'genus/species': ' '.join([genus, species]),
        'tissue_type': _first_span_text(tree, 'lblCell_Subtype'),
    }
def process(datafile):
    """Build the per-sample records described by the tab-separated *datafile*.

    Each row is split on tabs.  Column 0 is stored as the record's
    'errname', the part of column 1 before its first '_' is the sample name,
    and column 3 is a glob pattern locating the sample's files.  Rows whose
    column 1 contains "_2_" are skipped (presumably the second entry of a
    pair -- confirm against the input file), as are blank lines.

    Returns:
        A dict mapping sample name to a dict holding the attributes from
        get_sample_attrs() plus 'errname', 'individual_name', 'dna_type',
        'Sample Nomenclature', 'Individual Nomenclature', 'Files' and
        'file_sizes (GB)'.  When a sample name occurs more than once, the
        last row wins.
    """
    samples, errnames, fastqs = [], [], []

    # Read the file once.  Plain loops/comprehensions are used instead of
    # map/filter because the module star-imports toolz.curried, which shadows
    # those builtins, and instead of tuple-parameter lambdas, which are a
    # syntax error on Python 3.
    with open(datafile, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            if "_2_" in fields[1]:
                continue
            errnames.append(fields[0])
            samples.append(fields[1].split('_')[0])
            fastqs.append(fields[3])

    data = {name: get_sample_attrs(name) for name in samples}

    for name, errname, pattern in zip(samples, errnames, fastqs):
        # Expand the pattern once so 'Files' and the sizes always describe
        # the same set of paths, in the same order.
        paths = glob(pattern)
        sizes_gb = [
            "{:.2f}".format(os.stat(path).st_size / 1024.0 / 1024.0 / 1024.0)
            for path in paths
        ]

        record = data[name]
        record['errname'] = errname
        record['individual_name'] = name
        record['dna_type'] = 'genomic dna'
        record['Sample Nomenclature'] = 'MGI'
        record['Individual Nomenclature'] = 'Coriell'
        record['Files'] = ','.join(paths)
        record['file_sizes (GB)'] = ','.join(sizes_gb)

    return data
def printer(data):
    """Print the sample records in *data* to stdout as a tab-separated table.

    A header row is written first.  Samples follow in sorted name order, each
    row labelled '<sample>-<errname>'; the sample named 'NA12878' is left out.
    Every record must contain 'errname' and all of the listed columns, and
    the column values must be strings.
    """
    columns = (
        'Sample Nomenclature',
        'individual_name',
        'Individual Nomenclature',
        'ethnicity',
        'gender',
        'genus/species',
        'race',
        'relation',
        'dna_type',
        'Files',
        'file_sizes (GB)'
    )

    # Header row
    header_cells = "\t".join(columns)
    print("{}\t{}".format('SampleName', header_cells))

    # One row per reported sample
    reported = [name for name in sorted(data) if name != 'NA12878']
    for name in reported:
        record = data[name]
        row_label = '-'.join([name, record['errname']])
        cells = "\t".join([record[col] for col in columns])
        print("{}\t{}".format(row_label, cells))
if __name__ == '__main__':
    # Hard-coded location of the pedigree list this script was written for.
    project_dir = '/gscmnt/gc2801/analytics/idas/jira/BIO-1670'
    srcfile = os.path.join(
        project_dir,
        'data/external',
        '20151111-NA12878-pedigree-list.tsv'
    )
    # For debugging, pp.pprint(process(srcfile)) dumps the raw records.
    printer(process(srcfile))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment