indraniel/1-fetch-na12878-pedigree-metadata.py

## 1-fetch-na12878-pedigree-metadata.py
#!/usr/bin/env python

from __future__ import print_function, division
import sys, os, pprint
from glob import glob

import requests
from lxml import html
from toolz.curried import *

# Pretty-printer for ad-hoc debugging; its only use is the commented-out
# pp.pprint(data) call in the __main__ block.
pp = pprint.PrettyPrinter(indent=4)

# Endpoint requested by fetch_coriell_page(); the sample is selected through
# the 'Ref' query parameter.
coriell_base_url = 'https://catalog.coriell.org/0/Sections/Search/Sample_Detail.aspx'

def convert_to_coriell_name(sample_name):
    """Return the Coriell catalog identifier for *sample_name*.

    The first two characters of *sample_name* are dropped and the prefix
    ``GM`` is put in their place (``NA12878`` -> ``GM12878``).  The dropped
    characters are not inspected, so any two-character prefix is replaced.
    """
    suffix = sample_name[2:]
    return 'GM' + suffix

def fetch_coriell_page(coriell_name, timeout=30):
    """Download the Coriell catalog detail page for one sample.

    Args:
        coriell_name: Catalog identifier, sent as the ``Ref`` query
            parameter to ``coriell_base_url``.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        The raw response body (``response.content``).

    Raises:
        SystemExit: if the server answers with any status other than 200;
            the exit message names the status code and the sample.
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (including timeouts).
    """
    response = requests.get(
        coriell_base_url,
        params={'Ref': coriell_name},
        timeout=timeout,
    )
    if response.status_code != 200:
        sys.exit("Got a {} code on coriell name: {}".format(
            response.status_code, coriell_name
        ))
    return response.content

def get_sample_attrs(sample_name):
    """Scrape the Coriell catalog page of *sample_name* into a dict.

    The sample name is converted with ``convert_to_coriell_name``, the page
    is downloaded with ``fetch_coriell_page`` and parsed with ``lxml``.  For
    each ``<span>`` id listed below, the first text node is taken.

    Returns:
        A dict with the keys ``gender``, ``race``, ``relation``,
        ``ethnicity``, ``common_name``, ``genus/species`` (genus and species
        joined by one space) and ``tissue_type``.

    Raises:
        IndexError: if any of the looked-up spans yields no text node.
    """
    page = fetch_coriell_page(convert_to_coriell_name(sample_name))
    tree = html.fromstring(page)

    # (field, xpath) pairs, evaluated in this order.
    span_queries = (
        ('gender', '//span[@id="lblGender"]/text()'),
        ('race', '//span[@id="lblRace"]/text()'),
        ('relation', '//span[@id="lblRelprob"]/text()'),
        ('ethnicity', '//span[@id="lblEthnicity"]/text()'),
        ('common_name', '//span[@id="lblCommon_Name"]/text()'),
        ('genus', '//span[@id="lblGenus"]/text()'),
        ('species', '//span[@id="lblSpecies"]/text()'),
        ('tissue_type', '//span[@id="lblCell_Subtype"]/text()'),
    )
    found = {}
    for field, query in span_queries:
        found[field] = tree.xpath(query)[0]

    return {
        'gender': found['gender'],
        'race': found['race'],
        'relation': found['relation'],
        'ethnicity': found['ethnicity'],
        'common_name': found['common_name'],
        'genus/species': ' '.join([found['genus'], found['species']]),
        'tissue_type': found['tissue_type'],
    }

def process(datafile):
    """Collect catalog and file metadata for every sample listed in *datafile*.

    *datafile* is read as tab-separated text.  On each row, column 0 is kept
    as the ``errname``, the part of column 1 before its first underscore is
    the sample name, and column 3 is a glob pattern for the sample's files.
    Rows whose column 1 contains ``_2_`` are skipped.

    Args:
        datafile: Path to the tab-separated listing.

    Returns:
        A dict keyed by sample name.  Each value is the dict returned by
        ``get_sample_attrs`` extended with ``errname``, ``individual_name``,
        ``dna_type``, ``Sample Nomenclature``, ``Individual Nomenclature``,
        ``Files`` (comma-joined glob matches) and ``file_sizes (GB)``
        (comma-joined sizes, bytes / 1024**3 with two decimals).  If a
        sample name occurs on several kept rows, the last row wins.

    Raises:
        IndexError: if any row has fewer than four tab-separated columns.
    """
    # Read the listing once, keeping only the three columns that are used.
    kept_rows = []
    with open(datafile, 'r') as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # Pull out every column before filtering so that a short row
            # raises IndexError even when it would have been skipped.
            errname, label, fastq_pattern = fields[0], fields[1], fields[3]
            if "_2_" in label:
                continue
            kept_rows.append((label.split('_')[0], errname, fastq_pattern))

    data = {}
    for sample, errname, fastq_pattern in kept_rows:
        attrs = get_sample_attrs(sample)
        # Expand the pattern once and reuse the matches for both columns.
        paths = glob(fastq_pattern)
        sizes_gb = [
            "{:.2f}".format(os.stat(path).st_size / 1024.0 / 1024.0 / 1024.0)
            for path in paths
        ]
        attrs['errname'] = errname
        attrs['individual_name'] = sample
        attrs['dna_type'] = 'genomic dna'
        attrs['Sample Nomenclature'] = 'MGI'
        attrs['Individual Nomenclature'] = 'Coriell'
        attrs['Files'] = ','.join(paths)
        attrs['file_sizes (GB)'] = ','.join(sizes_gb)
        data[sample] = attrs

    return data

def printer(data):
    """Print the sample table to standard output as tab-separated text.

    A header row is printed first.  Then one row is printed per key of
    *data* in sorted order, except the key ``NA12878``, which is left out.
    The first cell of each row is ``<sample>-<errname>``; the remaining
    cells are the record's values for the columns listed below.

    Args:
        data: Dict mapping sample name to a record dict that holds
            ``errname`` and every listed column as strings.
    """
    columns = (
        'Sample Nomenclature',
        'individual_name',
        'Individual Nomenclature',
        'ethnicity',
        'gender',
        'genus/species',
        'race',
        'relation',
        'dna_type',
        'Files',
        'file_sizes (GB)'
    )

    # Header row.
    print("{}\t{}".format('SampleName', "\t".join(columns)))

    # One row per sample other than NA12878.
    for name in sorted(data):
        if name != 'NA12878':
            record = data[name]
            run_id = record['errname']
            cells = [record[column] for column in columns]
            print("{}\t{}".format('-'.join((name, run_id)), "\t".join(cells)))

if __name__ == '__main__':
    # The input listing lives at a fixed location; the script takes no
    # command-line arguments.
    path_parts = (
        '/gscmnt/gc2801/analytics/idas/jira/BIO-1670',
        'data/external',
        '20151111-NA12878-pedigree-list.tsv',
    )
    srcfile = os.path.join(*path_parts)
    data = process(srcfile)
    # pp.pprint(data)  # uncomment to dump the raw table while debugging
    printer(data)
	#!/usr/bin/env python

	from __future__ import print_function, division
	import sys, os, pprint
	from glob import glob

	import requests
	from lxml import html
	from toolz.curried import *

	# Pretty-printer for ad-hoc debugging; its only use is the commented-out
	# pp.pprint(data) call in the __main__ block.
	pp = pprint.PrettyPrinter(indent=4)

	# Endpoint requested by fetch_coriell_page(); the sample is selected through
	# the 'Ref' query parameter.
	coriell_base_url = 'https://catalog.coriell.org/0/Sections/Search/Sample_Detail.aspx'

	def convert_to_coriell_name(sample_name):
	    """Return the Coriell catalog identifier for *sample_name*.

	    The first two characters of *sample_name* are dropped and the prefix
	    ``GM`` is put in their place (``NA12878`` -> ``GM12878``).  The dropped
	    characters are not inspected, so any two-character prefix is replaced.
	    """
	    coriell = ''.join(['GM', sample_name[2:]])
	    return coriell

	def fetch_coriell_page(coriell_name, timeout=30):
	    """Download the Coriell catalog detail page for one sample.

	    Args:
	        coriell_name: Catalog identifier, sent as the ``Ref`` query
	            parameter to ``coriell_base_url``.
	        timeout: Seconds ``requests`` waits on the server before giving
	            up.  Without a timeout ``requests.get`` can block indefinitely
	            on a stalled connection.

	    Returns:
	        The raw response body (``r.content``).

	    Raises:
	        SystemExit: if the server answers with any status other than 200;
	            the exit message names the status code and the sample.
	        requests.exceptions.RequestException: propagated unchanged from
	            ``requests.get`` (including timeouts).
	    """
	    payload = {'Ref': coriell_name}
	    r = requests.get(coriell_base_url, params=payload, timeout=timeout)
	    if r.status_code != 200:
	        msg = "Got a {} code on coriell name: {}".format(
	            r.status_code, coriell_name
	        )
	        sys.exit(msg)
	    return r.content

	def get_sample_attrs(sample_name):
	    """Scrape the Coriell catalog page of *sample_name* into a dict.

	    The sample name is converted with ``convert_to_coriell_name``, the page
	    is downloaded with ``fetch_coriell_page`` and parsed with ``lxml``.  For
	    each ``<span>`` id listed below, the first text node is taken.

	    Returns:
	        A dict with the keys ``gender``, ``race``, ``relation``,
	        ``ethnicity``, ``common_name``, ``genus/species`` (genus and
	        species joined by one space) and ``tissue_type``.

	    Raises:
	        IndexError: if any of the looked-up spans yields no text node.
	    """
	    coriell_name = convert_to_coriell_name(sample_name)
	    page = fetch_coriell_page(coriell_name)
	    tree = html.fromstring(page)

	    # (field, xpath) pairs, evaluated in this order.
	    span_queries = (
	        ('gender', '//span[@id="lblGender"]/text()'),
	        ('race', '//span[@id="lblRace"]/text()'),
	        ('relation', '//span[@id="lblRelprob"]/text()'),
	        ('ethnicity', '//span[@id="lblEthnicity"]/text()'),
	        ('common_name', '//span[@id="lblCommon_Name"]/text()'),
	        ('genus', '//span[@id="lblGenus"]/text()'),
	        ('species', '//span[@id="lblSpecies"]/text()'),
	        ('tissue_type', '//span[@id="lblCell_Subtype"]/text()'),
	    )
	    found = {}
	    for field, query in span_queries:
	        found[field] = tree.xpath(query)[0]

	    data = {
	        'gender': found['gender'],
	        'race': found['race'],
	        'relation': found['relation'],
	        'ethnicity': found['ethnicity'],
	        'common_name': found['common_name'],
	        'genus/species': ' '.join([found['genus'], found['species']]),
	        'tissue_type': found['tissue_type'],
	    }
	    return data

	def process(datafile):
	    """Collect catalog and file metadata for every sample in *datafile*.

	    *datafile* is read as tab-separated text.  On each row, column 0 is
	    kept as the ``errname``, the part of column 1 before its first
	    underscore is the sample name, and column 3 is a glob pattern for the
	    sample's files.  Rows whose column 1 contains ``_2_`` are skipped.

	    Args:
	        datafile: Path to the tab-separated listing.

	    Returns:
	        A dict keyed by sample name.  Each value is the dict returned by
	        ``get_sample_attrs`` extended with ``errname``,
	        ``individual_name``, ``dna_type``, ``Sample Nomenclature``,
	        ``Individual Nomenclature``, ``Files`` (comma-joined glob matches)
	        and ``file_sizes (GB)`` (comma-joined sizes, bytes / 1024**3 with
	        two decimals).  If a sample name occurs on several kept rows, the
	        last row wins.

	    Raises:
	        IndexError: if any row has fewer than four tab-separated columns.
	    """
	    # Read the listing once, keeping only the three columns that are used.
	    kept_rows = []
	    with open(datafile, 'r') as f:
	        for line in f:
	            fields = line.rstrip("\n").split("\t")
	            # Pull out every column before filtering so that a short row
	            # raises IndexError even when it would have been skipped.
	            errname, label, fastq_pattern = fields[0], fields[1], fields[3]
	            if "_2_" in label:
	                continue
	            kept_rows.append((label.split('_')[0], errname, fastq_pattern))

	    data = {}
	    for sample, errname, fastq_pattern in kept_rows:
	        attrs = get_sample_attrs(sample)
	        # Expand the pattern once and reuse the matches for both columns.
	        paths = glob(fastq_pattern)
	        sizes_gb = [
	            "{:.2f}".format(os.stat(path).st_size / 1024.0 / 1024.0 / 1024.0)
	            for path in paths
	        ]
	        attrs['errname'] = errname
	        attrs['individual_name'] = sample
	        attrs['dna_type'] = 'genomic dna'
	        attrs['Sample Nomenclature'] = 'MGI'
	        attrs['Individual Nomenclature'] = 'Coriell'
	        attrs['Files'] = ','.join(paths)
	        attrs['file_sizes (GB)'] = ','.join(sizes_gb)
	        data[sample] = attrs

	    return data

	def printer(data):
	    """Print the sample table to standard output as tab-separated text.

	    A header row is printed first.  Then one row is printed per key of
	    *data* in sorted order, except the key ``NA12878``, which is left out.
	    The first cell of each row is ``<sample>-<errname>``; the remaining
	    cells are the record's values for the columns listed below.

	    Args:
	        data: Dict mapping sample name to a record dict that holds
	            ``errname`` and every listed column as strings.
	    """
	    attributes = (
	        'Sample Nomenclature',
	        'individual_name',
	        'Individual Nomenclature',
	        'ethnicity',
	        'gender',
	        'genus/species',
	        'race',
	        'relation',
	        'dna_type',
	        'Files',
	        'file_sizes (GB)'
	    )

	    # Header
	    print("{}\t{}".format('SampleName', "\t".join(attributes)))

	    # Data
	    for sample in sorted(data.keys()):
	        if sample == 'NA12878':
	            continue
	        attrs = data[sample]
	        errname = attrs['errname']
	        elems = [attrs[k] for k in attributes]
	        print("{}\t{}".format('-'.join([sample, errname]), "\t".join(elems)))

	if __name__ == '__main__':
	    # The input listing lives at a fixed location; the script takes no
	    # command-line arguments.
	    srcfile = os.path.join(
	        '/gscmnt/gc2801/analytics/idas/jira/BIO-1670',
	        'data/external',
	        '20151111-NA12878-pedigree-list.tsv'
	    )
	    data = process(srcfile)
	    # pp.pprint(data)  # uncomment to dump the raw table while debugging
	    printer(data)