CholoTook/get_geo_data.py Secret

## get_geo_data.py
import GEOparse
import storage_thingy

# The Illumina CanineHD BeadChip in GEO
gpl_accession = "GPL17481"

# Not sure where (or how really) to do this
st = storage_thingy.StorageThingy(basedir="./Data/DB/")


def main(gpl_accession):
    print(f"Working on GPL: {gpl_accession}.")
    gpl = GEOparse.get_GEO(gpl_accession,
                           destdir='./Data/GEO/')

    print(f"Title: {gpl.metadata['title'][0]}.")
    print(f"Rows : {gpl.metadata['data_row_count'][0]}.")

    marker_dict = gpl_to_marker_dict(gpl.table)

    # Not sure why, but a GPL doesn't have explicit GSMs, rather they are
    # linked via metadata...
    gsm_accessions = gpl.metadata['sample_id']
    print(f"Found {len(gsm_accessions)} GSMs.")

    for gsm_accession in gsm_accessions:
        print(f"Working on GSM: {gsm_accession}.")
        gsm = GEOparse.get_GEO(gsm_accession,
                               destdir='./Data/GEO/Samples/')

        sample = get_sample(gsm)
        sample_genotypes = get_sample_genotypes(gsm)

        # An all round nasty conversion step...
        update_sample_genotype(sample_genotypes, marker_dict)

        st.store_sample(sample)
        st.store_sample_genotypes(sample, sample_genotypes)


# TODO: Sample class?

def get_sample(gsm):
    sample_id = gsm.get_accession()

    # If there is more than one series_id, we want the 'SuperSeries'
    # rather than the 'SubSeries' accession. Hopefully sorting works?
    # There may be a better way to do this!
    series_id = sorted(
        gsm.metadata['series_id'])[-1]

    characteristics = get_characteristics_as_dict(
        gsm.metadata['characteristics_ch1'])

    return [sample_id, series_id, characteristics['breed']]


def get_characteristics_as_dict(sample_characteristics):
    """By default, gsm.metadata gives us a list, even if there is only one
       element for a given field. i.e. sample_characteristics should
       be a list already.

    """
    # There are some funky 'characteristics' that we don't need and
    # we don't want to worry about parsing them...
    sc_filtered = (
        sc for sc in sample_characteristics if not sc.startswith('status (')
    )

    scs = dict(field.split(": ") for field in sc_filtered)

    # MAGIC: The only samples with missing breeds that I've found
    # also have the 'sub-type' characteristic that can stand in for
    # breed.
    if 'breed' not in scs:
        scs['breed'] = scs['sub-type']

    return scs


# TODO: SampleGenotype class?

def get_sample_genotypes(gsm):
    # Convert a Pandas data frame into a dict
    snps_dict = {gsm.table.at[i, 'ID_REF']: gsm.table.at[i, 'VALUE']
                 for i in gsm.table.index}

    return snps_dict


def gpl_to_marker_dict(marker_table):
    """Used to convert marker value into a SNP genotpe.

       Marker values are basically AA, BB, AB and NC. SNP genotypes
       are the actual bases, e.g. GG, TT, GT and None.

    """
    # Convert a Pandas data frame into a dict
    marker_dict = {marker_table.at[i, 'ID']: marker_table.at[i, 'SNP']
                   for i in marker_table.index}

    # Convert the marker values to SNP genotypes
    for key, value in marker_dict.items():
        marker_dict[key] = snp_string_to_dict(value)

    return marker_dict


def snp_string_to_dict(snp_string):
    ref = snp_string[1]
    alt = snp_string[3]

    return {
        'AA': ref + ref,
        'BB': alt + alt,
        'NC': None,

        # Convert T/A calls to A/T
        'AB': ''.join(sorted(ref + alt)),

        # Store this away for convenience
        '_snp_string': snp_string,
    }


def update_sample_genotype(sg, md):
    for key, value in sg.items():
        sg[key] = md[key][value]


if __name__ == '__main__':
    main(gpl_accession)
	import GEOparse
	import storage_thingy

	# The Illumina CanineHD BeadChip in GEO
	gpl_accession = "GPL17481"

	# Not sure where (or how really) to do this
	st = storage_thingy.StorageThingy(basedir="./Data/DB/")


	def main(gpl_accession):
	print(f"Working on GPL: {gpl_accession}.")
	gpl = GEOparse.get_GEO(gpl_accession,
	destdir='./Data/GEO/')

	print(f"Title: {gpl.metadata['title'][0]}.")
	print(f"Rows : {gpl.metadata['data_row_count'][0]}.")

	marker_dict = gpl_to_marker_dict(gpl.table)

	# Not sure why, but a GPL doesn't have explicit GSMs, rather they are
	# linked via metadata...
	gsm_accessions = gpl.metadata['sample_id']
	print(f"Found {len(gsm_accessions)} GSMs.")

	for gsm_accession in gsm_accessions:
	print(f"Working on GSM: {gsm_accession}.")
	gsm = GEOparse.get_GEO(gsm_accession,
	destdir='./Data/GEO/Samples/')

	sample = get_sample(gsm)
	sample_genotypes = get_sample_genotypes(gsm)

	# An all round nasty conversion step...
	update_sample_genotype(sample_genotypes, marker_dict)

	st.store_sample(sample)
	st.store_sample_genotypes(sample, sample_genotypes)


	# TODO: Sample class?

	def get_sample(gsm):
	sample_id = gsm.get_accession()

	# If there is more than one series_id, we want the 'SuperSeries'
	# rather than the 'SubSeries' accession. Hopefully sorting works?
	# There may be a better way to do this!
	series_id = sorted(
	gsm.metadata['series_id'])[-1]

	characteristics = get_characteristics_as_dict(
	gsm.metadata['characteristics_ch1'])

	return [sample_id, series_id, characteristics['breed']]


	def get_characteristics_as_dict(sample_characteristics):
	"""By default, gsm.metadata gives us a list, even if there is only one
	element for a given field. i.e. sample_characteristics should
	be a list already.

	"""
	# There are some funky 'characteristics' that we don't need and
	# we don't want to worry about parsing them...
	sc_filtered = (
	sc for sc in sample_characteristics if not sc.startswith('status (')
	)

	scs = dict(field.split(": ") for field in sc_filtered)

	# MAGIC: The only samples with missing breeds that I've found
	# also have the 'sub-type' characteristic that can stand in for
	# breed.
	if 'breed' not in scs:
	scs['breed'] = scs['sub-type']

	return scs


	# TODO: SampleGenotype class?

	def get_sample_genotypes(gsm):
	# Convert a Pandas data frame into a dict
	snps_dict = {gsm.table.at[i, 'ID_REF']: gsm.table.at[i, 'VALUE']
	for i in gsm.table.index}

	return snps_dict


	def gpl_to_marker_dict(marker_table):
	"""Used to convert marker value into a SNP genotpe.

	Marker values are basically AA, BB, AB and NC. SNP genotypes
	are the actual bases, e.g. GG, TT, GT and None.

	"""
	# Convert a Pandas data frame into a dict
	marker_dict = {marker_table.at[i, 'ID']: marker_table.at[i, 'SNP']
	for i in marker_table.index}

	# Convert the marker values to SNP genotypes
	for key, value in marker_dict.items():
	marker_dict[key] = snp_string_to_dict(value)

	return marker_dict


	def snp_string_to_dict(snp_string):
	ref = snp_string[1]
	alt = snp_string[3]

	return {
	'AA': ref + ref,
	'BB': alt + alt,
	'NC': None,

	# Convert T/A calls to A/T
	'AB': ''.join(sorted(ref + alt)),

	# Store this away for convenience
	'_snp_string': snp_string,
	}


	def update_sample_genotype(sg, md):
	for key, value in sg.items():
	sg[key] = md[key][value]


	if __name__ == '__main__':
	main(gpl_accession)