Skip to content

Instantly share code, notes, and snippets.

@CholoTook
Last active May 17, 2021 12:44
Show Gist options
  • Save CholoTook/2eaed8009e48e65bc1b1b65111320a59 to your computer and use it in GitHub Desktop.
Save CholoTook/2eaed8009e48e65bc1b1b65111320a59 to your computer and use it in GitHub Desktop.
My code for using GEOparse for comment...
import GEOparse
import storage_thingy
# The Illumina CanineHD BeadChip in GEO
gpl_accession = "GPL17481"
# Not sure where (or how really) to do this
st = storage_thingy.StorageThingy(basedir="./Data/DB/")
def main(gpl_accession):
print(f"Working on GPL: {gpl_accession}.")
gpl = GEOparse.get_GEO(gpl_accession,
destdir='./Data/GEO/')
print(f"Title: {gpl.metadata['title'][0]}.")
print(f"Rows : {gpl.metadata['data_row_count'][0]}.")
marker_dict = gpl_to_marker_dict(gpl.table)
# Not sure why, but a GPL doesn't have explicit GSMs, rather they are
# linked via metadata...
gsm_accessions = gpl.metadata['sample_id']
print(f"Found {len(gsm_accessions)} GSMs.")
for gsm_accession in gsm_accessions:
print(f"Working on GSM: {gsm_accession}.")
gsm = GEOparse.get_GEO(gsm_accession,
destdir='./Data/GEO/Samples/')
sample = get_sample(gsm)
sample_genotypes = get_sample_genotypes(gsm)
# An all round nasty conversion step...
update_sample_genotype(sample_genotypes, marker_dict)
st.store_sample(sample)
st.store_sample_genotypes(sample, sample_genotypes)
# TODO: Sample class?
def get_sample(gsm):
sample_id = gsm.get_accession()
# If there is more than one series_id, we want the 'SuperSeries'
# rather than the 'SubSeries' accession. Hopefully sorting works?
# There may be a better way to do this!
series_id = sorted(
gsm.metadata['series_id'])[-1]
characteristics = get_characteristics_as_dict(
gsm.metadata['characteristics_ch1'])
return [sample_id, series_id, characteristics['breed']]
def get_characteristics_as_dict(sample_characteristics):
"""By default, gsm.metadata gives us a list, even if there is only one
element for a given field. i.e. sample_characteristics should
be a list already.
"""
# There are some funky 'characteristics' that we don't need and
# we don't want to worry about parsing them...
sc_filtered = (
sc for sc in sample_characteristics if not sc.startswith('status (')
)
scs = dict(field.split(": ") for field in sc_filtered)
# MAGIC: The only samples with missing breeds that I've found
# also have the 'sub-type' characteristic that can stand in for
# breed.
if 'breed' not in scs:
scs['breed'] = scs['sub-type']
return scs
# TODO: SampleGenotype class?
def get_sample_genotypes(gsm):
# Convert a Pandas data frame into a dict
snps_dict = {gsm.table.at[i, 'ID_REF']: gsm.table.at[i, 'VALUE']
for i in gsm.table.index}
return snps_dict
def gpl_to_marker_dict(marker_table):
"""Used to convert marker value into a SNP genotpe.
Marker values are basically AA, BB, AB and NC. SNP genotypes
are the actual bases, e.g. GG, TT, GT and None.
"""
# Convert a Pandas data frame into a dict
marker_dict = {marker_table.at[i, 'ID']: marker_table.at[i, 'SNP']
for i in marker_table.index}
# Convert the marker values to SNP genotypes
for key, value in marker_dict.items():
marker_dict[key] = snp_string_to_dict(value)
return marker_dict
def snp_string_to_dict(snp_string):
ref = snp_string[1]
alt = snp_string[3]
return {
'AA': ref + ref,
'BB': alt + alt,
'NC': None,
# Convert T/A calls to A/T
'AB': ''.join(sorted(ref + alt)),
# Store this away for convenience
'_snp_string': snp_string,
}
def update_sample_genotype(sg, md):
for key, value in sg.items():
sg[key] = md[key][value]
if __name__ == '__main__':
main(gpl_accession)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment