-
-
Save CholoTook/2eaed8009e48e65bc1b1b65111320a59 to your computer and use it in GitHub Desktop.
My code for using GEOparse for comment...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import GEOparse | |
import storage_thingy | |
# The Illumina CanineHD BeadChip in GEO | |
gpl_accession = "GPL17481" | |
# Not sure where (or how really) to do this | |
st = storage_thingy.StorageThingy(basedir="./Data/DB/") | |
def main(gpl_accession): | |
print(f"Working on GPL: {gpl_accession}.") | |
gpl = GEOparse.get_GEO(gpl_accession, | |
destdir='./Data/GEO/') | |
print(f"Title: {gpl.metadata['title'][0]}.") | |
print(f"Rows : {gpl.metadata['data_row_count'][0]}.") | |
marker_dict = gpl_to_marker_dict(gpl.table) | |
# Not sure why, but a GPL doesn't have explicit GSMs, rather they are | |
# linked via metadata... | |
gsm_accessions = gpl.metadata['sample_id'] | |
print(f"Found {len(gsm_accessions)} GSMs.") | |
for gsm_accession in gsm_accessions: | |
print(f"Working on GSM: {gsm_accession}.") | |
gsm = GEOparse.get_GEO(gsm_accession, | |
destdir='./Data/GEO/Samples/') | |
sample = get_sample(gsm) | |
sample_genotypes = get_sample_genotypes(gsm) | |
# An all round nasty conversion step... | |
update_sample_genotype(sample_genotypes, marker_dict) | |
st.store_sample(sample) | |
st.store_sample_genotypes(sample, sample_genotypes) | |
# TODO: Sample class? | |
def get_sample(gsm): | |
sample_id = gsm.get_accession() | |
# If there is more than one series_id, we want the 'SuperSeries' | |
# rather than the 'SubSeries' accession. Hopefully sorting works? | |
# There may be a better way to do this! | |
series_id = sorted( | |
gsm.metadata['series_id'])[-1] | |
characteristics = get_characteristics_as_dict( | |
gsm.metadata['characteristics_ch1']) | |
return [sample_id, series_id, characteristics['breed']] | |
def get_characteristics_as_dict(sample_characteristics): | |
"""By default, gsm.metadata gives us a list, even if there is only one | |
element for a given field. i.e. sample_characteristics should | |
be a list already. | |
""" | |
# There are some funky 'characteristics' that we don't need and | |
# we don't want to worry about parsing them... | |
sc_filtered = ( | |
sc for sc in sample_characteristics if not sc.startswith('status (') | |
) | |
scs = dict(field.split(": ") for field in sc_filtered) | |
# MAGIC: The only samples with missing breeds that I've found | |
# also have the 'sub-type' characteristic that can stand in for | |
# breed. | |
if 'breed' not in scs: | |
scs['breed'] = scs['sub-type'] | |
return scs | |
# TODO: SampleGenotype class? | |
def get_sample_genotypes(gsm): | |
# Convert a Pandas data frame into a dict | |
snps_dict = {gsm.table.at[i, 'ID_REF']: gsm.table.at[i, 'VALUE'] | |
for i in gsm.table.index} | |
return snps_dict | |
def gpl_to_marker_dict(marker_table): | |
"""Used to convert marker value into a SNP genotpe. | |
Marker values are basically AA, BB, AB and NC. SNP genotypes | |
are the actual bases, e.g. GG, TT, GT and None. | |
""" | |
# Convert a Pandas data frame into a dict | |
marker_dict = {marker_table.at[i, 'ID']: marker_table.at[i, 'SNP'] | |
for i in marker_table.index} | |
# Convert the marker values to SNP genotypes | |
for key, value in marker_dict.items(): | |
marker_dict[key] = snp_string_to_dict(value) | |
return marker_dict | |
def snp_string_to_dict(snp_string): | |
ref = snp_string[1] | |
alt = snp_string[3] | |
return { | |
'AA': ref + ref, | |
'BB': alt + alt, | |
'NC': None, | |
# Convert T/A calls to A/T | |
'AB': ''.join(sorted(ref + alt)), | |
# Store this away for convenience | |
'_snp_string': snp_string, | |
} | |
def update_sample_genotype(sg, md): | |
for key, value in sg.items(): | |
sg[key] = md[key][value] | |
if __name__ == '__main__': | |
main(gpl_accession) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment