Last active
December 28, 2015 02:29
-
-
Save hanfeisun/7428437 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def update_one_sample(gsmid, parse_fields=('other_ids', 'paper', 'name', 'species', 'description', 'antibody',
                                           'factor', 'cell type', 'cell line', 'tissue', 'strain')):
    """Given a gsmid, get or create the matching sample and auto-fill its meta fields.

    The sample is looked up by ``unique_id=gsmid`` and created when it does
    not exist yet, so an existing sample with the same gsmid is updated in
    place.

    Args:
        gsmid: identifier handed to the GEO/SRA helper functions and used as
            the sample's ``unique_id``.
        parse_fields: names of the meta fields to fill. Only membership
            tests are performed on it, so any container works. Besides the
            defaults, 'disease' and 'cell pop' are also recognised.

    Returns:
        The sample, after it has been saved.

    NOTE: will try to save the sample!!
    """
    sraId = gsmToSra(gsmid)
    sraXML = sra.getSraXML(sraId) if sraId else None
    geoPost = postProcessGeo(gsmid)
    gseId = gsmToGse(gsmid)
    pmid = gseToPubmed(gseId) if gseId else None

    s, created = models.Samples.objects.get_or_create(unique_id=gsmid)
    # NOTE(review): the nesting below was reconstructed; it is assumed that
    # only status/date_collected are restricted to freshly created samples.
    if created:
        s.status = "new"
        s.date_collected = datetime.datetime.now()
    s.fastq_file_url = sra.getSRA_downloadLink(sraXML) if sraXML else None

    if 'other_ids' in parse_fields:
        idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
        s.other_ids = json.dumps(idList)
    if 'paper' in parse_fields and pmid:
        s.paper = pubmed.getOrCreatePaper(pmid)
    if 'name' in parse_fields:
        s.name = getFromPost(geoPost, "title")
    if 'species' in parse_fields:
        # Species pk 1 is used for "HOMO SAPIENS", pk 2 for everything else.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            s.species = models.Species.objects.get(pk=1)
        else:
            s.species = models.Species.objects.get(pk=2)

    #HERE is where I need to create a classifier app/module
    #FACTOR, platform, species--HERE are the rest of them!
    description_dict = parseGeoInfo(gsmid)
    if 'description' in parse_fields:
        s.description = json.dumps(description_dict)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(s.description)
    if 'antibody' in parse_fields:
        s.antibody = parseAntibody(description_dict)
    if 'factor' in parse_fields:
        s.factor = parseFactor(description_dict)
    if 'cell type' in parse_fields:
        s.cell_type = parseCellType(description_dict)
    if 'tissue' in parse_fields:
        s.tissue_type = parseTissue(description_dict)
    if 'cell line' in parse_fields:
        s.cell_line = parseCellLine(description_dict)
        # Sometimes cell line name is the `source name` field, especially when the content in `source name` is short
        # NOTE(review): this fallback is assumed to belong to the 'cell line'
        # branch so that cell_line is only touched when it was requested.
        if not s.tissue_type and not s.cell_line:
            s.cell_line = parseCellLineBySourceName(description_dict)
    if 'strain' in parse_fields:
        s.strain = parseStrain(description_dict)
    if 'disease' in parse_fields:
        s.disease = parseDisease(description_dict)
    if 'cell pop' in parse_fields:
        s.cell_pop = parseCellPop(description_dict)

    s.save()
    return s
def _general_parser(description_dict, description_key, model_selector, max_create_length=100, new=False): | |
if not description_dict.get(description_key, None): | |
return None | |
if len(description_dict.get(description_key, "")) > 0: | |
result = sorted(model_selector(models).objects.extra(where={"%s like CONCAT('%%', `name`, '%%')"}, | |
params=[description_dict[description_key]]), | |
key=lambda o: len(o.name), | |
reverse=True) | |
if result and len(result[0].name.strip())>0: | |
return result[0] | |
if new and len(description_dict[description_key]) <= max_create_length: | |
ret, created = model_selector(models).objects.get_or_create(name=description_dict[description_key]) | |
if created: | |
ret.status = 'new' | |
return ret | |
return None | |
def _general_list_parser(description_dict, description_keys, model_selector, max_create_length=100, new=False):
    """Run ``_general_parser`` over several keys and return the first hit.

    The keys in ``description_keys`` are tried in order; evaluation stops at
    the first truthy result. ``None`` is returned when no key yields one.
    """
    attempts = (
        _general_parser(description_dict, key, model_selector, max_create_length, new)
        for key in description_keys
    )
    # The generator is lazy, so later keys are not parsed once one matched.
    return next((found for found in attempts if found), None)
def parseCellLineBySourceName(description_dict):
    """Look for a known cell line name inside the 'source name' field.

    Delegates to ``_general_parser`` with ``new=False``, so no cell line is
    ever created here; a length limit of 15 is passed along.
    """
    def cell_lines(m):
        return m.CellLines

    return _general_parser(description_dict, 'source name', cell_lines, 15, new=False)
def parseCellType(description_dict):
    """Resolve the cell type from the 'cell type' or 'cell lineage' field.

    Keys are tried in that order via ``_general_list_parser`` with
    ``new=True``.
    """
    candidate_keys = ['cell type', 'cell lineage']

    def cell_types(m):
        return m.CellTypes

    return _general_list_parser(description_dict, candidate_keys, cell_types, new=True)
def parseCellLine(description_dict):
    """Resolve the cell line from the 'cell line' or 'cell' field.

    Keys are tried in that order via ``_general_list_parser`` with
    ``new=True``.
    """
    candidate_keys = ['cell line', 'cell']

    def cell_lines(m):
        return m.CellLines

    return _general_list_parser(description_dict, candidate_keys, cell_lines, new=True)
def parseTissue(description_dict):
    """Resolve the tissue type from 'tissue', 'tissue type' or 'tissue depot'.

    Keys are tried in that order via ``_general_list_parser`` with
    ``new=True``.
    """
    candidate_keys = ['tissue', 'tissue type', 'tissue depot']

    def tissue_types(m):
        return m.TissueTypes

    return _general_list_parser(description_dict, candidate_keys, tissue_types, new=True)
def parseStrain(description_dict):
    """Resolve the strain from the 'strain' or 'strain background' field.

    Keys are tried in that order via ``_general_list_parser`` with
    ``new=True``.
    """
    candidate_keys = ['strain', 'strain background']

    def strains(m):
        return m.Strains

    return _general_list_parser(description_dict, candidate_keys, strains, new=True)
def parseDisease(description_dict):
    """Resolve the disease state from the disease-related description fields.

    Keys are tried in order via ``_general_list_parser`` with ``new=True``:
    'disease', 'tumor stage', 'cell karyotype', and finally the historical
    misspelling 'cell karotype', which is kept so that existing inputs using
    that key still match.
    """
    candidate_keys = ['disease', 'tumor stage', 'cell karyotype', 'cell karotype']
    return _general_list_parser(description_dict, candidate_keys,
                                lambda m: m.DiseaseStates, new=True)
def parseCellPop(description_dict):
    """Look for a known cell population name inside the 'source name' field.

    Delegates to ``_general_list_parser`` with ``new=False``, so nothing is
    created here.
    """
    candidate_keys = ['source name']

    def cell_pops(m):
        return m.CellPops

    return _general_list_parser(description_dict, candidate_keys, cell_pops, new=False)
def parseFactor(description_dict):
    """Guess the factor of a sample from its antibody-related description fields.

    A fixed list of standard fields is scanned first, followed by any other
    key containing "antibody". For each non-empty field, in order:

    1. the text is upper-cased and stripped of ``ANTI-``/``ANTI``/``_``;
    2. text containing ``N/A`` maps to the "Input" factor;
    3. a short (< 10 characters) non-title field is taken verbatim as the
       factor name, creating the factor if needed;
    4. otherwise the text is tokenised and each token is matched against
       "POL2" prefixes, factor names and alias names (case-insensitively);
    5. finally the tokens are checked for the "INPUT" / "POLYMERASE" / "POL"
       special cases.

    Args:
        description_dict: mapping of description keys to text values; missing
            or ``None`` values are treated as empty.

    Returns:
        The matching factor instance, or ``None`` when no field yields one.
    """
    standard_fields = ["chip antibody", "antibody", "chip", "antibody source", "antibody antibodydescription",
                       "antibody targetdescription", "factor", "title"]
    non_standard_fields = [i for i in description_dict if "antibody" in i and i not in standard_fields]

    #1. try to get the values
    for t in standard_fields + non_standard_fields:
        # `or ""` also covers keys that are present but hold None.
        tmp = (description_dict.get(t) or "").strip()
        # skip the null field
        if not tmp:
            continue

        # make all character upper case, then delete strings like `ANTI` and `_`
        tmp = tmp.upper().replace("ANTI-", " ").replace("ANTI", " ").replace("_", " ").strip()

        # `N/A` often concurs with `Input`
        if "N/A" in tmp:
            return models.Factors.objects.get_or_create(name="Input")[0]

        # If the field has very short description and it is not `TITLE`, the description is usually the factor name.
        if t != "title":
            if len(tmp) < 10 and tmp != "":
                ret, created = models.Factors.objects.get_or_create(name=tmp)
                if created:
                    ret.status = 'new'
                    # Persist the flag; get_or_create saved the row before it was set.
                    ret.save()
                return ret

        # split the description into tokens
        splited = re.findall(r"[\w-]+", tmp)
        for s in splited:
            # purely numeric tokens carry no factor name
            if re.match(r"^[\d-]+$", s):
                continue
            # very short tokens that the dictionary `d` accepts are skipped
            if len(s) <= 2 and d.check(s):
                continue

            # POL2 factor usually starts with `POL2`
            if s.startswith("POL2") and len(s) < 10:
                return models.Factors.objects.get_or_create(name="POL2")[0]

            # If a token is neither a number nor a vocabulary in dictionary, it may be the factor name
            # Fetch at most one row per lookup: one query instead of two, and
            # no MultipleObjectsReturned when names differ only by case.
            factors = list(models.Factors.objects.filter(name__iexact=s)[:1])
            if factors:
                return factors[0]

            aliases = list(models.Aliases.objects.filter(name__iexact=s)[:1])
            if aliases:
                alias = aliases[0]
                print("Con!! Find a factor by its alias")
                print(alias.name)
                return alias.factor

        # special cases for `Input` and `POL2`
        if "INPUT" in splited:
            return models.Factors.objects.get_or_create(name="Input")[0]
        if ("POLYMERASE" in splited) or ("POL" in splited):
            return models.Factors.objects.get_or_create(name="POL2")[0]

    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment