Skip to content

Instantly share code, notes, and snippets.

@hanfeisun
Last active December 28, 2015 02:29
Show Gist options
  • Save hanfeisun/7428437 to your computer and use it in GitHub Desktop.
Save hanfeisun/7428437 to your computer and use it in GitHub Desktop.
def update_one_sample(gsmid, parse_fields=('other_ids', 'paper', 'name', 'species', 'description', 'antibody',
                                           'factor', 'cell type', 'cell line', 'tissue', 'strain')):
    """Create or refresh the sample identified by ``gsmid`` and save it.

    The sample is fetched with ``get_or_create(unique_id=gsmid)``, so an
    existing sample with the same id is updated in place rather than
    duplicated.  Only the metadata groups named in ``parse_fields`` are
    recomputed; the collection date and the FASTQ URL are always refreshed.

    NOTE: this function saves the sample.

    Args:
        gsmid: sample id, stored as the sample's ``unique_id``.
        parse_fields: names of the metadata groups to fill in.  Recognised
            names are 'other_ids', 'paper', 'name', 'species',
            'description', 'antibody', 'factor', 'cell type', 'cell line',
            'tissue', 'strain', 'disease' and 'cell pop' (the last two are
            not part of the default).

    Returns:
        The saved sample.
    """
    sraId = gsmToSra(gsmid)
    sraXML = sra.getSraXML(sraId) if sraId else None
    geoPost = postProcessGeo(gsmid)
    gseId = gsmToGse(gsmid)
    pmid = gseToPubmed(gseId) if gseId else None

    s, created = models.Samples.objects.get_or_create(unique_id=gsmid)
    # NOTE(review): the source lost its indentation; it is assumed that only
    # the status is conditional on creation and that the date and the URL are
    # refreshed on every call -- confirm.
    if created:
        s.status = "new"
    s.date_collected = datetime.datetime.now()
    s.fastq_file_url = sra.getSRA_downloadLink(sraXML) if sraXML else None

    if 'other_ids' in parse_fields:
        idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
        s.other_ids = json.dumps(idList)

    if 'paper' in parse_fields and pmid:
        s.paper = pubmed.getOrCreatePaper(pmid)

    if 'name' in parse_fields:
        s.name = getFromPost(geoPost, "title")

    if 'species' in parse_fields:
        # Every organism other than "HOMO SAPIENS" is mapped to the species
        # with primary key 2.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            s.species = models.Species.objects.get(pk=1)
        else:
            s.species = models.Species.objects.get(pk=2)

    #HERE is where I need to create a classifier app/module
    #FACTOR, platform, species--HERE are the rest of them!
    description_dict = parseGeoInfo(gsmid)

    if 'description' in parse_fields:
        s.description = json.dumps(description_dict)
        # Single-argument print call: same output on Python 2 and Python 3.
        print(s.description)

    if 'antibody' in parse_fields:
        s.antibody = parseAntibody(description_dict)

    if 'factor' in parse_fields:
        s.factor = parseFactor(description_dict)

    if 'cell type' in parse_fields:
        s.cell_type = parseCellType(description_dict)

    if 'tissue' in parse_fields:
        s.tissue_type = parseTissue(description_dict)

    if 'cell line' in parse_fields:
        s.cell_line = parseCellLine(description_dict)
        # Sometimes cell line name is the `source name` field, especially
        # when the content in `source name` is short.
        # NOTE(review): assumed to belong to the 'cell line' branch -- confirm.
        if not s.tissue_type and not s.cell_line:
            s.cell_line = parseCellLineBySourceName(description_dict)

    if 'strain' in parse_fields:
        s.strain = parseStrain(description_dict)

    if 'disease' in parse_fields:
        s.disease = parseDisease(description_dict)

    if 'cell pop' in parse_fields:
        s.cell_pop = parseCellPop(description_dict)

    s.save()
    return s
def _general_parser(description_dict, description_key, model_selector, max_create_length=100, new=False):
if not description_dict.get(description_key, None):
return None
if len(description_dict.get(description_key, "")) > 0:
result = sorted(model_selector(models).objects.extra(where={"%s like CONCAT('%%', `name`, '%%')"},
params=[description_dict[description_key]]),
key=lambda o: len(o.name),
reverse=True)
if result and len(result[0].name.strip())>0:
return result[0]
if new and len(description_dict[description_key]) <= max_create_length:
ret, created = model_selector(models).objects.get_or_create(name=description_dict[description_key])
if created:
ret.status = 'new'
return ret
return None
def _general_list_parser(description_dict, description_keys, model_selector, max_create_length=100, new=False):
    """Try `_general_parser` on each key in order and return the first hit.

    The keys are evaluated lazily, so keys after the first successful one are
    never parsed.  Returns None when no key yields a record.
    """
    attempts = (
        _general_parser(description_dict, key, model_selector, max_create_length, new)
        for key in description_keys
    )
    return next((record for record in attempts if record), None)
def parseCellLineBySourceName(description_dict):
    """Look for an existing cell line whose name occurs in the 'source name' entry.

    Never creates a record (``new=False``).
    """
    def select_model(m):
        return m.CellLines

    # NOTE(review): with new=False, `_general_parser` never consults
    # max_create_length, so the 15 below has no effect -- confirm the intent.
    return _general_parser(description_dict, 'source name', select_model, max_create_length=15, new=False)
def parseCellType(description_dict):
    """Resolve the cell type from the 'cell type' or 'cell lineage' entry.

    The keys are tried in that order; a record may be created when no
    existing name matches (``new=True``).
    """
    def select_model(m):
        return m.CellTypes

    candidate_keys = ['cell type', 'cell lineage']
    return _general_list_parser(description_dict, candidate_keys, select_model, new=True)
def parseCellLine(description_dict):
    """Resolve the cell line from the 'cell line' or 'cell' entry.

    The keys are tried in that order; a record may be created when no
    existing name matches (``new=True``).
    """
    def select_model(m):
        return m.CellLines

    candidate_keys = ['cell line', 'cell']
    return _general_list_parser(description_dict, candidate_keys, select_model, new=True)
def parseTissue(description_dict):
    """Resolve the tissue type from the 'tissue', 'tissue type' or 'tissue depot' entry.

    The keys are tried in that order; a record may be created when no
    existing name matches (``new=True``).
    """
    def select_model(m):
        return m.TissueTypes

    candidate_keys = ['tissue', 'tissue type', 'tissue depot']
    return _general_list_parser(description_dict, candidate_keys, select_model, new=True)
def parseStrain(description_dict):
    """Resolve the strain from the 'strain' or 'strain background' entry.

    The keys are tried in that order; a record may be created when no
    existing name matches (``new=True``).
    """
    def select_model(m):
        return m.Strains

    candidate_keys = ['strain', 'strain background']
    return _general_list_parser(description_dict, candidate_keys, select_model, new=True)
def parseDisease(description_dict):
    """Resolve the disease state from the 'disease', 'tumor stage' or 'cell karotype' entry.

    The keys are tried in that order; a record may be created when no
    existing name matches (``new=True``).
    """
    def select_model(m):
        return m.DiseaseStates

    # NOTE(review): 'cell karotype' is kept exactly as written; it may be a
    # misspelling of 'cell karyotype' -- check against the real keys.
    candidate_keys = ['disease', 'tumor stage', 'cell karotype']
    return _general_list_parser(description_dict, candidate_keys, select_model, new=True)
def parseCellPop(description_dict):
    """Look for an existing cell population whose name occurs in the 'source name' entry.

    Never creates a record (``new=False``).
    """
    def select_model(m):
        return m.CellPops

    candidate_keys = ['source name']
    return _general_list_parser(description_dict, candidate_keys, select_model, new=False)
def parseFactor(description_dict):
    """Guess the factor of a sample from its parsed description.

    A fixed list of fields is inspected in priority order, followed by any
    other key that contains "antibody".  For each non-empty field:

    1. the text is upper-cased and "ANTI-", "ANTI" and "_" are blanked out;
    2. text containing "N/A" yields the "Input" factor;
    3. for every field except "title", text shorter than 10 characters is
       taken as the factor name itself (fetched or created);
    4. otherwise the text is split into tokens, and the first token that
       starts with "POL2" or matches a factor name or alias
       (case-insensitively) decides the result;
    5. finally an "INPUT" token yields "Input", and a "POLYMERASE" or "POL"
       token yields "POL2".

    Args:
        description_dict: mapping of description keys to text values.

    Returns:
        A factor record, or None when no field yields one.
    """
    # TODO: use description dict to parse, instead of using geoPost
    standard_fields = ["chip antibody", "antibody", "chip", "antibody source", "antibody antibodydescription",
                       "antibody targetdescription", "factor", "title"]
    non_standard_fields = [key for key in description_dict
                           if "antibody" in key and key not in standard_fields]

    for field in standard_fields + non_standard_fields:
        # A key may be present with a None value; treat it like a missing one.
        text = (description_dict.get(field) or "").strip()
        # skip the null field
        if not text:
            continue

        # make all characters upper case, then blank out `ANTI` and `_`
        text = text.upper().replace("ANTI-", " ").replace("ANTI", " ").replace("_", " ").strip()

        # `N/A` often co-occurs with `Input`
        if "N/A" in text:
            return models.Factors.objects.get_or_create(name="Input")[0]

        # If the field has a very short description and it is not `TITLE`,
        # the description is usually the factor name.
        if field != "title" and text and len(text) < 10:
            ret, created = models.Factors.objects.get_or_create(name=text)
            if created:
                # get_or_create has already saved the row, so the status has
                # to be saved explicitly or the flag is lost.
                ret.status = 'new'
                ret.save()
            return ret

        # split the description into tokens
        tokens = re.findall(r"[\w-]+", text)
        for token in tokens:
            # purely numeric tokens cannot be a factor name
            if re.match(r"^[\d-]+$", token):
                continue
            # `d` is defined elsewhere in the file; presumably a
            # spell-checking dictionary -- confirm.
            if len(token) <= 2 and d.check(token):
                continue
            # POL2 factor usually starts with `POL2`
            if token.startswith("POL2") and len(token) < 10:
                return models.Factors.objects.get_or_create(name="POL2")[0]
            # If a token is neither a number nor a vocabulary in dictionary,
            # it may be the factor name.  Fetch at most one row so that
            # several case-insensitive matches cannot raise
            # MultipleObjectsReturned, and so each lookup is a single query.
            factors = list(models.Factors.objects.filter(name__iexact=token)[:1])
            if factors:
                return factors[0]
            aliases = list(models.Aliases.objects.filter(name__iexact=token)[:1])
            if aliases:
                alias = aliases[0]
                print("Con!! Find a factor by its alias")
                print(alias.name)
                return alias.factor

        # special cases for `Input` and `POL2`
        if "INPUT" in tokens:
            return models.Factors.objects.get_or_create(name="Input")[0]
        if "POLYMERASE" in tokens or "POL" in tokens:
            return models.Factors.objects.get_or_create(name="POL2")[0]

    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment