hanfeisun/parser.py

## parser.py
def update_one_sample(gsmid, parse_fields=['other_ids', 'paper', 'name', 'species', 'description', 'antibody', 'factor',
                                           'cell type', 'cell line', 'tissue', 'strain']):
    """Fetch the metadata for ``gsmid``, fill in the matching sample and save it.

    The sample row is looked up by ``unique_id`` and created when it does not
    exist yet (a newly created row gets ``status = "new"``).  ``date_collected``
    and ``fastq_file_url`` are refreshed on every call; every other attribute
    is only overwritten when its group name is listed in ``parse_fields``.

    gsmid        -- sample accession, used as the sample's ``unique_id``
    parse_fields -- names of the metadata groups to (re)parse.  Besides the
                    defaults, 'disease' and 'cell pop' are recognised.  The
                    argument is only read, never mutated.

    NOTE: the sample is saved before returning!!

    Returns the created or updated sample.
    """

    sraId = gsmToSra(gsmid)
    sraXML = sra.getSraXML(sraId) if sraId else None
    geoPost = postProcessGeo(gsmid)
    gseId = gsmToGse(gsmid)
    pmid = gseToPubmed(gseId) if gseId else None

    s, created = models.Samples.objects.get_or_create(unique_id=gsmid)
    if created:
        s.status = "new"

    # These two attributes are refreshed regardless of parse_fields.
    s.date_collected = datetime.datetime.now()
    s.fastq_file_url = sra.getSRA_downloadLink(sraXML) if sraXML else None

    if 'other_ids' in parse_fields:
        idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
        s.other_ids = json.dumps(idList)

    if 'paper' in parse_fields and pmid:
        s.paper = pubmed.getOrCreatePaper(pmid)

    if 'name' in parse_fields:
        s.name = getFromPost(geoPost, "title")

    if 'species' in parse_fields:
        # NOTE(review): the primary keys 1 and 2 are hard-coded; presumably 1 is
        # human and 2 is the fallback species -- confirm against the Species table.
        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
            s.species = models.Species.objects.get(pk=1)
        else:
            s.species = models.Species.objects.get(pk=2)

    #HERE is where I need to create a classifier app/module
    #FACTOR, platform, species--HERE are the rest of them!

    # The description dict is fetched even when 'description' itself is not
    # requested, because all the parsers below read from it.
    description_dict = parseGeoInfo(gsmid)
    if 'description' in parse_fields:
        s.description = json.dumps(description_dict)
        # Single-argument parenthesised form prints the same text whether
        # `print` is a statement or a function.
        print(s.description)

    if 'antibody' in parse_fields:
        s.antibody = parseAntibody(description_dict)

    if 'factor' in parse_fields:
        s.factor = parseFactor(description_dict)

    if 'cell type' in parse_fields:
        s.cell_type = parseCellType(description_dict)

    if 'tissue' in parse_fields:
        s.tissue_type = parseTissue(description_dict)

    if 'cell line' in parse_fields:
        s.cell_line = parseCellLine(description_dict)

        # Sometimes cell line name is the `source name` field, especially when the content in `source name` is short.
        # This must run after the tissue step above because it reads s.tissue_type.
        if not s.tissue_type and not s.cell_line:
            s.cell_line = parseCellLineBySourceName(description_dict)

    if 'strain' in parse_fields:
        s.strain = parseStrain(description_dict)

    if 'disease' in parse_fields:
        s.disease = parseDisease(description_dict)

    if 'cell pop' in parse_fields:
        s.cell_pop = parseCellPop(description_dict)

    s.save()
    return s

def _general_parser(description_dict, description_key, model_selector, max_create_length=100, new=False):
    if not description_dict.get(description_key, None):
        return None


    if len(description_dict.get(description_key, "")) > 0:
        result = sorted(model_selector(models).objects.extra(where={"%s like CONCAT('%%', `name`, '%%')"},
                                                             params=[description_dict[description_key]]),
                        key=lambda o: len(o.name),
                        reverse=True)

        if result and len(result[0].name.strip())>0:
            return result[0]

    if new and len(description_dict[description_key]) <= max_create_length:
        ret, created = model_selector(models).objects.get_or_create(name=description_dict[description_key])
        if created:
            ret.status = 'new'
        return ret

    return None


def _general_list_parser(description_dict, description_keys, model_selector, max_create_length=100, new=False):
    for dk in description_keys:
        gp = _general_parser(description_dict, dk, model_selector, max_create_length, new)
        if gp:
            return gp
    return None


def parseCellLineBySourceName(description_dict):
    """Look for a known cell line inside the 'source name' field.

    Calls ``_general_parser`` with ``new=False`` and a ``max_create_length`` of 15.
    """
    return _general_parser(description_dict, 'source name', lambda mod: mod.CellLines,
                           max_create_length=15, new=False)


def parseCellType(description_dict):
    """Resolve the cell type from the 'cell type' field, then 'cell lineage'.

    Delegates to ``_general_list_parser`` with ``new=True``.
    """
    candidate_keys = ['cell type', 'cell lineage']
    return _general_list_parser(description_dict, candidate_keys, lambda mod: mod.CellTypes, new=True)


def parseCellLine(description_dict):
    """Resolve the cell line from the 'cell line' field, then 'cell'.

    Delegates to ``_general_list_parser`` with ``new=True``.
    """
    candidate_keys = ['cell line', 'cell']
    return _general_list_parser(description_dict, candidate_keys, lambda mod: mod.CellLines, new=True)


def parseTissue(description_dict):
    """Resolve the tissue type from 'tissue', 'tissue type' or 'tissue depot' (in that order).

    Delegates to ``_general_list_parser`` with ``new=True``.
    """
    candidate_keys = ['tissue', 'tissue type', 'tissue depot']
    return _general_list_parser(description_dict, candidate_keys, lambda mod: mod.TissueTypes, new=True)


def parseStrain(description_dict):
    """Resolve the strain from the 'strain' field, then 'strain background'.

    Delegates to ``_general_list_parser`` with ``new=True``.
    """
    candidate_keys = ['strain', 'strain background']
    return _general_list_parser(description_dict, candidate_keys, lambda mod: mod.Strains, new=True)


def parseDisease(description_dict):
    """Resolve the disease state from 'disease', 'tumor stage' or 'cell karotype' (in that order).

    Delegates to ``_general_list_parser`` with ``new=True``.
    """
    # NOTE(review): 'cell karotype' may be a misspelling of 'karyotype'; kept
    # as-is because it has to match the keys produced upstream -- confirm.
    candidate_keys = ['disease', 'tumor stage', 'cell karotype']
    return _general_list_parser(description_dict, candidate_keys, lambda mod: mod.DiseaseStates, new=True)


def parseCellPop(description_dict):
    """Look for a known cell population inside the 'source name' field.

    Delegates to ``_general_list_parser`` with ``new=False``.
    """
    candidate_keys = ['source name']
    return _general_list_parser(description_dict, candidate_keys, lambda mod: mod.CellPops, new=False)


def parseFactor(description_dict):
    """Guess the factor of a sample from its antibody-related description fields.

    The fields are examined in a fixed priority order (the standard ones
    first, then any other key containing "antibody").  For each non-empty
    field the text is upper-cased and cleaned of "ANTI-", "ANTI" and "_", then:

    1. text containing "N/A" yields the "Input" factor;
    2. for any field except "title", cleaned text shorter than 10 characters
       is taken as the factor name itself (row created if needed);
    3. otherwise the text is split into tokens and each token is checked
       against the POL2 prefix, existing factor names and aliases;
    4. finally the tokens "INPUT" and "POLYMERASE"/"POL" select the "Input"
       and "POL2" factors.

    Returns a factor row, or None when no field gives a hit.
    """
    # TODO: use description dict to parse, instead of using geoPost

    standard_fields = ["chip antibody", "antibody", "chip", "antibody source", "antibody antibodydescription",
                       "antibody targetdescription", "factor", "title"]

    non_standard_fields = [i for i in description_dict.keys() if "antibody" in i and i not in standard_fields]
    #1. try to get the values

    for t in standard_fields + non_standard_fields:
        # `or ""` also covers keys that are present with a None value
        tmp = (description_dict.get(t) or "").strip()
        # skip the null field
        if not tmp:
            continue

        # make all character upper case, then delete strings like `ANTI` and `_`
        tmp = tmp.upper().replace("ANTI-", " ").replace("ANTI", " ").replace("_", " ").strip()

        # `N/A` often concurs with `Input`
        if "N/A" in tmp:
            return models.Factors.objects.get_or_create(name="Input")[0]

        # If the field has very short description and it is not `TITLE`, the description is usually the factor name.
        # (tmp can have become empty after the clean-up above, hence the truth test.)
        if t != "title" and tmp and len(tmp) < 10:
            ret, created = models.Factors.objects.get_or_create(name=tmp)
            if created:
                ret.status = 'new'
                # get_or_create() has already inserted the row; save again so
                # the status assigned above is actually stored.
                ret.save()
            return ret

        # split the description into tokens
        splited = re.findall(r"[\w-]+", tmp)
        for s in splited:

            # tokens made only of digits and dashes carry no factor name
            if re.match(r"^[\d-]+$", s):
                continue

            # very short tokens accepted by `d.check` are skipped
            # (presumably `d` is a spell-check dictionary -- confirm)
            if len(s) <= 2 and d.check(s):
                continue

            # POL2 factor usually starts with `POL2`
            if s.startswith("POL2") and len(s) < 10:
                return models.Factors.objects.get_or_create(name="POL2")[0]

            # If a token is neither a number nor a vocabulary in dictionary, it may be the factor name.
            # Taking the first match costs one query and does not raise when
            # several rows differ only by case.
            factors = models.Factors.objects.filter(name__iexact=s)
            if factors:
                return factors[0]

            aliases = models.Aliases.objects.filter(name__iexact=s)
            if aliases:
                alias = aliases[0]
                print("Con!! Find a factor by its alias")
                print(alias.name)
                return alias.factor

        # special cases for `Input` and `POL2`
        if "INPUT" in splited:
            return models.Factors.objects.get_or_create(name="Input")[0]
        if ("POLYMERASE" in splited) or ("POL" in splited):
            return models.Factors.objects.get_or_create(name="POL2")[0]

    return None
	def update_one_sample(gsmid, parse_fields=['other_ids', 'paper', 'name', 'species', 'description', 'antibody', 'factor',
	                                           'cell type', 'cell line', 'tissue', 'strain']):
	    """Fetch the metadata for ``gsmid``, fill in the matching sample and save it.

	    The sample row is looked up by ``unique_id`` and created when it does not
	    exist yet (a newly created row gets ``status = "new"``).  ``date_collected``
	    and ``fastq_file_url`` are refreshed on every call; every other attribute
	    is only overwritten when its group name is listed in ``parse_fields``.

	    gsmid        -- sample accession, used as the sample's ``unique_id``
	    parse_fields -- names of the metadata groups to (re)parse.  Besides the
	                    defaults, 'disease' and 'cell pop' are recognised.  The
	                    argument is only read, never mutated.

	    NOTE: the sample is saved before returning!!

	    Returns the created or updated sample.
	    """

	    sraId = gsmToSra(gsmid)
	    sraXML = sra.getSraXML(sraId) if sraId else None
	    geoPost = postProcessGeo(gsmid)
	    gseId = gsmToGse(gsmid)
	    pmid = gseToPubmed(gseId) if gseId else None

	    s, created = models.Samples.objects.get_or_create(unique_id=gsmid)
	    if created:
	        s.status = "new"

	    # These two attributes are refreshed regardless of parse_fields.
	    s.date_collected = datetime.datetime.now()
	    s.fastq_file_url = sra.getSRA_downloadLink(sraXML) if sraXML else None

	    if 'other_ids' in parse_fields:
	        idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
	        s.other_ids = json.dumps(idList)

	    if 'paper' in parse_fields and pmid:
	        s.paper = pubmed.getOrCreatePaper(pmid)

	    if 'name' in parse_fields:
	        s.name = getFromPost(geoPost, "title")

	    if 'species' in parse_fields:
	        # NOTE(review): the primary keys 1 and 2 are hard-coded; presumably 1 is
	        # human and 2 is the fallback species -- confirm against the Species table.
	        if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
	            s.species = models.Species.objects.get(pk=1)
	        else:
	            s.species = models.Species.objects.get(pk=2)

	    #HERE is where I need to create a classifier app/module
	    #FACTOR, platform, species--HERE are the rest of them!

	    # The description dict is fetched even when 'description' itself is not
	    # requested, because all the parsers below read from it.
	    description_dict = parseGeoInfo(gsmid)
	    if 'description' in parse_fields:
	        s.description = json.dumps(description_dict)
	        # Single-argument parenthesised form prints the same text whether
	        # `print` is a statement or a function.
	        print(s.description)

	    if 'antibody' in parse_fields:
	        s.antibody = parseAntibody(description_dict)

	    if 'factor' in parse_fields:
	        s.factor = parseFactor(description_dict)

	    if 'cell type' in parse_fields:
	        s.cell_type = parseCellType(description_dict)

	    if 'tissue' in parse_fields:
	        s.tissue_type = parseTissue(description_dict)

	    if 'cell line' in parse_fields:
	        s.cell_line = parseCellLine(description_dict)

	        # Sometimes cell line name is the `source name` field, especially when the content in `source name` is short.
	        # This must run after the tissue step above because it reads s.tissue_type.
	        if not s.tissue_type and not s.cell_line:
	            s.cell_line = parseCellLineBySourceName(description_dict)

	    if 'strain' in parse_fields:
	        s.strain = parseStrain(description_dict)

	    if 'disease' in parse_fields:
	        s.disease = parseDisease(description_dict)

	    if 'cell pop' in parse_fields:
	        s.cell_pop = parseCellPop(description_dict)

	    s.save()
	    return s

	def _general_parser(description_dict, description_key, model_selector, max_create_length=100, new=False):
	if not description_dict.get(description_key, None):
	return None


	if len(description_dict.get(description_key, "")) > 0:
	result = sorted(model_selector(models).objects.extra(where={"%s like CONCAT('%%', `name`, '%%')"},
	params=[description_dict[description_key]]),
	key=lambda o: len(o.name),
	reverse=True)

	if result and len(result[0].name.strip())>0:
	return result[0]

	if new and len(description_dict[description_key]) <= max_create_length:
	ret, created = model_selector(models).objects.get_or_create(name=description_dict[description_key])
	if created:
	ret.status = 'new'
	return ret

	return None


	def _general_list_parser(description_dict, description_keys, model_selector, max_create_length=100, new=False):
	for dk in description_keys:
	gp = _general_parser(description_dict, dk, model_selector, max_create_length, new)
	if gp:
	return gp
	return None


	def parseCellLineBySourceName(description_dict):
	    """Look for a known cell line inside the 'source name' field.

	    Calls ``_general_parser`` with ``new=False`` and a ``max_create_length`` of 15.
	    """
	    return _general_parser(description_dict, 'source name', lambda m: m.CellLines, 15, new=False)


	def parseCellType(description_dict):
	    """Resolve the cell type from the 'cell type' field, then 'cell lineage'.

	    Delegates to ``_general_list_parser`` with ``new=True``.
	    """
	    return _general_list_parser(description_dict, ['cell type', 'cell lineage'], lambda m: m.CellTypes, new=True)


	def parseCellLine(description_dict):
	    """Resolve the cell line from the 'cell line' field, then 'cell'.

	    Delegates to ``_general_list_parser`` with ``new=True``.
	    """
	    return _general_list_parser(description_dict, ['cell line', 'cell'], lambda m: m.CellLines, new=True)




	def parseTissue(description_dict):
	    """Resolve the tissue type from 'tissue', 'tissue type' or 'tissue depot' (in that order).

	    Delegates to ``_general_list_parser`` with ``new=True``.
	    """
	    return _general_list_parser(description_dict, ['tissue', 'tissue type', 'tissue depot'], lambda m: m.TissueTypes,
	                                new=True)


	def parseStrain(description_dict):
	    """Resolve the strain from the 'strain' field, then 'strain background'.

	    Delegates to ``_general_list_parser`` with ``new=True``.
	    """
	    return _general_list_parser(description_dict, ['strain', 'strain background'], lambda m: m.Strains, new=True)


	def parseDisease(description_dict):
	    """Resolve the disease state from 'disease', 'tumor stage' or 'cell karotype' (in that order).

	    Delegates to ``_general_list_parser`` with ``new=True``.
	    """
	    # NOTE(review): 'cell karotype' may be a misspelling of 'karyotype'; kept
	    # as-is because it has to match the keys produced upstream -- confirm.
	    return _general_list_parser(description_dict, ['disease', 'tumor stage', 'cell karotype'],
	                                lambda m: m.DiseaseStates, new=True)


	def parseCellPop(description_dict):
	    """Look for a known cell population inside the 'source name' field.

	    Delegates to ``_general_list_parser`` with ``new=False``.
	    """
	    return _general_list_parser(description_dict, ['source name'], lambda m: m.CellPops, new=False)


	def parseFactor(description_dict):
	    """Guess the factor of a sample from its antibody-related description fields.

	    The fields are examined in a fixed priority order (the standard ones
	    first, then any other key containing "antibody").  For each non-empty
	    field the text is upper-cased and cleaned of "ANTI-", "ANTI" and "_", then:

	    1. text containing "N/A" yields the "Input" factor;
	    2. for any field except "title", cleaned text shorter than 10 characters
	       is taken as the factor name itself (row created if needed);
	    3. otherwise the text is split into tokens and each token is checked
	       against the POL2 prefix, existing factor names and aliases;
	    4. finally the tokens "INPUT" and "POLYMERASE"/"POL" select the "Input"
	       and "POL2" factors.

	    Returns a factor row, or None when no field gives a hit.
	    """
	    # TODO: use description dict to parse, instead of using geoPost

	    standard_fields = ["chip antibody", "antibody", "chip", "antibody source", "antibody antibodydescription",
	                       "antibody targetdescription", "factor", "title"]

	    non_standard_fields = [i for i in description_dict.keys() if "antibody" in i and i not in standard_fields]
	    #1. try to get the values

	    for t in standard_fields + non_standard_fields:
	        # `or ""` also covers keys that are present with a None value
	        tmp = (description_dict.get(t) or "").strip()
	        # skip the null field
	        if not tmp:
	            continue

	        # make all character upper case, then delete strings like `ANTI` and `_`
	        tmp = tmp.upper().replace("ANTI-", " ").replace("ANTI", " ").replace("_", " ").strip()

	        # `N/A` often concurs with `Input`
	        if "N/A" in tmp:
	            return models.Factors.objects.get_or_create(name="Input")[0]

	        # If the field has very short description and it is not `TITLE`, the description is usually the factor name.
	        # (tmp can have become empty after the clean-up above, hence the truth test.)
	        if t != "title" and tmp and len(tmp) < 10:
	            ret, created = models.Factors.objects.get_or_create(name=tmp)
	            if created:
	                ret.status = 'new'
	                # get_or_create() has already inserted the row; save again so
	                # the status assigned above is actually stored.
	                ret.save()
	            return ret

	        # split the description into tokens
	        splited = re.findall(r"[\w-]+", tmp)
	        for s in splited:

	            # tokens made only of digits and dashes carry no factor name
	            if re.match(r"^[\d-]+$", s):
	                continue

	            # very short tokens accepted by `d.check` are skipped
	            # (presumably `d` is a spell-check dictionary -- confirm)
	            if len(s) <= 2 and d.check(s):
	                continue

	            # POL2 factor usually starts with `POL2`
	            if s.startswith("POL2") and len(s) < 10:
	                return models.Factors.objects.get_or_create(name="POL2")[0]

	            # If a token is neither a number nor a vocabulary in dictionary, it may be the factor name.
	            # Taking the first match costs one query and does not raise when
	            # several rows differ only by case.
	            factors = models.Factors.objects.filter(name__iexact=s)
	            if factors:
	                return factors[0]

	            aliases = models.Aliases.objects.filter(name__iexact=s)
	            if aliases:
	                alias = aliases[0]
	                print("Con!! Find a factor by its alias")
	                print(alias.name)
	                return alias.factor

	        # special cases for `Input` and `POL2`
	        if "INPUT" in splited:
	            return models.Factors.objects.get_or_create(name="Input")[0]
	        if ("POLYMERASE" in splited) or ("POL" in splited):
	            return models.Factors.objects.get_or_create(name="POL2")[0]

	    return None