"""Library to interface with the GEO (GDS- GEO DataSets) repository"""
import os
import re
import sys
import urllib
import traceback
from datetime import datetime
import time
import enchant
import cPickle
# from classifiers import *
from django.db.models import Q
try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString
#AUTO-load classifiers
#a trick to get the current module
_modname = globals()['__name__']
_this_mod = sys.modules[_modname]
_ppath = "/".join(_this_mod.__file__.split("/")[:-1])
d = enchant.Dict("en_US")
import json
# #CAN drop this if this is an app!
# DEPLOY_DIR="/home/lentaing/envs/newdc1.4/src"
# sys.path.insert(0, DEPLOY_DIR)
# from django.core.management import setup_environ
# from django.utils.encoding import smart_str
# import settings
# setup_environ(settings)
from django.utils.encoding import smart_str
from datacollection import models
#dynamically load classifiers
#import classifiers
import sra
import pubmed
### HELPER fns
def getFromPost(geoPost, cat):
"""tries to search for cat(VALUE) returns VALUE if found, otherwise ""
NOTE: categories can be in UPPER or lowercase, eg. TITLE or title
"""
m = re.search("%s\((.+)\)" % cat.upper(), geoPost)
if m:
return m.group(1)
else:
return ""
def cleanCategory(s):
"""Given a string, replaces ' ' with '_'
'/', '&', '.', '(', ')'with ''
"""
tmp = s.replace(" ", "_")
for bad in ['/', '&', '.', '(', ')', ',']:
tmp = tmp.replace(bad, "")
return tmp
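# Illustrative results (hypothetical category strings):
#   cleanCategory("cell line")             -> "cell_line"
#   cleanCategory("chip antibody (lot #)") -> "chip_antibody_lot_#"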
def isXML(doc):
"""TEST if it is a valid geo XML record
NOTE: first lines are-
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
"""
f = doc.split("\n")
return f[0].strip() == """<?xml version="1.0" encoding="UTF-8" standalone="no"?>"""
def readGeoXML(path, docString=None):
"""
Input: a file path or a string--default is to use the path
Tries to read in the geo xml record,
**KEY: REMOVES the xmlns line
Returns the xml record text WITHOUT the xmlns line!!!
"""
if docString:
f = docString.split("\n")
else:
f = open(path)
tmp = []
for l in f:
if l.find("xmlns=\"http://www.ncbi.nlm.nih.gov/geo/info/MINiML\"") == -1:
tmp.append(l)
if not docString:
f.close()
return "".join(tmp)
### GDS interface
def getGDSSamples():
"""Will run the predefined query and return a list of GDS ids
NOTE: this returns ALL GDS samples which are of SRA type i.e.
ALL CHIP-SEQ, RNA-SEQ, etc.
"""
#expireDate = now - 90 days in seconds
#ref: http://stackoverflow.com/questions/7430928/python-comparing-date-check-for-old-file
_expireDate = time.time() - 60 * 60 * 24 * 90
ret = []
#TRY: to read a file first -- IF IT IS NOT STALE
path = os.path.join(_ppath, "gdsSamples.txt")
if os.path.exists(path) and not os.path.getctime(path) < _expireDate:
f = open(path)
for l in f:
ret.append(l.strip())
f.close()
else:
#REFRESH!
#REAL URL
URL = """http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=SRA[Sample Type] AND gsm[Entry Type] AND ("homo sapiens"[Organism] OR "mus musculus"[Organism])&retmax=100000"""
#TEST URL
#URL = """http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=SRA[Sample Type] AND gsm[Entry Type] AND ("homo sapiens"[Organism] OR "mus musculus"[Organism])&retmax=10"""
try:
#print "getGDSSample: %s" % URL
f = urllib.urlopen(URL)
root = ET.fromstring(f.read())
f.close()
#Get the IDList
tmp = root.findall("IdList/Id")
ret = [i.text for i in tmp]
#write to disk
f = open(path, "w")
for l in ret:
f.write("%s\n" % l)
f.close()
except:
print "Exception in user code:"
print '-' * 60
traceback.print_exc(file=sys.stdout)
print '-' * 60
return ret
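# For reference, the esearch response parsed above looks roughly like this
# (abridged; the Id values are illustrative GDS ids):
#   <eSearchResult>
#     <Count>...</Count>
#     <IdList>
#       <Id>300982523</Id>
#       <Id>200030833</Id>
#     </IdList>
#   </eSearchResult>
# so root.findall("IdList/Id") yields one element per GDS id.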
### Translation fns
def gsm_idToAcc(gdsId):
"""Given a GDS id, e.g. 300982523, tries to give a GDS accession, e.g.
GSM982523
NOTE: there is an algorithm: acc = "GSM"+gdsId[1:] (strip leading 0s)
"""
#Cut = dropping of the "3" (which indicates sample) and removal of leading
#leading 0s
cut = gdsId[1:].lstrip("0")
return "GSM%s" % cut
def gse_idToAcc(gdsId):
"""Given a GDS id, e.g. 200030833, tries to give a GDS accession, e.g.
GSE30833
NOTE: there is an algorithm: acc = "GSE"+gdsId[1:] (strip leading 0s)
"""
#Cut = dropping of the "2" (which indicates series) and removal of leading
#leading 0s
cut = gdsId[1:].lstrip("0")
return "GSE%s" % cut
### Librarian fns
def gsmToGse(gsmid):
"""Given a gsmid, will try to get the geo series id (GSE) that the
sample is associated with; if it is associated with several GSEs, then
returns the first.
STORES the GSE ID, e.g. 200030833 in the file
NOTE: if we want GSEs then we need to translate IDs to GSEXXXXX just
like we do for GSMs above
uses this query:
http://www.ncbi.nlm.nih.gov/gds/?term=gse%5BEntry+Type%5D+AND+GSM764990%5BGEO+Accession%5D&report=docsum&format=text
"""
ret = None
path = os.path.join(_ppath, "GSM_GSE")
if not os.path.exists(path):
os.mkdir(path)
subdir = os.path.join(path, gsmid[:7])
if not os.path.exists(subdir):
os.mkdir(subdir)
path = os.path.join(subdir, "%s.txt" % gsmid)
if os.path.exists(path):
f = open(path)
ret = f.read().strip()
f.close()
else:
#This URL is slow!
#NOTE: for every ncbi query, try to find the eutils equivalent!
#--it's faster
#URL = "http://www.ncbi.nlm.nih.gov/gds/?term=gse[Entry Type] AND %s[GEO Accession]&report=docsum&format=text" % gsmid
URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=gse[Entry Type] AND %s[GEO Accession]" % gsmid
try:
#print "gsmToGse: %s" % URL
urlf = urllib.urlopen(URL)
root = ET.fromstring(urlf.read())
urlf.close()
#Get the IDList
tmp = root.findall("IdList/Id")
if tmp:
ret = tmp[0].text.strip()
f = open(path, "w")
f.write(ret)
f.close()
#FIRST URL
# m = re.search("ID:\ (\d+)", urlf.read())
# urlf.close()
# if m:
# ret = m.group(1).strip()
# f = open(path, "w")
# f.write(ret)
# f.close()
except:
print "gsmToGse"
print "URL is" + URL
print '-' * 60
traceback.print_exc(file=sys.stdout)
print '-' * 60
return ret
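# NOTE on the on-disk cache used by these librarian fns: a lookup for e.g.
# "GSM764990" is stored as GSM_GSE/GSM7649/GSM764990.txt (the subdir is a prefix
# of the accession) and the file simply holds the GDS-style GSE id, e.g.
# "200030833"; gseToPubmed, gsmToSra and getGeoXML below use the same scheme.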
def gseToPubmed(gseid):
"""Given a gseid, will try to get the pubmed id
"""
ret = None
path = os.path.join(_ppath, "GSE_PUB")
if not os.path.exists(path):
os.mkdir(path)
subdir = os.path.join(path, gseid[:6])
if not os.path.exists(subdir):
os.mkdir(subdir)
path = os.path.join(subdir, "%s.txt" % gseid)
if os.path.exists(path):
f = open(path)
ret = f.read().strip()
f.close()
else:
URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?id=%s&db=pubmed&dbfrom=gds" % gseid
try:
print "gseToPubmed: %s" % URL
urlf = urllib.urlopen(URL)
root = ET.fromstring(urlf.read())
urlf.close()
#Get the IDList
tmp = root.findall("LinkSet/LinkSetDb/Link/Id")
if tmp:
ret = tmp[0].text.strip()
f = open(path, "w")
f.write(ret)
f.close()
except:
print "gsmToGse"
print '-' * 60
traceback.print_exc(file=sys.stdout)
print '-' * 60
return ret
def gsmToSra(gsmid):
"""Given a gsm id, will try to get an SRA id, using this query:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=GSM530220 to get the SRA id
Returns the SRA id corresponding to the GSM, None otherwise
"""
ret = None
path = os.path.join(_ppath, "GSM_SRA")
if not os.path.exists(path):
os.mkdir(path)
subdir = os.path.join(path, gsmid[:7])
if not os.path.exists(subdir):
os.mkdir(subdir)
path = os.path.join(subdir, "%s.txt" % gsmid)
if os.path.exists(path):
f = open(path)
ret = f.read().strip()
f.close()
else:
URL = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=%s" % gsmid
try:
#print "gsmToSra: %s" % URL
urlf = urllib.urlopen(URL)
root = ET.fromstring(urlf.read())
urlf.close()
#Get the IDList--should just be one
tmp = root.findall("IdList/Id")
if tmp:
ret = tmp[0].text.strip()
f = open(path, "w")
f.write(ret)
f.close()
except:
print '-' * 60
traceback.print_exc(file=sys.stdout)
print '-' * 60
return ret
def getGeoXML(accession):
"""HANDLES GEO XML records--i.e. our GEO XML librarian!
Given a GEO ACCESSION ID, return the xml record for it
(making the urllib call)"""
#path pattern: EXAMPLE-GSM1126513 geo/GSM1126/GSM1126513
path = os.path.join(_ppath, "geo")
if not os.path.exists(path):
os.mkdir(path)
subdir = os.path.join(path, accession[:7])
if not os.path.exists(subdir):
os.mkdir(subdir)
path = os.path.join(subdir, "%s.xml" % accession)
if os.path.exists(path):
f = open(path)
docString = f.read()
f.close()
else:
#print accession
URL = "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s&view=quick&form=xml&targ=self" % accession
try:
#print "getGeoXML: %s" % URL
urlf = urllib.urlopen(URL)
docString = urlf.read()
urlf.close()
if isXML(docString):
#write to file
f = open(path, "w")
f.write(docString)
f.close()
else:
print accession
print "ERROR: accession is NOT xml. (The accession may be deleted from GEO repository)"
return None
except:
print "Exception in user code:"
print '-' * 60
traceback.print_exc(file=sys.stdout)
print '-' * 60
docString = None
return docString
def parseGeoInfo(accession):
"""parse necessary information (detailed description from the Geo XML file"""
xmlString = readGeoXML(None, getGeoXML(accession))
tree = ET.fromstring(xmlString)
ret = {}
for node in tree.findall("Sample/Channel/Characteristics"):
if node.get("tag"):
ret[node.get("tag").replace("_", " ")] = node.text.strip()
ret["source name"] = tree.find("Sample/Channel/Source").text.strip()
ret["title"] = tree.find("Sample/Title").text.strip()
ret['last update date'] = tree.find("Sample/Status/Last-Update-Date").text.strip()
ret['release date'] = tree.find("Sample/Status/Release-Date").text.strip()
return ret
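# A sketch of the dict parseGeoInfo returns (hypothetical sample):
#   {'cell line': 'MCF7', 'chip antibody': 'anti-CTCF (Millipore)',
#    'source name': 'MCF7 cells', 'title': 'CTCF ChIP-seq in MCF7',
#    'last update date': '2014-04-14', 'release date': '2013-05-02'}
# Characteristics tags become keys (with "_" replaced by " ").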
def postProcessGeo(accession, docString=None):
"""post processes the GEO record to feed into the classifiers
uses docString IF it is available, otherwise will read the record
"""
#ignore these tags
ignore = ["Growth-Protocol", "Extract-Protocol", "Treatment-Protocol"]
_thresh = 10 #max 10 words
path = os.path.join(_ppath, "geoPost")
if not os.path.exists(path):
os.mkdir(path)
subdir = os.path.join(path, accession[:7])
if not os.path.exists(subdir):
os.mkdir(subdir)
path = os.path.join(subdir, "%s.txt" % accession)
if os.path.exists(path):
f = open(path)
docString = f.read()
f.close()
return docString
else:
#need to build the doc
if not docString:
#read the document from geo
docString = getGeoXML(accession)
#process the string
if not docString:
return None
text = readGeoXML(None, docString=docString)
try:
root = ET.fromstring(text)
except:
print "Could not parse: %s" % accession
print '-' * 60
traceback.print_exc(file=sys.stdout)
print '-' * 60
return None
#3. collect all of the information under Sample/Channel
tmp = []
#Things in sample to get:
ls = ['Title', 'Source', 'Library-Strategy', 'Library-Source',
'Library-Selection', 'Supplementary-Data']
for t in ls:
tag = root.findall("Sample/%s" % t)
for tt in tag:
tmp.append("%s(%s)" % (t.upper(), tt.text.strip().upper()))
channels = root.findall("Sample/Channel")
for c in channels:
for child in c:
category = ""
if child.tag in ignore:
continue
#Special case: Characteristic--take the tag attrib
if child.tag == "Characteristics":
if "tag" in child.attrib and child.attrib["tag"]:
category = child.attrib["tag"].lstrip().rstrip()
else:
category = child.tag.lstrip().rstrip()
#convert categories like "cell line" to "CELL_LINE"
#tmp.append("%s(%s)" % (category.replace(" ","_").upper(),
# child.text.strip()))
val = child.text.strip()
#THRESHOLD: values can be at most 10 words
if len(val.split()) <= _thresh:
tmp.append("%s(%s)" % (cleanCategory(category).upper(),
val.upper()))
else:
#take first 10 words
tmp.append("%s(%s)" % (cleanCategory(category).upper(),
" ".join(val.split()[:_thresh]).upper()))
#4. write the information to file
f = open(path, "w")
f.write("%s" % smart_str("\n".join(tmp)))
f.close()
return "\n".join(tmp)
### Syncing fns
def syncGeo_GeoPost():
"""will ensure that the records in geo are post processed
Returns a list of newly created records
"""
ret = []
p = os.path.join(_ppath, "geo")
for root, dirs, files in os.walk(p):
for fname in files:
path = os.path.join(root, fname)
acc = fname.split(".")[0]
dest = os.path.join(_ppath, "geoPost", acc[:7], "%s.txt" % acc)
if not os.path.exists(dest):
#update geoPost
if postProcessGeo(acc):
ret.append(acc)
return ret
def syncGeo_SRA():
"""will try to ensure that there is one SRA record for every geo record
KEY: use this to query the SRA db- see gsmToSra
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=GSM530220 to get the SRA id
Returns a list of newly created records (in SRA)
"""
ret = []
p = os.path.join(_ppath, "geo")
for root, dirs, files in os.walk(p):
for fname in files:
path = os.path.join(root, fname)
gsm = fname.split(".")[0]
sraId = gsmToSra(gsm)
if sraId:
dest = os.path.join(_ppath, "sra", sraId[:3], "%s.xml" % sraId)
if not os.path.exists(dest):
#create the new record
tmp = sra.getSraXML(sraId)
if tmp:
ret.append(sraId)
return ret
def syncSRA_Geo():
"""
NOTE: this fn is NOT as important as syncGeo_SRA b/c Geo is our
primary model!!!
will try to ensure that there is one Geo record for every SRA record
NOTE: tries to find the GSM first in the GSM_SRA map, then in the SRA record
Returns a list of newly created records (in GEO)
"""
#relies on the GSM_SRA map first
#1. load the map
_map = {}
p = os.path.join(_ppath, "GSM_SRA")
for root, dirs, files in os.walk(p):
for fname in files:
path = os.path.join(root, fname)
gsm = fname.split(".")[0]
f = open(path)
sraId = f.read().strip()
f.close()
_map[sraId] = gsm
#2. check the sra db
ret = []
p = os.path.join(_ppath, "sra")
for root, dirs, files in os.walk(p):
for fname in files:
gsm = None
path = os.path.join(root, fname)
sraId = fname.split(".")[0]
if sraId in _map:
gsm = _map[sraId]
else:
# try to get the gsm from the file:
docString = sra.getSraXML(sraId)
#try to grep the GSMid
m = re.search("GSM\d+", docString)
if m:
gsm = m.group(0)
if gsm:
dest = os.path.join(_ppath, "geo", gsm[:7], "%s.xml" % gsm)
if not os.path.exists(dest):
if getGeoXML(gsm):
ret.append(gsm)
return ret
def syncGSM_GSE():
"""will ensure that the records in geo have a gse id
Returns a list of newly created records
"""
ret = []
p = os.path.join(_ppath, "geo")
for root, dirs, files in os.walk(p):
for fname in files:
path = os.path.join(root, fname)
acc = fname.split(".")[0]
if acc:
dest = os.path.join(_ppath, "GSM_GSE", acc[:7], "%s.txt" % acc)
if not os.path.exists(dest):
#"update GSM_GSE"
if gsmToGse(acc):
ret.append(acc)
return ret
def syncGSE_PUB():
"""will ensure that every gse record have a pubmed id
Returns a list of newly created records
"""
ret = []
p = os.path.join(_ppath, "GSM_GSE")
for root, dirs, files in os.walk(p):
for fname in files:
path = os.path.join(root, fname)
f = open(path)
acc = f.read().strip()
f.close()
dest = os.path.join(_ppath, "GSE_PUB", acc[:6], "%s.txt" % acc)
if not os.path.exists(dest):
#update GSE_PUB
if gseToPubmed(acc):
ret.append(acc)
return ret
def getGeoSamples_byType(ddir="geo", ttype="ChIP-Seq", refresh=False):
"""A filter for our Geo model; searches our db for the specific sample
type.
NOTE: hones in on Library-Strategy tag
Returns a list of samples fitting the specified
NOTE: building this up takes time, around 10 secs almost 1 minute!
TRY: caching the result, reading from a cached file takes only 1 minute
Store them in files by the .ttype--in the local dir
"""
ret = []
#check for a cached file:
p = os.path.join(_ppath, ddir, ".%s" % ttype)
if not refresh and os.path.exists(p):
f = open(p)
for l in f:
ret.append(l.strip())
f.close()
else:
#NEED to generate the file, and make the call recursively
#actually, just one level of recursion b/c geo is pretty flat
p = os.path.join(_ppath, ddir)
ls = os.listdir(p)
for df in ls:
path = os.path.join(p, df)
if os.path.isfile(path): #it's a file--check if it's ChIP-Seq
acc = df.split(".")[0]
#NOTE: we need readGeoXML to process
text = readGeoXML(path)
try:
rt = ET.fromstring(text)
#check for Library-Strategy
tag = rt.findall("Sample/Library-Strategy")
if tag and tag[0].text.strip() == ttype:
#It's a match!
ret.append(acc)
except:
#ignored!
pass
else:
#it's a dir recur
newd = os.path.join(ddir, df)
ret.extend(getGeoSamples_byType(newd, ttype, refresh))
#write the local file:
f = open(os.path.join(_ppath, ddir, ".%s" % ttype), "w")
for gsm in ret:
f.write("%s\n" % gsm)
f.close()
return ret
def getGeoSamples_byTypes(path, ddir="geo", refresh=False):
ttypes = ["ChIP-Seq", "DNase-Hypersensitivity"]
ret = []
if not refresh and os.path.exists(path):
ret = cPickle.load(open(path))
return ret
for t in ttypes:
ret += getGeoSamples_byType(ddir, t, refresh)
cPickle.dump(ret, open(path, "w"))
return ret
def parseUpdateTime(description_dict):
# a trick to convert time string into a datetime object
time_struct = time.strptime(description_dict["last update date"], "%Y-%m-%d")
return datetime.fromtimestamp(time.mktime(time_struct))
def parseReleaseTime(description_dict):
# a trick to convert time string into a datetime object
time_struct = time.strptime(description_dict["release date"], "%Y-%m-%d")
return datetime.fromtimestamp(time.mktime(time_struct))
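# e.g. (hypothetical dict) parseUpdateTime({"last update date": "2014-04-14"})
# returns datetime(2014, 4, 14, 0, 0) in local time.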
def parseAntibody(description_dict):
"""Given a geoPost, will 1. try to parse out the antibody information
2. create the new antibody if necessary with the name as:
VENDOR Catalog# (TARGET) OR
VENDOR Catalog# --if there is no target OR
TARGET --if there is no vendor info
To do this we key in on some key concepts
Returns the sample's antibody, None otherwise
"""
targetFlds = ["antibody source", "chip antibody", "antibody"]
vendorFlds = ["antibody vendorname", "chip antibody provider",
"antibody vendor", "antibody manufacturer",
"chip antibody vendor", "chip antibody manufacturer", "antibody vendorcatalog#",
"antibody vendor and catalog number", "antibody vendor/catalog", "antobody vendor/catalog#"
]
catalogFlds = ["antibody vendorid", "chip antibody catalog",
"antibody CATALOG NUMBER", "chip antibody cat #", "antibody catalog #"]
lotFlds = ["chip antibody lot #"]
#1. try to get the values
vals = [None, None, None, None]
used_fld = []
for (i, ls) in enumerate([targetFlds, vendorFlds, catalogFlds, lotFlds]):
for f in ls:
tmp = description_dict.get(f)
if tmp:
vals[i] = tmp
used_fld.append(f)
break
#2. get each term of the antibody separately
(target, vendor, cat, lot) = tuple(vals)
if target and "input" in target.lower():
ret, created = models.Antibodies.objects.get_or_create(name="input")
return ret
def if_match_then_get(keyword, current_key):
if keyword in current_key:
current_value = description_dict[current_key]
if current_value and current_key not in used_fld:
used_fld.append(current_key)
if current_value.startswith("catalog#"):
current_value = current_value.replace("catalog#, or reference):", "").strip()
return current_value
return None
#3. sometimes the field name is not standard, in other words, it is not included in xxxFlds
#If so, use string matching to parse these fields
for k, v in description_dict.items():
if not (k and v):
continue
if not vendor:
vendor = if_match_then_get("vendor", k)
if not cat:
cat = if_match_then_get("catalog", k)
if not lot:
lot = if_match_then_get("lot", k)
#4. compose the complete antibody name
name_list = []
if vendor:
name_list.append(vendor)
if cat:
name_list.append(cat)
if lot:
name_list.append(lot)
if not vendor and not cat and not lot:
if not target:
target = if_match_then_get('antibody', k)
if target:
name_list.append(target)
name = ", ".join(name_list)
if name:
ret, created = models.Antibodies.objects.get_or_create(name=name)
return ret
else:
return None
# def parseFactor(description_dict):
# # TODO: use description dict to parse, instead of using geoPost
#
# standard_fields = ["chip antibody", "antibody", "chip", "antibody source", "antibody antibodydescription",
# "antibody targetdescription", "factor"]
# avoid_words = ["MILLIPORE", "ABCAM", "METHYLCAP"]
#
# non_standard_fields = [i for i in description_dict.keys() if "antibody" in i and i not in standard_fields] + [
# 'title']
# #1. try to get the values
# factor_pattern = re.compile(r"[a-z]+[-\.]?[a-z0-9]+", re.I)
# factor_term_pattern = re.compile(r"^[a-z]+[-\.]?[a-z0-9]+$", re.I)
# possible_new_factor = None
#
# for t in standard_fields + non_standard_fields:
# print t
# tmp = description_dict.get(t, "").strip()
# # skip the null field
# if not tmp:
# continue
#
# # make all character upper case, then delete strings like `ANTI` and `_`
# tmp = tmp.upper().replace("ANTI-", " ").replace("ANTI", " ").replace("_", " ").strip()
#
# # `N/A` often concurs with `Input`
# if "N/A" in tmp:
# return models.Factors.objects.get_or_create(name="Input")[0]
#
# if "antibody" in t and ("NONE" in tmp or "INPUT" in tmp or "IGG" in tmp):
# return models.Factors.objects.get_or_create(name="Input")[0]
#
# # If the field has very short description and it is not `TITLE`, the description is usually the factor name.
# if t not in non_standard_fields and not possible_new_factor \
# and len(tmp) < 10 and tmp not in avoid_words and factor_term_pattern.match(tmp) \
# and not models.Aliases.objects.filter(name__icontains=tmp) \
# and not models.Factors.objects.filter(name__icontains=tmp):
# possible_new_factor = tmp
#
#
#
# # split the description into tokens
# splited = factor_pattern.findall(tmp)
# print splited
# for s in splited:
#
# if d.check(s):
# continue
# if s in avoid_words:
# continue
# # POL2 factor usually starts with `POL2`
# if (s.startswith("POL2") and len(s) < 10):
# return models.Factors.objects.get_or_create(name="POL2")[0]
#
# # If a token is neither a number nor a vocabulary in dictionary, it may be the factor name
# if models.Factors.objects.filter(name__iexact=s):
# return models.Factors.objects.get(name__iexact=s)
#
# if models.Aliases.objects.filter(name__iexact=s):
# try:
# alias = models.Aliases.objects.get(name__iexact=s)
# return alias.factor
# except Exception as e:
# print e
# print "FATAL!"
# return None
#
#
#
# # special cases for `Input` and `POL2`
# if "INPUT" in splited:
# return models.Factors.objects.get_or_create(name="Input")[0]
# if ("POLYMERASE" in splited) or ("POL" in splited):
# return models.Factors.objects.get_or_create(name="POL2")[0]
#
# if possible_new_factor:
# ret, created = models.Factors.objects.get_or_create(name=possible_new_factor)
# if created:
# ret.status = 'new'
# ret.save()
# return ret
#
# return None
def _parse_a_field(description_dict, a_field, DCmodel, max_create_length=100, new=False):
if not description_dict.get(a_field, None):
return None
if len(description_dict.get(a_field, "")) > 0:
result = sorted(DCmodel.objects.extra(where={"%s REGEXP CONCAT('([^a-zA-Z0-9]|^)', `name`, '([^a-zA-Z0-9\-]|$)')"},
params=[description_dict[a_field]]),
key=lambda o: len(o.name),
reverse=True)
if result and len(result[0].name.strip()) > 0:
return result[0]
if new and len(description_dict[a_field]) <= max_create_length:
ret, created = DCmodel.objects.get_or_create(name=description_dict[a_field])
if created:
ret.status = 'new'
return ret
return None
def _parse_fields(description_dict, strict_fields, greedy_fields, DCmodel, greedy_length=100):
for sf in strict_fields:
ret = _parse_a_field(description_dict, sf, DCmodel, greedy_length, new=False)
if ret:
return ret
for gf in greedy_fields:
ret = _parse_a_field(description_dict, gf, DCmodel, greedy_length, new=True)
if ret:
return ret
return None
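# How _parse_fields is meant to be used (see the parse* wrappers below):
# strict_fields can only match an existing DCmodel row (via the REGEXP lookup
# in _parse_a_field), while greedy_fields may also create a brand-new row with
# status='new' when the field value is short enough. For example parseCellLine
# treats 'source name' and 'title' as strict but 'cell line' as greedy, so an
# unknown value in the 'cell line' field can create a new CellLines entry.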
#
# def parseCellLineBySourceName(description_dict):
# return _parse_fields(description_dict, ['source name'], [],lambda m: m.CellLines, 15, )
#
def _search_factor_by_pattern(field_name, field_content):
field_content = field_content.upper()
if "antibody" in field_name:
if "NONE" in field_content or "INPUT" in field_content or "IGG" in field_content or "N/A" in field_content:
return models.Factors.objects.get_or_create(name="Input")[0]
if "POL2" in field_content or "POLYMERASE" in field_content:
return models.Factors.objects.get_or_create(name="POL2")[0]
if "CTCF" in field_content:
return models.Factors.objects.get_or_create(name="CTCF")[0]
return None
def _guess_factor_boldly(field_name, field_content):
factor_finder = re.compile(r"anti[^a-z]([a-z]+[-\.]?[a-z0-9]+)", re.I)
# get the word after anti
factor_pattern = re.compile(r"^[a-z]+[-\.]?[a-z0-9]+$", re.I)
factor_found = factor_finder.findall(field_content)
if factor_found:
return models.Factors.objects.get_or_create(name=factor_found[0])[0]
if len(field_content) < 10 and factor_pattern.match(field_content):
return models.Factors.objects.get_or_create(name=field_content)[0]
return None
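# Illustrative behaviour (hypothetical field contents):
#   "anti-CTCF antibody (Millipore)" -> the word after "anti" is captured and
#                                       a "CTCF" factor is returned
#   "H3K4me3"                        -> shorter than 10 chars and matches
#                                       factor_pattern, so "H3K4me3" is returned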
def parseFactor(description_dict):
standard_fields = [i for i in
["chip antibody", "antibody", "chip", "antibody source", "antibody antibodydescription",
"antibody targetdescription", "factor", "title"] if i in description_dict.keys()]
non_standard_fields = [i for i in description_dict.keys() if "antibody" in i and i not in standard_fields]
fields = standard_fields + non_standard_fields
first_try = _parse_fields(description_dict, fields, [], models.Factors)
if first_try:
return first_try
print ".",
second_try = _parse_fields(description_dict, fields, [], models.Aliases)
if second_try:
return second_try.factor
print ".",
for f in fields:
third_try = _search_factor_by_pattern(f, description_dict[f])
if third_try:
return third_try
print ".",
for f in fields:
print f
fourth_try = _guess_factor_boldly(f, description_dict[f])
if fourth_try:
return fourth_try
print ".",
return None
def parseCellType(description_dict):
return _parse_fields(description_dict,
['cell type', 'cell lineage', 'cell', 'cell line', 'source name', 'cell description', 'title'],
['cell type'],
models.CellTypes)
def parseCellLine(description_dict):
return _parse_fields(description_dict,
['cell', 'cell line', 'source name', 'cell description', 'title'],
['cell line'],
models.CellLines)
def parseCellPop(description_dict):
return _parse_fields(description_dict, ['cell', 'source name', 'cell description', 'title'], [], models.CellPops)
def parseTissue(description_dict):
return _parse_fields(description_dict,
['tissue', 'tissue type', 'tissue depot', 'source name', 'cell description', 'title'],
['tissue', 'tissue type'],
models.TissueTypes, )
def parseStrain(description_dict):
return _parse_fields(description_dict,
['strain', 'strain background', 'source name', 'cell description', 'title'],
['strain'],
models.Strains)
def parseDisease(description_dict):
return _parse_fields(description_dict,
['disease', 'tumor stage', 'cell karotype', 'source name', 'title'],
['disease'],
models.DiseaseStates)
def update_one_sample(gsmid, parse_fields=['other_ids', 'paper', 'name', 'species', 'description', 'antibody', 'factor',
'cell type', 'cell line', 'cell pop', 'tissue', 'strain', 'disease']):
"""Given a gsmid, tries to create a new sample--auto-filling in the
meta fields
If overwrite is True and there is the sample that has the same gsmid, this function will overwrite that sample
NOTE: will try to save the sample!!
Returns newly created sample
"""
sraId = gsmToSra(gsmid)
sraXML = sra.getSraXML(sraId) if sraId else None
geoPost = postProcessGeo(gsmid)
if not geoPost:
return None
s, created = models.Samples.objects.get_or_create(unique_id=gsmid)
assert isinstance(s, models.Samples)
if created:
s.status = "new"
if not s.dc_collect_date:
s.dc_collect_date = datetime.now()
s.fastq_file_url = sra.getSRA_downloadLink(sraXML) if sraXML else None
if ('other_ids' in parse_fields) or ('paper' in parse_fields):
gseId = gsmToGse(gsmid)
pmid = gseToPubmed(gseId) if gseId else None
if 'other_ids' in parse_fields:
idList = {'sra': sraId, 'gse': gseId, 'pmid': pmid}
s.other_ids = json.dumps(idList)
if 'paper' in parse_fields and pmid:
s.paper = pubmed.getOrCreatePaper(pmid)
if 'name' in parse_fields:
s.name = getFromPost(geoPost, "title")
if 'species' in parse_fields:
if getFromPost(geoPost, "organism") == "HOMO SAPIENS":
s.species = models.Species.objects.get(pk=1)
else:
s.species = models.Species.objects.get(pk=2)
#HERE is where I need to create a classifier app/module
#FACTOR, platform, species--HERE are the rest of them!
description_dict = parseGeoInfo(gsmid)
if 'description' in parse_fields:
s.description = json.dumps(description_dict)
print s.description
if 'antibody' in parse_fields:
s.antibody = parseAntibody(description_dict)
if 'factor' in parse_fields:
s.factor = parseFactor(description_dict)
if 'cell type' in parse_fields:
s.cell_type = parseCellType(description_dict)
if 'tissue' in parse_fields:
s.tissue_type = parseTissue(description_dict)
if 'cell line' in parse_fields:
s.cell_line = parseCellLine(description_dict)
# # Sometimes cell line name is the `source name` field, especially when the content in `source name` is short
# if not s.tissue_type and not s.cell_line:
# s.cell_line = parseCellLineBySourceName(description_dict)
if 'strain' in parse_fields:
s.strain = parseStrain(description_dict)
if 'disease' in parse_fields:
s.disease_state = parseDisease(description_dict)
if 'cell pop' in parse_fields:
s.cell_pop = parseCellPop(description_dict)
if 'update date' in parse_fields:
s.geo_last_update_date = parseUpdateTime(description_dict)
if 'release date' in parse_fields:
s.geo_release_date = parseReleaseTime(description_dict)
s.save()
return s
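# Typical call (GSM530220 is the accession used as an example in gsmToSra's
# docstring); note this hits GEO/SRA/Pubmed over the network and saves the
# sample to the db:
#   >>> s = update_one_sample("GSM530220")
#   >>> s.factor, s.cell_line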
def sync_samples(refresh=False):
"""Will run through the whole flow of this package, trying to update
the db w/ new samples if any.
refresh: whether to sync the local repository (pickle file) with GEO first.
If True, sync the local pickle file with GEO, then sync the database with the local pickle file.
If False, only sync the database with the local pickle file.
Returns: list of newly created samples
"""
if refresh:
print "# 1. resync the repository from Internet"
gdsSamples = getGDSSamples()
for gdsid in gdsSamples:
gsm = gsm_idToAcc(gdsid)
if gsm:
geoXML = getGeoXML(gsm)
geoPost = postProcessGeo(gsm)
# "2a. sync geo and sra"
sraId = gsmToSra(gsm)
if sraId:
sra.getSraXML(sraId)
print "# 2. get all samples"
local_repo_path = "/data1/newdc1.4/src/scripts/repository_samples.pickle"
local_repo_samples = set(getGeoSamples_byTypes(path=local_repo_path, refresh=refresh))
print "# 3. try to calculate new samples"
local_db_samples = set(models.Samples.objects.values_list('unique_id', flat=True))
print "There are %d samples in local repo." % len(local_repo_samples)
print "There are %d samples in local db." % len(local_db_samples)
need_added_samples = sorted(list(local_repo_samples - local_db_samples))
print "%d samples will be added." % len(need_added_samples)
print "The first 10 of them are " + str(need_added_samples[:10])
print "# 4. try to add new samples"
ret = []
for s in need_added_samples:
try:
ret.append(update_one_sample(s))
except KeyboardInterrupt:
print "User interrupts!"
sys.exit(1)
# except Exception as e:
# print type(e)
# print e.args
# print e
# continue
return ret
def updateExistingSamples(samples,
parse_fields=['other_ids', 'paper', 'name', 'species', 'description', 'antibody', 'factor',
'cell type', 'cell line', 'tissue', 'strain', 'update date', 'release date']):
"""Will run through the whole flow of this package, trying to update
the db w/ new samples if any.
Returns: list of newly created samples
"""
print samples
if parse_fields:
for s in samples:
print s
update_one_sample(s.unique_id, parse_fields)
# the following is a quick hack to modify the DNase-seq's factor name to "DNase" in one time
# need to improve
# Dnase_samples = set(getGeoSamples_byTypes("geo", ["DNase-Hypersensitivity"],refresh=False))
# Dnase_factor = models.Factors.objects.get(name="DNase")
# samples.filter(unique_id__in = Dnase_samples).update(factor=Dnase_factor)
#
def createDatasets():
"""
Will collect samples that no dataset owns yet and group them into new datasets.
"""
from _json import encode_basestring_ascii as c_encode_basestring_ascii
cnt = 0
# This is a hack to get the dumped version of a unicode string
relative_regex = re.compile(r"cell|tissue|time|source|sex")
rep_regex = re.compile(r"treat|antibody|genotype|protocol|donor")
control_factor_regex = re.compile(r"Input")
def _description_get(k):
# This is a trick to deal with an encoding issue related with JSON dumper
return c_encode_basestring_ascii(description_dict[k])
def _query_in_description(regex):
matched_keys = [k for k in description_dict.keys() if regex.search(k)]
print matched_keys
ret = Q()
for k in matched_keys:
ret &= Q(description__contains=_description_get(k))
return ret
def _filter_by_name(queryset):
main_title = an_orphan.name
if not main_title:
return queryset
ret_queryset = [an_orphan]
tokenizer = re.compile(r"\d+nm|t\d+|zt\d+|[a-z]{2,}", re.I)
main_tokenset = set(tokenizer.findall(main_title))
for a_sample in queryset:
if a_sample != an_orphan:
current_title = a_sample.name
current_tokenset = set(tokenizer.findall(current_title))
if main_tokenset == current_tokenset:
ret_queryset.append(a_sample)
return ret_queryset
# samples with necessary metadata
candidate_samples = models.Samples.objects.exclude(
Q(factor__isnull=True) | Q(description__exact='') | Q(other_ids__exact=''))
# .exclude(status=u'imported')
# TODO: add info for imported data and re-parse
orphan_treat_samples = candidate_samples.exclude(
factor__name__regex=control_factor_regex.pattern
).filter(
TREATS__isnull=True,
CONTS__isnull=True,
)
error_cnt = 0
for an_orphan in orphan_treat_samples:
if an_orphan.TREATS.all() or an_orphan.CONTS.all():
continue
print an_orphan
try:
gse_id = json.loads(an_orphan.other_ids)['gse']
except Exception as e:
print "Exception when loads GSE ID"
print e
error_cnt += 1
continue
try:
description_dict = json.loads(an_orphan.description)
except Exception as e:
print "Exception when loads the description dict!"
print e
error_cnt += 1
continue
relative_samples = candidate_samples.filter(
Q(other_ids__contains='"gse": "%s"' % gse_id) &
_query_in_description(relative_regex))
rep_samples = relative_samples.filter(
Q(antibody=an_orphan.antibody, factor=an_orphan.factor, cell_line=an_orphan.cell_line) &
_query_in_description(rep_regex)).order_by('unique_id')
rep_samples = _filter_by_name(rep_samples)
control_samples = relative_samples.filter(
Q(factor__name__regex=control_factor_regex.pattern)).order_by('unique_id')
new_dataset = models.Datasets(status='info')
new_dataset.save()
new_dataset.treats.add(*rep_samples)
new_dataset.conts.add(*control_samples)
cnt += 1
# if cnt == 20:
# break
print new_dataset
print "Error cnt"
print error_cnt