Skip to content

Instantly share code, notes, and snippets.

@kfrancoi
Last active March 28, 2021 02:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kfrancoi/e3467da194fa1410c240 to your computer and use it in GitHub Desktop.
Save kfrancoi/e3467da194fa1410c240 to your computer and use it in GitHub Desktop.
This script parses the MeSH XML dump file and turns it into an RDF triple file, partially following the Bio2RDF convention.
import sys
import os
import itertools
from datetime import datetime
try:
from lxml import etree
except ImportError:
import xml.etree.ElementTree as etree
import RDF
from IPython.core.debugger import Tracer
# IPython debugger tracer instance; nothing in the visible code calls
# debug_here() -- NOTE(review): presumably left over from a debugging session.
debug_here = Tracer()
class MeshXMLParser:
    """Collect MeSH resources as RDF statements and flush them to a file.

    Statements are buffered in ``self.model`` (an ``RDF.Model`` backed by
    ``RDF.MemoryStorage``). ``flushToFile`` serializes the buffer as
    N-Triples, appends it to the output file and starts a new empty model.
    ``self.entitySet`` remembers which concept / semantic type / term
    identifiers already had their descriptive statements emitted, so those
    are only written once across flushes.
    """

    # Optional mapping of lower-cased label -> DBpedia resource URI. When set,
    # addLabel() also emits an owl:sameAs link for every matching label.
    dbPediaLabel = None

    baseMeshURI = "http://bio2rdf.org/"
    vocabularyDescriptorURI = baseMeshURI+"mesh_vocabulary:Descriptor"
    meshResource = "mesh:"

    bio2rdf_vocabularyNamespace = baseMeshURI+"bio2rdf_vocabulary:namespace"
    bio2rdf_vocabularyIdentifier = baseMeshURI+"bio2rdf_vocabulary:identifier"

    mesh_vocabularyRecordType = baseMeshURI+"mesh_vocabulary:record-type"
    mesh_vocabularyMeshHeadings = baseMeshURI+"mesh_vocabulary:mesh-heading"
    mesh_vocabularyDescriptor = baseMeshURI+"mesh_vocabulary:Descriptor"
    mesh_vocabularyConcept = baseMeshURI+"mesh_vocabulary:Concept"
    mesh_vocabularySemanticType = baseMeshURI+"mesh_vocabulary:SemanticType"
    mesh_vocabularyTerm = baseMeshURI+"mesh_vocabulary:Term"
    mesh_vocabularyTreeNumber = baseMeshURI+"mesh_vocabulary:TreeNumber"

    dc_identifier = "http://purl.org/dc/terms/identifier"
    dc_title = "http://purl.org/dc/terms/title"
    labelPredicateURI = "http://www.w3.org/2000/01/rdf-schema#label"
    broader = "http://www.w3.org/2004/02/skos/core#broader"
    related = "http://www.w3.org/2004/02/skos/core#related"
    subject = "http://purl.org/dc/terms/subject"
    isOfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    sameAs = "http://www.w3.org/2002/07/owl#sameAs"

    # TODO:
    # - allowable qualifier
    # - resource type
    # - link with dbpedia : <http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/meshId> "D001321"@en .

    def __init__(self):
        """Start with an empty model and no known entities."""
        self.fileNumber = 0
        self.initModel()  # also bumps fileNumber to 1
        self.entitySet = set()

    def initModel(self):
        """Replace the buffer with a new empty model and bump fileNumber."""
        self.model = RDF.Model(RDF.MemoryStorage())
        self.fileNumber += 1

    def addStatementToModel(self, subject, predicate, object):
        """Append the (subject, predicate, object) statement to the buffer."""
        self.model.append(RDF.Statement(subject, predicate, object))

    def makeDescriptorResourceNode(self, entity, label):
        """Create the node for descriptor *entity* and describe it.

        Emits the identifier statements, the label, the ``Descriptor`` type
        and the ``mesh-heading`` literal. Unlike the concept / term builders
        this one does not consult or update ``entitySet``.

        Returns the descriptor node.
        """
        descriptorNode = self.makeResourceNode(entity)
        # Add descriptor name as rdfs:label
        self.addLabel(descriptorNode, label)
        # Add type Descriptor
        self.addStatementToModel(
            descriptorNode,
            RDF.Uri(self.isOfType),
            RDF.Uri(self.mesh_vocabularyDescriptor))
        # Add vocabulary mesh heading
        self.addStatementToModel(
            descriptorNode,
            RDF.Uri(self.mesh_vocabularyMeshHeadings),
            self.makeLiteralProperty(label))
        return descriptorNode

    def _makeTypedResourceNode(self, entity, label, typeURI):
        """Get or create the node for *entity*, typed as *typeURI*.

        The first time an identifier is seen its identifier statements,
        label and rdf:type are emitted and the identifier is recorded in
        ``entitySet``; afterwards only a bare node reference is returned.
        """
        if entity in self.entitySet:
            return self.getResourceNode(entity)
        node = self.makeResourceNode(entity)
        self.addLabel(node, label)
        self.addStatementToModel(node, RDF.Uri(self.isOfType), RDF.Uri(typeURI))
        self.entitySet.add(entity)
        return node

    def makeConceptResourceNode(self, entity, label):
        """Get or create the node for concept *entity* (type ``Concept``)."""
        return self._makeTypedResourceNode(entity, label, self.mesh_vocabularyConcept)

    def makeSemanticTypeNode(self, entity, label):
        """Get or create the node for semantic type *entity* (type ``SemanticType``)."""
        return self._makeTypedResourceNode(entity, label, self.mesh_vocabularySemanticType)

    def makeTermResourceNode(self, entity, label):
        """Get or create the node for term *entity* (type ``Term``)."""
        return self._makeTypedResourceNode(entity, label, self.mesh_vocabularyTerm)

    def makeResourceNode(self, entity):
        """Create the node for *entity* and emit its identifier statements.

        Three literals are attached: the namespace ("mesh"), the bare
        identifier, and the prefixed identifier ("mesh:" + entity).
        """
        resourceNode = RDF.Node(RDF.Uri(self.baseMeshURI+self.meshResource+entity))
        # Add vocabulary namespace
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.bio2rdf_vocabularyNamespace),
            self.makeLiteralProperty("mesh"))
        # Add vocabulary identifier
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.bio2rdf_vocabularyIdentifier),
            self.makeLiteralProperty(entity))
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.dc_identifier),
            self.makeLiteralProperty("mesh:"+entity))
        return resourceNode

    def makeLiteralProperty(self, literal):
        """Wrap *literal* in an ``RDF.Node``."""
        return RDF.Node(literal)

    def getResourceNode(self, identifier):
        """Return a node for *identifier* without emitting any statement."""
        return RDF.Node(RDF.Uri(self.baseMeshURI+self.meshResource+identifier))

    def addLabel(self, resourceNode, label):
        """Attach the lower-cased *label* to *resourceNode* as rdfs:label.

        When ``dbPediaLabel`` is set and contains the lower-cased label, an
        owl:sameAs link to the mapped DBpedia URI is emitted as well. A label
        without a ``lower`` method (e.g. ``None`` for an element with no
        text) is skipped silently.
        """
        try:
            normalizedLabel = label.lower()
        except AttributeError:
            # No usable text for this label: nothing to emit.
            return
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.labelPredicateURI),
            self.makeLiteralProperty(normalizedLabel))
        # If DBpedia labels are available, try to match the MeSH label with one
        if self.dbPediaLabel and normalizedLabel in self.dbPediaLabel:
            self.addSameAsRelationship(resourceNode, RDF.Uri(self.dbPediaLabel[normalizedLabel]))

    def addTreeNumber(self, resourceNode, treeNumber):
        """Attach *treeNumber* to *resourceNode* as a TreeNumber literal."""
        # NOTE(review): the AttributeError guard is kept from the original;
        # which call is expected to raise it is not visible here -- confirm.
        try:
            self.addStatementToModel(
                resourceNode,
                RDF.Uri(self.mesh_vocabularyTreeNumber),
                self.makeLiteralProperty(treeNumber))
        except AttributeError:
            pass

    def addSubjectOfRelationship(self, startNode, endNode):
        """Emit ``startNode dcterms:subject endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.subject), endNode)

    def addBroaderRelationship(self, startNode, endNode):
        """Emit ``startNode skos:broader endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.broader), endNode)

    def addSameAsRelationship(self, startNode, endNode):
        """Emit ``startNode owl:sameAs endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.sameAs), endNode)

    def addRelatedDescriptor(self, startNode, endNode):
        """Emit ``startNode skos:related endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.related), endNode)

    def addSameAsRelationshipBetweenTerms(self, resourcesNodes):
        """Link every pair of nodes in *resourcesNodes* with owl:sameAs, both ways.

        ``permutations(..., 2)`` already yields each ordered pair -- (a, b)
        and (b, a) -- exactly once, so a single statement per pair covers
        both directions without emitting duplicates.
        """
        for startNode, endNode in itertools.permutations(resourcesNodes, 2):
            self.addSameAsRelationship(startNode, endNode)

    def flushToFile(self, fileName):
        """Append the buffered statements to *fileName* and reset the buffer.

        The model is serialized as N-Triples to a temporary file named
        ``fileName + "_" + fileNumber``, whose content is appended to
        *fileName*. The temporary file is removed even if the copy fails.
        """
        tempFileName = fileName+"_"+str(self.fileNumber)
        ser = RDF.Serializer(name="ntriples")
        ser.serialize_model_to_file(tempFileName, self.model)
        try:
            with open(fileName, 'a') as output, open(tempFileName, 'r') as tempFile:
                output.write(tempFile.read())
        finally:
            os.remove(tempFileName)
        self.initModel()
def createDBPediaLabelDict(dbpediaLabelFile):
    """Build a lower-cased label -> subject lookup from an N-Triples file.

    Every statement whose predicate ends in ``#label`` contributes one entry
    mapping ``str(object).lower()`` to ``str(subject)``; a later statement
    with the same lower-cased label overwrites an earlier one.

    dbpediaLabelFile -- path of the label dump; it is appended to
                        ``file://`` to form the URI handed to the parser.

    Returns the dictionary.
    """
    dbpediaLabel = dict()
    count = 1
    startTime = datetime.now()
    parser = RDF.Parser(name="ntriples")
    print("Parse : " + 'file://' + dbpediaLabelFile)
    stream = parser.parse_as_stream('file://' + dbpediaLabelFile)
    for statement in stream:
        # Progress report every 100000 statements
        if count % 100000 == 0:
            endTime = datetime.now()
            print(str(count) + " labels parsed in %s seconds" % str((endTime-startTime).seconds))
            startTime = endTime
        if str(statement.predicate).split("#")[-1] == "label":
            dbpediaLabel[str(statement.object).lower()] = str(statement.subject)
        # The counter was never advanced before, so the report never fired.
        count += 1
    return dbpediaLabel
def _addConceptRelations(meshParser, conceptRelationList):
    """Translate the ConceptRelation children into broader / related links."""
    for conceptRelation in conceptRelationList:
        relationName = conceptRelation.attrib["RelationName"]
        if relationName not in ("NRW", "BRD", "REL"):
            continue
        concept1Node = meshParser.getResourceNode(conceptRelation.find("Concept1UI").text)
        concept2Node = meshParser.getResourceNode(conceptRelation.find("Concept2UI").text)
        if relationName == "NRW":
            # NRW: concept 2 gets concept 1 as its broader concept
            meshParser.addBroaderRelationship(concept2Node, concept1Node)
        elif relationName == "BRD":
            # BRD: concept 1 gets concept 2 as its broader concept
            meshParser.addBroaderRelationship(concept1Node, concept2Node)
        else:
            meshParser.addRelatedDescriptor(concept1Node, concept2Node)


def _addConcept(meshParser, concept, descriptorNode):
    """Emit one concept with its semantic types, relations and terms."""
    conceptUI = concept.find("ConceptUI")
    conceptName = concept.find("ConceptName")
    termList = concept.find("TermList")
    semanticTypeList = concept.find("SemanticTypeList")
    conceptRelationList = concept.find("ConceptRelationList")
    if conceptUI is None:
        return

    # 1) Get or create concept node
    conceptNode = meshParser.makeConceptResourceNode(conceptUI.text, conceptName[0].text)

    # 2) Create semantic relation between descriptor and concept
    meshParser.addBroaderRelationship(conceptNode, descriptorNode)

    if semanticTypeList is not None:
        for semanticType in semanticTypeList:
            semanticTypeUI = semanticType.find("SemanticTypeUI")
            semanticTypeName = semanticType.find("SemanticTypeName")
            semanticTypeNode = meshParser.makeSemanticTypeNode(semanticTypeUI.text, semanticTypeName.text)
            meshParser.addSubjectOfRelationship(conceptNode, semanticTypeNode)

    if conceptRelationList is not None:
        _addConceptRelations(meshParser, conceptRelationList)

    termNodes = []
    # Explicit None test: an Element without children is falsy.
    if termList is not None:
        for term in termList:
            termUI = term.find("TermUI")
            termName = term.find("String")
            if termUI is None:
                continue
            # Get or create term node
            termNode = meshParser.makeTermResourceNode(termUI.text, termName.text)
            # Create semantic relation between concept and terms
            meshParser.addSameAsRelationship(termNode, conceptNode)
            termNodes.append(termNode)

    # Create sameAs relationships between synonym terms
    meshParser.addSameAsRelationshipBetweenTerms(termNodes)


def main(xmlFile, outputFile, dbpediaLabelFile=None):
    """Convert the descriptor records of *xmlFile* into triples in *outputFile*.

    xmlFile          -- XML file whose root's children are descriptor records.
    outputFile       -- file the serialized triples are appended to.
    dbpediaLabelFile -- optional label dump; when given, labels found in it
                        also produce sameAs links (see createDBPediaLabelDict).

    The whole document is parsed up front; the in-memory model is flushed to
    *outputFile* every 1000 descriptors and once more at the end.
    """
    meshParser = MeshXMLParser()
    if dbpediaLabelFile is not None:
        meshParser.dbPediaLabel = createDBPediaLabelDict(dbpediaLabelFile)
        print("Taking DBpedia label into account...")

    startTime = datetime.now()
    count = 1

    print("Parsing XML file %s..." % (xmlFile))
    tree = etree.parse(xmlFile)
    print("Done.")

    print("Iterating over the model")
    descriptorRecordSet = tree.getroot()
    for descriptorRecord in descriptorRecordSet:
        if count % 1000 == 0:
            endTime = datetime.now()
            print(str(count) + " descriptors parsed in %s seconds" % str((endTime-startTime).seconds))
            startTime = endTime
            # Flush the model into file
            meshParser.flushToFile(outputFile)

        descriptorUI = descriptorRecord.find("DescriptorUI")
        descriptorName = descriptorRecord.find("DescriptorName")
        treeNumberList = descriptorRecord.find("TreeNumberList")
        conceptList = descriptorRecord.find("ConceptList")
        relatedDescriptors = descriptorRecord.find("SeeRelatedList")

        # 1) Create descriptor node
        if descriptorUI is None:
            continue
        descriptorNode = meshParser.makeDescriptorResourceNode(descriptorUI.text, descriptorName[0].text)

        # 2) Get related list of descriptors
        if relatedDescriptors is not None:
            for relatedDescriptor in relatedDescriptors:
                relatedUI = relatedDescriptor.find("DescriptorReferredTo").find("DescriptorUI")
                meshParser.addRelatedDescriptor(
                    descriptorNode,
                    meshParser.getResourceNode(relatedUI.text))

        # 3) Add descriptor tree numbers
        if treeNumberList is not None:
            for treeNumber in treeNumberList:
                meshParser.addTreeNumber(descriptorNode, treeNumber.text)

        # 4) Iterate over conceptList (elements are iterated directly;
        #    getchildren() is deprecated and absent from recent ElementTree)
        if conceptList is not None:
            for concept in conceptList:
                _addConcept(meshParser, concept, descriptorNode)

        count += 1
    print("Done.")

    print("Saving n-triple to file...")
    meshParser.flushToFile(outputFile)
    print("Done.")
def help():
    """Print the command-line usage text, one line per message."""
    usageLines = (
        "Usage: python meshXML2RDF.py [desc2014.xml] [output rdf file] [(optional) DBPedia label.ttl]",
        "",
        "This script parse the MESH xml dump file and turn it into an rdf triple file following partially the bio2rdf convention.",
        "If DBpedia label are provided, this script add a 'sameAs' relationship between mesh terms and DBPedia entity.",
        "Please go to http://bio2rdf.org/ for more information",
        "Author : Kevin Francoisse (https://github.com/kfrancoi)",
        "Company : Sagacify (http://sagacify.com)",
    )
    for usageLine in usageLines:
        print(usageLine)
# Script entry point: expects an input XML path, an output path and,
# optionally, a DBpedia label file as the third argument.
if __name__ == '__main__':
    cliArgs = sys.argv[1:]
    if len(cliArgs) < 2:
        # Not enough arguments: show the usage text instead of running.
        help()
    else:
        inputPath, outputPath = cliArgs[0], cliArgs[1]
        print("Input : " + inputPath)
        print("Output :" + outputPath)
        # The label file is only used when exactly three arguments are given.
        if len(cliArgs) == 3:
            labelPath = cliArgs[2]
            print("DBpedia label : " + labelPath)
            main(inputPath, outputPath, labelPath)
        else:
            main(inputPath, outputPath)
@micheldumontier
Copy link

nice!!!! i'll have to install python and try it out :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment