Last active
March 28, 2021 02:46
-
-
Save kfrancoi/e3467da194fa1410c240 to your computer and use it in GitHub Desktop.
This script parses the MeSH XML dump file and turns it into an RDF triple file, partially following the Bio2RDF convention.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import itertools | |
from datetime import datetime | |
try: | |
from lxml import etree | |
except ImportError: | |
import xml.etree.ElementTree as etree | |
import RDF | |
from IPython.core.debugger import Tracer | |
debug_here = Tracer() | |
class MeshXMLParser:
    """Accumulate RDF statements about MeSH records and flush them as N-Triples.

    Statements are collected in an in-memory Redland model (``RDF.MemoryStorage``).
    ``flushToFile`` serializes that model, appends it to the output file and
    starts a fresh model, so memory use stays bounded between flushes.

    Every resource is identified by ``baseMeshURI + meshResource + <MeSH UI>``.
    Concept, semantic-type and term resources are described only once per
    parser instance: their identifiers are remembered in ``entitySet``, which
    is deliberately NOT cleared on flush.
    """

    # Optional mapping {lower-cased label: DBpedia resource URI}. When set,
    # addLabel() links matching MeSH resources to DBpedia with owl:sameAs.
    dbPediaLabel = None

    baseMeshURI = "http://bio2rdf.org/"
    vocabularyDescriptorURI = baseMeshURI + "mesh_vocabulary:Descriptor"
    meshResource = "mesh:"

    # bio2rdf / MeSH vocabulary URIs used as predicates and classes.
    bio2rdf_vocabularyNamespace = baseMeshURI + "bio2rdf_vocabulary:namespace"
    bio2rdf_vocabularyIdentifier = baseMeshURI + "bio2rdf_vocabulary:identifier"
    mesh_vocabularyRecordType = baseMeshURI + "mesh_vocabulary:record-type"
    mesh_vocabularyMeshHeadings = baseMeshURI + "mesh_vocabulary:mesh-heading"
    mesh_vocabularyDescriptor = baseMeshURI + "mesh_vocabulary:Descriptor"
    mesh_vocabularyConcept = baseMeshURI + "mesh_vocabulary:Concept"
    mesh_vocabularySemanticType = baseMeshURI + "mesh_vocabulary:SemanticType"
    mesh_vocabularyTerm = baseMeshURI + "mesh_vocabulary:Term"
    mesh_vocabularyTreeNumber = baseMeshURI + "mesh_vocabulary:TreeNumber"

    # Standard vocabularies (Dublin Core, RDFS, SKOS, RDF, OWL).
    dc_identifier = "http://purl.org/dc/terms/identifier"
    dc_title = "http://purl.org/dc/terms/title"
    labelPredicateURI = "http://www.w3.org/2000/01/rdf-schema#label"
    broader = "http://www.w3.org/2004/02/skos/core#broader"
    related = "http://www.w3.org/2004/02/skos/core#related"
    subject = "http://purl.org/dc/terms/subject"
    isOfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    sameAs = "http://www.w3.org/2002/07/owl#sameAs"

    # TODO:
    # - allowable qualifier
    # - resource type
    # - link with dbpedia :
    #   <http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/meshId> "D001321"@en .

    def __init__(self):
        """Create a parser with an empty model and no known entities."""
        self.fileNumber = 0
        # initModel() bumps fileNumber, so the first temporary file is "_1".
        self.initModel()
        self.entitySet = set()

    def initModel(self):
        """Replace the current model with an empty one and advance fileNumber."""
        self.model = RDF.Model(RDF.MemoryStorage())
        self.fileNumber += 1

    def addStatementToModel(self, subject, predicate, object):
        """Append the triple (subject, predicate, object) to the current model."""
        self.model.append(RDF.Statement(subject, predicate, object))

    def _makeTypedResourceNode(self, entity, label, typeURI):
        """Get-or-create helper shared by the concept / semantic type / term makers.

        On first sight of ``entity`` the resource is fully described
        (identifiers, label, rdf:type ``typeURI``) and remembered in
        ``entitySet``; afterwards only a bare node for its URI is returned.
        """
        if entity in self.entitySet:
            return self.getResourceNode(entity)
        node = self.makeResourceNode(entity)
        self.addLabel(node, label)
        self.addStatementToModel(node, RDF.Uri(self.isOfType), RDF.Uri(typeURI))
        self.entitySet.add(entity)
        return node

    def makeDescriptorResourceNode(self, entity, label):
        """Describe a descriptor (identifiers, label, type, mesh-heading) and return its node.

        Unlike the other makers this one does not consult ``entitySet``: the
        description is added on every call.
        """
        descriptorNode = self.makeResourceNode(entity)
        # Descriptor name as rdfs:label.
        self.addLabel(descriptorNode, label)
        # rdf:type Descriptor.
        self.addStatementToModel(
            descriptorNode,
            RDF.Uri(self.isOfType),
            RDF.Uri(self.mesh_vocabularyDescriptor))
        # MeSH heading (title).
        self.addStatementToModel(
            descriptorNode,
            RDF.Uri(self.mesh_vocabularyMeshHeadings),
            self.makeLiteralProperty(label))
        return descriptorNode

    def makeConceptResourceNode(self, entity, label):
        """Return the node for concept ``entity``, describing it on first use."""
        return self._makeTypedResourceNode(entity, label, self.mesh_vocabularyConcept)

    def makeSemanticTypeNode(self, entity, label):
        """Return the node for semantic type ``entity``, describing it on first use."""
        return self._makeTypedResourceNode(entity, label, self.mesh_vocabularySemanticType)

    def makeTermResourceNode(self, entity, label):
        """Return the node for term ``entity``, describing it on first use."""
        return self._makeTypedResourceNode(entity, label, self.mesh_vocabularyTerm)

    def makeResourceNode(self, entity):
        """Create the node for ``entity`` and add its namespace / identifier statements."""
        resourceNode = RDF.Node(RDF.Uri(self.baseMeshURI + self.meshResource + entity))
        # Vocabulary namespace.
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.bio2rdf_vocabularyNamespace),
            self.makeLiteralProperty("mesh"))
        # Vocabulary identifier (bare UI) and Dublin Core identifier (prefixed UI).
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.bio2rdf_vocabularyIdentifier),
            self.makeLiteralProperty(entity))
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.dc_identifier),
            self.makeLiteralProperty("mesh:" + entity))
        return resourceNode

    def makeLiteralProperty(self, literal):
        """Wrap ``literal`` in an ``RDF.Node`` (no explicit datatype is attached)."""
        return RDF.Node(literal)

    def getResourceNode(self, identifier):
        """Return a node for the MeSH URI of ``identifier`` without adding any statement."""
        return RDF.Node(RDF.Uri(self.baseMeshURI + self.meshResource + identifier))

    def addLabel(self, resourceNode, label):
        """Add the lower-cased ``label`` as rdfs:label; link to DBpedia when it matches.

        A ``label`` without a ``lower`` method (typically ``None`` for an
        empty XML element) is skipped silently, as before. Only that lookup
        is guarded, so an AttributeError raised while building or storing the
        statements is no longer swallowed.
        """
        try:
            lowered = label.lower()
        except AttributeError:
            return
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.labelPredicateURI),
            self.makeLiteralProperty(lowered))
        # If DBpedia labels are available, try to match the MeSH label with one.
        if self.dbPediaLabel and lowered in self.dbPediaLabel:
            self.addSameAsRelationship(resourceNode, RDF.Uri(self.dbPediaLabel[lowered]))

    def addTreeNumber(self, resourceNode, treeNumber):
        """Attach ``treeNumber`` as a literal to ``resourceNode``; ``None`` is skipped."""
        # Explicit check instead of the former blanket "except AttributeError: pass".
        if treeNumber is None:
            return
        self.addStatementToModel(
            resourceNode,
            RDF.Uri(self.mesh_vocabularyTreeNumber),
            self.makeLiteralProperty(treeNumber))

    def addSubjectOfRelationship(self, startNode, endNode):
        """Add ``startNode dcterms:subject endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.subject), endNode)

    def addBroaderRelationship(self, startNode, endNode):
        """Add ``startNode skos:broader endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.broader), endNode)

    def addSameAsRelationship(self, startNode, endNode):
        """Add ``startNode owl:sameAs endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.sameAs), endNode)

    def addRelatedDescriptor(self, startNode, endNode):
        """Add ``startNode skos:related endNode``."""
        self.addStatementToModel(startNode, RDF.Uri(self.related), endNode)

    def addSameAsRelationshipBetweenTerms(self, resourcesNodes):
        """Link every pair of ``resourcesNodes`` with owl:sameAs in both directions.

        Each unordered pair is visited once (``combinations``) and both
        directions are added explicitly. Iterating over ``permutations``
        while also adding both directions emitted every statement twice.
        """
        for first, second in itertools.combinations(resourcesNodes, 2):
            self.addSameAsRelationship(first, second)
            self.addSameAsRelationship(second, first)

    def flushToFile(self, fileName):
        """Append the current model to ``fileName`` as N-Triples and start a new model.

        The model is first serialized to the temporary file
        ``fileName + "_" + str(fileNumber)``, which is then copied line by
        line onto the end of ``fileName`` and removed.
        """
        tempName = fileName + "_" + str(self.fileNumber)
        serializer = RDF.Serializer(name="ntriples")
        serializer.serialize_model_to_file(tempName, self.model)
        try:
            # Context managers close both files even if the copy fails; copying
            # line by line avoids holding the whole dump in memory.
            with open(tempName, 'r') as tempFile, open(fileName, 'a') as output:
                for line in tempFile:
                    output.write(line)
        finally:
            os.remove(tempName)
        self.initModel()
def createDBPediaLabelDict(dbpediaLabelFile):
    """Build a {lower-cased label: subject} dict from an N-Triples label file.

    The file is streamed with the Redland "ntriples" parser. A statement is
    kept when the fragment of its predicate (the text after the last "#") is
    "label"; the key is ``str(statement.object).lower()`` and the value is
    ``str(statement.subject)``. When several subjects share a label, the
    last one read wins.

    :param dbpediaLabelFile: path appended verbatim to "file://"
        (NOTE(review): this looks like it needs an absolute path to form a
        valid file URI; confirm with callers).
    :return: dict mapping lower-cased labels to subject strings.
    """
    dbpediaLabel = dict()
    startTime = datetime.now()
    parser = RDF.Parser(name="ntriples")
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Parse : " + 'file://' + dbpediaLabelFile)
    stream = parser.parse_as_stream('file://' + dbpediaLabelFile)
    # The counter used to stay at 1 forever, so the progress report below
    # never fired; enumerate() keeps it in step with the stream.
    for count, statement in enumerate(stream, 1):
        if str(statement.predicate).split("#")[-1] == "label":
            dbpediaLabel[str(statement.object).lower()] = str(statement.subject)
        if count % 100000 == 0:
            endTime = datetime.now()
            print(str(count) + " labels parsed in %s seconds" % str((endTime - startTime).seconds))
            startTime = endTime
    return dbpediaLabel
def _childElements(element):
    """Return the direct children of ``element`` as a list ([] when it is None)."""
    # Plain iteration works with both lxml and xml.etree.ElementTree, whereas
    # getchildren() is deprecated in lxml and was removed from xml.etree in
    # Python 3.9.
    return [] if element is None else list(element)


def _textOf(element):
    """Return ``element.text``, or None when the element itself is missing."""
    return None if element is None else element.text


def _firstChildText(element):
    """Return the text of the first child of ``element`` (None when unavailable)."""
    children = _childElements(element)
    return children[0].text if children else None


def _addRelatedDescriptors(meshParser, descriptorNode, relatedDescriptors):
    """Link ``descriptorNode`` to every descriptor referenced in its SeeRelatedList."""
    for reference in _childElements(relatedDescriptors):
        referredUI = reference.find("DescriptorReferredTo/DescriptorUI")
        if referredUI is None:
            continue
        meshParser.addRelatedDescriptor(
            descriptorNode, meshParser.getResourceNode(referredUI.text))


def _addSemanticTypes(meshParser, conceptNode, semanticTypeList):
    """Attach each semantic type of the concept as a dcterms:subject."""
    for semanticType in _childElements(semanticTypeList):
        semanticTypeUI = semanticType.find("SemanticTypeUI")
        if semanticTypeUI is None:
            continue
        semanticTypeNode = meshParser.makeSemanticTypeNode(
            semanticTypeUI.text, _textOf(semanticType.find("SemanticTypeName")))
        meshParser.addSubjectOfRelationship(conceptNode, semanticTypeNode)


def _addConceptRelations(meshParser, conceptRelationList):
    """Translate NRW / BRD / REL concept relations into broader / related links."""
    for relation in _childElements(conceptRelationList):
        relationName = relation.attrib.get("RelationName")
        if relationName not in ("NRW", "BRD", "REL"):
            continue
        concept1UI = relation.find("Concept1UI")
        concept2UI = relation.find("Concept2UI")
        if concept1UI is None or concept2UI is None:
            continue
        concept1 = meshParser.getResourceNode(concept1UI.text)
        concept2 = meshParser.getResourceNode(concept2UI.text)
        if relationName == "NRW":
            # NRW: concept 2 gets concept 1 as its broader concept.
            meshParser.addBroaderRelationship(concept2, concept1)
        elif relationName == "BRD":
            # BRD: concept 1 gets concept 2 as its broader concept.
            meshParser.addBroaderRelationship(concept1, concept2)
        else:
            meshParser.addRelatedDescriptor(concept1, concept2)


def _addConceptTerms(meshParser, conceptNode, termList):
    """Create the concept's term nodes and sameAs-link them to the concept and each other."""
    termNodes = []
    for term in _childElements(termList):
        termUI = term.find("TermUI")
        if termUI is None:
            continue
        # Get or create the term node, then tie it to its concept.
        termNode = meshParser.makeTermResourceNode(termUI.text, _textOf(term.find("String")))
        meshParser.addSameAsRelationship(termNode, conceptNode)
        termNodes.append(termNode)
    # Terms of the same concept are synonyms of one another.
    meshParser.addSameAsRelationshipBetweenTerms(termNodes)


def _addConcept(meshParser, descriptorNode, concept):
    """Add one Concept element (node, semantic types, relations, terms) under a descriptor."""
    conceptUI = concept.find("ConceptUI")
    if conceptUI is None:
        return
    # Get or create the concept node and hang it under its descriptor.
    conceptNode = meshParser.makeConceptResourceNode(
        conceptUI.text, _firstChildText(concept.find("ConceptName")))
    meshParser.addBroaderRelationship(conceptNode, descriptorNode)
    _addSemanticTypes(meshParser, conceptNode, concept.find("SemanticTypeList"))
    _addConceptRelations(meshParser, concept.find("ConceptRelationList"))
    _addConceptTerms(meshParser, conceptNode, concept.find("TermList"))


def main(xmlFile, outputFile, dbpediaLabelFile=None):
    """Convert the MeSH descriptor XML dump ``xmlFile`` to N-Triples in ``outputFile``.

    The whole XML document is parsed up front; every child of the root is
    treated as a descriptor record. The model is flushed (appended) to
    ``outputFile`` after every 1000 processed descriptors and once more at
    the end.

    Records without a DescriptorUI or without a descriptor name are skipped.
    Missing optional lists (SeeRelatedList, TreeNumberList, ConceptList,
    SemanticTypeList, ConceptRelationList, TermList) are treated as empty
    instead of raising AttributeError.

    :param xmlFile: path of the MeSH descriptor XML file.
    :param outputFile: path of the N-Triples file to append to.
    :param dbpediaLabelFile: optional N-Triples label file; when given,
        resources whose label matches get an owl:sameAs link.
    """
    meshParser = MeshXMLParser()
    if dbpediaLabelFile is not None:
        meshParser.dbPediaLabel = createDBPediaLabelDict(dbpediaLabelFile)
        print("Taking DBpedia label into account...")

    startTime = datetime.now()
    print("Parsing XML file %s..." % (xmlFile))
    tree = etree.parse(xmlFile)
    print("Done.")
    print("Iterating over the model")

    processed = 0
    for descriptorRecord in tree.getroot():
        descriptorUI = descriptorRecord.find("DescriptorUI")
        descriptorName = _firstChildText(descriptorRecord.find("DescriptorName"))
        if descriptorUI is None or descriptorName is None:
            continue

        # 1) Create the descriptor node.
        descriptorNode = meshParser.makeDescriptorResourceNode(descriptorUI.text, descriptorName)
        # 2) Related descriptors.
        _addRelatedDescriptors(meshParser, descriptorNode, descriptorRecord.find("SeeRelatedList"))
        # 3) Tree numbers.
        for treeNumber in _childElements(descriptorRecord.find("TreeNumberList")):
            meshParser.addTreeNumber(descriptorNode, treeNumber.text)
        # 4) Concepts with their semantic types, relations and terms.
        for concept in _childElements(descriptorRecord.find("ConceptList")):
            _addConcept(meshParser, descriptorNode, concept)

        # Count only records that were actually processed, and report/flush
        # right after the Nth one so the message matches the work done and a
        # skipped record cannot trigger the same flush twice.
        processed += 1
        if processed % 1000 == 0:
            endTime = datetime.now()
            print(str(processed) + " descriptors parsed in %s seconds" % str((endTime - startTime).seconds))
            startTime = endTime
            meshParser.flushToFile(outputFile)

    print("Done.")
    print("Saving n-triple to file...")
    meshParser.flushToFile(outputFile)
    print("Done.")
def help():
    """Print the command-line usage text followed by author information."""
    usageLines = (
        "Usage: python meshXML2RDF.py [desc2014.xml] [output rdf file] [(optional) DBPedia label.ttl]",
        "",
        "This script parse the MESH xml dump file and turn it into an rdf triple file following partially the bio2rdf convention.",
        "If DBpedia label are provided, this script add a 'sameAs' relationship between mesh terms and DBPedia entity.",
        "Please go to http://bio2rdf.org/ for more information",
        "Author : Kevin Francoisse (https://github.com/kfrancoi)",
        "Company : Sagacify (http://sagacify.com)",
    )
    # One line per print call; single-argument print(...) behaves the same on
    # Python 2 and Python 3.
    for usageLine in usageLines:
        print(usageLine)
if __name__ == '__main__':
    # Command-line entry point: <xml file> <output file> [<DBpedia label file>].
    # With fewer than two arguments the usage text is printed instead.
    if len(sys.argv) < 3:
        help()
    else:
        print("Input : " + sys.argv[1])
        print("Output :" + sys.argv[2])
        # ">= 4" rather than "== 4": a DBpedia label file used to be silently
        # ignored whenever any extra argument followed it.
        if len(sys.argv) >= 4:
            print("DBpedia label : " + sys.argv[3])
            main(sys.argv[1], sys.argv[2], sys.argv[3])
        else:
            main(sys.argv[1], sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nice!!!! i'll have to install python and try it out :)