Last active
March 27, 2022 14:10
-
-
Save akirayou/243fee1ca02bc671e6f19f69af2630b2 to your computer and use it in GitHub Desktop.
NBDC版日化辞RDFから欲しいものだけをダンプする時の書き方のテスト。tar.gzの中のファイルは1~222まで分かれているので、その番号ごとに処理すれば大丈夫っぽい? https://dbarchive.biosciencedbc.jp/jp/nikkaji/desc.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 27 11:47:20 2022
@author: youak
"""
from rdflib import Graph | |
import tarfile | |
import gzip | |
# Open the three NBDC Nikkaji RDF archives, keyed by the part of the data set
# each one holds. Mode "r" lets tarfile detect the gzip compression itself.
tf = {
    "main": tarfile.open("NBDC_NikkajiRDF_main.tar.gz", "r"),
    "inchi": tarfile.open("NBDC_NikkajiRDF_InChI.tar.gz", "r"),
    "smiles": tarfile.open("NBDC_NikkajiRDF_SMILES.tar.gz", "r"),
}
# Key view used to walk the archives in one fixed order everywhere below.
K = tf.keys()
def tf_elm2graph(tf, e):
    """Parse one gzip-compressed member of a tar archive into an rdflib Graph.

    Args:
        tf: An open ``tarfile.TarFile``.
        e: The member to read (a ``TarInfo`` or a member name). Its content is
            decompressed with gzip and decoded as UTF-8 text.

    Returns:
        A new ``Graph`` holding the triples parsed from the member. No format
        is passed to ``Graph.parse``, so rdflib picks the parser itself.

    Raises:
        TypeError: If ``e`` is not a regular file (``extractfile`` returned
            ``None``, e.g. for a directory entry).
    """
    raw = tf.extractfile(e)
    if raw is None:
        # gzip.open(None) would fail with an opaque message; say what happened.
        raise TypeError(
            f"tar member {getattr(e, 'name', e)!r} is not a regular file"
        )
    g = Graph()
    # Close both the tar member stream and the gzip text wrapper once parsing
    # is done; this function is called once per member of every archive.
    with raw, gzip.open(raw, mode="rt", encoding="utf-8") as text:
        g.parse(text)
    return g
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS
from rdflib.term import URIRef

# Predicate whose object is printed as the InChI / InChIKey / SMILES string
# (SIO_000300, "has value" in the Semanticscience Integrated Ontology).
SIO_HAS_VALUE = URIRef("http://semanticscience.org/resource/SIO_000300")


def _print_labelled_values(graph, label, tag):
    """Print ``tag value`` for every node in *graph* whose rdfs:label is *label*.

    Dereferencing the links from the main graph is a hassle, so the node is
    located through the label naming convention instead.

    Returns:
        True if at least one value was printed, False otherwise.
    """
    printed = False
    # A compound may have no such node at all, hence a loop rather than a
    # single lookup.
    for node, _, _ in graph.triples((None, RDFS.label, label)):
        value = graph.value(node, SIO_HAS_VALUE)
        if value is None:
            # Labelled node without a value: nothing to print.
            continue
        print(tag, value)
        printed = True
    return printed


try:
    # The archives are walked in lockstep: the n-th member of each archive is
    # assumed to describe the same slice of compounds. zip() stops at the
    # shortest archive.
    for es in zip(*[tf[k] for k in K]):
        g = {k: tf_elm2graph(tf[k], member) for k, member in zip(K, es)}
        main_graph = g["main"]

        for s, _, _ in main_graph.triples((None, RDF.type, None)):
            ID = main_graph.value(s, DCTERMS.identifier)
            if ID is None:
                # Typed node without an identifier: skip it rather than
                # aborting the whole dump with StopIteration.
                continue
            print("Nikkaji_ID", ID)

            for o in main_graph.objects(s, RDFS.label):
                print("Label", o)
            for o in main_graph.objects(s, SKOS.altLabel):
                print("altLabel", o)

            _print_labelled_values(g["inchi"], ID + "_standard_InChI", "InChI")
            _print_labelled_values(
                g["inchi"], ID + "_standard_InChIKey", "InChIKey"
            )
            has_smiles = _print_labelled_values(
                g["smiles"], ID + "_canonical_SMILES", "canonical_SMILES"
            )
            if not has_smiles:
                print("NoSMILES==========================================================================")
            # Blank line between compound records.
            print()
finally:
    # Release the archive file handles whether or not the dump completed.
    for archive in tf.values():
        archive.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment