Skip to content

Instantly share code, notes, and snippets.

@takehiko
Created April 27, 2015 20:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takehiko/3b08b36dc5cb5d81bf69 to your computer and use it in GitHub Desktop.
Save takehiko/3b08b36dc5cb5d81bf69 to your computer and use it in GitHub Desktop.
ISIL extractor
#!/usr/bin/env ruby
# isilx.rb : ISIL extractor
# by takehikom
# see also: http://www.ndl.go.jp/jp/aboutus/standards/opendataset.html
require 'rexml/document'
# Extracts library records, keyed by ISIL identifier, for one locality from
# an RDF/XML file ("isilpublic.rdf" or "isilnonpublic.rdf").
#
# The XPath expressions below use the prefixes rdf:, schema:, org:, dcterms:,
# geo:, gn: and owl: without passing a namespace mapping to REXML, so they
# presumably rely on the document declaring those prefixes itself.
class IsilExtractor
  # Locality matched against schema:addressLocality when none is given.
  DEFAULT_LOCALITY = "和歌山市".freeze

  # @param opt_nonpublic [Object] truthy to read "isilnonpublic" instead of
  #   "isilpublic"
  # @param locality [String] value that schema:addressLocality must equal for
  #   an address to be extracted
  def initialize(opt_nonpublic = false, locality = DEFAULT_LOCALITY)
    @isil = {}
    @nonpublic = opt_nonpublic
    @locality = locality
  end

  # @return [Hash{String => Hash}] extracted records keyed by their "id_isil"
  #   value; empty until #start has run
  attr_reader :isil

  # Opens "<basename>.rdf" for reading. When that file does not exist, first
  # runs wget on "<basename>_lod.zip" and then unzip, echoing each command.
  #
  # The exit status of the commands is not checked; if the .rdf file is still
  # missing afterwards, File.open raises Errno::ENOENT.
  #
  # @param basename [String] file name without the ".rdf" extension
  # @return [File] open read handle; the caller is responsible for closing it
  def open_rdf(basename = "isilpublic")
    rdffile = "#{basename}.rdf"
    unless File.file?(rdffile)
      zipfile = "#{basename}_lod.zip"
      url = "http://www.ndl.go.jp/jp/aboutus/standards/opendataset/#{zipfile}"
      [["wget", url], ["unzip", zipfile]].each do |argv|
        puts argv.join(" ")
        # Argument-vector form: no shell is involved, so the file name is
        # never interpreted as shell syntax.
        system(*argv)
      end
    end
    puts "==== read #{rdffile} ====" if $DEBUG
    # File.open rather than Kernel#open, which treats a leading "|" as a
    # command to execute.
    File.open(rdffile)
  end

  # Parses the RDF file, collects one record per address node whose
  # schema:addressLocality equals the configured locality, stores the records
  # in #isil and prints each as "<isil id>: <record.inspect>".
  #
  # @raise [RuntimeError] when an address or site node is not referenced by
  #   any parent node, or the owning node has no rdf:type child
  # @return [Hash] the same hash as #isil
  def start
    rdf = open_rdf(@nonpublic ? "isilnonpublic" : "isilpublic")
    begin
      @doc = REXML::Document.new(rdf)
    ensure
      # The whole tree is built by Document.new, so the handle is no longer
      # needed; close it instead of leaking it.
      rdf.close
    end

    address_path =
      "/rdf:RDF/rdf:Description[schema:addressLocality=#{xpath_literal(@locality)}]"
    REXML::XPath.each(@doc, address_path) do |address|
      record = build_record(address)
      @isil[record["id_isil"]] = record
      puts "==== record ====", record.inspect if $DEBUG
    end

    puts "==== result ====" if $DEBUG
    @isil.each { |id, record| puts "#{id}: #{record.inspect}" }
  end

  # Returns the text of the first node matching +path+ relative to +e+.
  #
  # @param e [REXML::Element, REXML::Document] context node
  # @param path [String] XPath expression
  # @return [String] the node's text; "" when nothing matches or when the
  #   matched element has no text (REXML reports that as nil)
  def find_text(e, path)
    found = REXML::XPath.first(e, path)
    found ? found.text.to_s : ""
  end

  private

  # Builds the record hash for one address node by walking
  # address -> site (via org:siteAddress) -> owner (via org:hasSite).
  def build_record(address)
    puts "==== find e1 ====", address if $DEBUG
    address_id = address.attributes["rdf:nodeID"]

    site = referencing_parent("org:siteAddress", address_id)
    puts "==== find e3 ====", site if $DEBUG
    site_id = site.attributes["rdf:nodeID"]

    owner = referencing_parent("org:hasSite", site_id)
    puts "==== find e5 ====", owner if $DEBUG

    organization = typed_description(owner.attributes["rdf:about"])
    puts "==== find e7 ====", organization if $DEBUG

    record = Hash.new("")
    record["id_isil"] = find_text(organization, ".//dcterms:identifier[@rdf:datatype='http://ndl.go.jp/dcndl/terms/ISIL']")
    record["id_ndllibcode"] = find_text(organization, ".//dcterms:identifier[@rdf:datatype='http://ndl.go.jp/dcndl/terms/NDLLibCode']")
    record.delete("id_ndllibcode") if record["id_ndllibcode"].empty?
    record["name_ja"] = find_text(organization, ".//schema:name[@xml:lang='ja']")
    record["name_kana"] = find_text(organization, ".//schema:name[@xml:lang='ja-kana']")
    record["name_en"] = find_text(organization, ".//schema:name[@xml:lang='en']")
    # NOTE(review): "get_long"/"get_lat" look like typos for "geo_*"; kept
    # unchanged because the keys are part of the printed output.
    record["get_long"] = find_text(site, ".//geo:long")
    record["get_lat"] = find_text(site, ".//geo:lat")
    # record["telephone"] = find_text(site, ".//schema:telephone")
    # record["faxnumber"] = find_text(site, ".//schema:faxNumber")
    record["postalcode"] = find_text(address, ".//schema:postalCode")
    record["country"] = find_text(address, ".//schema:addressCountry")
    record["countrycode"] = find_text(address, ".//gn:countryCode")
    record["region"] = find_text(address, ".//schema:addressRegion")
    record["locality"] = find_text(address, ".//schema:addressLocality")
    record["street_address"] = find_text(address, ".//schema:streetAddress")
    record["same_resources"] = REXML::XPath.match(organization, ".//owl:sameAs").map { |e| e.attributes["rdf:resource"] }
    record["node_ids"] = [site_id, address_id]
    record
  end

  # Returns the rdf:Description that has a +property+ child whose rdf:nodeID
  # attribute equals +node_id+ (first match in the document).
  def referencing_parent(property, node_id)
    reference = REXML::XPath.first(@doc, "/rdf:RDF/rdf:Description/#{property}[@rdf:nodeID='#{node_id}']")
    raise "no #{property} element refers to rdf:nodeID #{node_id.inspect}" if reference.nil?

    reference.parent
  end

  # Returns the first rdf:Description with the given rdf:about value that
  # also has an rdf:type child.
  def typed_description(rdf_about)
    type = REXML::XPath.first(@doc, "/rdf:RDF/rdf:Description[@rdf:about='#{rdf_about}']/rdf:type")
    raise "no rdf:type found for rdf:about #{rdf_about.inspect}" if type.nil?

    type.parent
  end

  # Quotes +value+ as an XPath string literal. XPath 1.0 has no escape
  # syntax, so the quote character not present in the value is used.
  def xpath_literal(value)
    text = value.to_s
    return "'#{text}'" unless text.include?("'")
    return "\"#{text}\"" unless text.include?('"')

    raise ArgumentError, "cannot quote #{text.inspect} for XPath: it contains both quote characters"
  end
end
# Script entry point: runs only when this file is executed directly.
# A first command-line argument beginning with "no" selects the non-public
# data set; anything else (or no argument) selects the public one.
if $0 == __FILE__
  nonpublic = /^no/ =~ ARGV.first
  IsilExtractor.new(nonpublic).start
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment