Parse the microdata from Web Data Commons and output a little information.
#!/usr/bin/env ruby
# ccwd_microdata.rb /path/to/ccrdf.html-microdata.nq
#
# Reads the N-Quads file named by the first command-line argument and prints
# two summaries: the distinct hosts of the statements' graph names (contexts)
# and the distinct predicate URIs, each preceded by its count.
require 'pp'
require 'rubygems'
require 'set'
require 'rdf'
require 'rdf/nquads'

# A missing argument is a usage error: report it on stderr and exit non-zero.
abort "must supply path to nquads file" unless ARGV[0]

# Sets de-duplicate while reading, so memory grows with the number of distinct
# values rather than with the number of statements in the file.
domains = Set.new
predicates = Set.new

RDF::NQuads::Reader.open(ARGV[0]) do |reader|
  reader.each_statement do |statement|
    # RDF.rb 2.0 renamed Statement#context to #graph_name; accept either so the
    # script runs against old and new versions of the gem.
    graph = statement.respond_to?(:graph_name) ? statement.graph_name : statement.context

    # Graph names that are not URIs (e.g. blank nodes) do not respond to #host,
    # and a URI without an authority reports a nil host; skip both so the
    # sort below never has to compare nil with a String.
    host = graph.respond_to?(:host) ? graph.host : nil
    domains << host if host

    predicates << statement.predicate.to_s
  end
end

puts "CONTEXT DOMAINS: #{domains.length}"
pp domains.sort
puts "===========================\nPREDICATES: #{predicates.length}"
pp predicates.sort
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment