jronallo/get_and_process_webdatacommons_data.sh

## get_and_process_webdatacommons_data.sh
#!/usr/bin/env bash
# Downloads the Web Data Commons 2012-08 microdata NQuads, keeps only the lines
# that mention schema.org/Book, and builds a per-host CSV report from them.
# These steps will take a long time to download the data set.

# First, get the list of available NQuad files to download.
wget http://webdatacommons.org/2012-08/stats/files.list

# We're only interested in the microdata set right now since that seems to be
# where schema.org/Book is used more. So create a file list.
grep html-microdata files.list > microdata_files.list

# OK, this will take a while depending on your connection. Let it run overnight.
wget -i microdata_files.list

# Gunzip all the compressed files.
gunzip *gz

# Use grep to filter through all the NQuads and select only those that have
# schema.org/Book on the line. This can be changed to get any type.
#   -h  keeps file names out of the output when several files are searched
#   -F  matches the text literally, so the "." is not a regex wildcard
# The output file is overwritten (not appended to) so that running this step
# again does not duplicate quads and inflate the counts in the report.
grep -hF schema.org/Book html-microdata* > microdata_books_nquads.nq

# Then run the script to create the report. It is invoked by bare name, so it
# has to be executable and on your PATH.
nquad_context_count_per_host.rb microdata_books_nquads.nq

# Open the CSV file with the appropriate program. This works on Ubuntu.
xdg-open microdata_books_nquads.csv

## nquad_context_count_per_host.rb
#! /usr/bin/env ruby

# Counts statements and lists properties for hosts in an NQuads file.
#
# Usage: pass the path of the NQuads file as the first argument.
#
# Writes "<basename of the input>.csv" into the current directory with one row
# per context host: the host, the number of statements seen for it, and the
# sorted, space-separated list of distinct predicates (with the first
# 'http://schema.org/' in each predicate removed). Rows are ordered by
# statement count, highest first.

require 'rdf'
require 'rdf/nquads'
require 'csv'
require 'set'

filename = ARGV[0]
# Without this guard a missing argument surfaces as a TypeError from
# File.extname(nil), which says nothing about how to call the script.
abort "Usage: #{File.basename($PROGRAM_NAME)} FILE.nq" if filename.nil?

extension = File.extname(filename)
basename = File.basename(filename, extension)
schema_prefix = 'http://schema.org/'

# host => { 'count' => number of statements, 'props' => distinct predicates }
contexts = Hash.new do |hash, host|
  hash[host] = { 'count' => 0, 'props' => Set.new }
end

RDF::NQuads::Reader.open(filename) do |reader|
  reader.each_statement do |statement|
    context = statement.context
    # Statements without a context cannot be attributed to any host.
    next if context.nil?

    # NOTE(review): assumes every context responds to #host (i.e. is a URI);
    # confirm the input never uses blank-node contexts.
    entry = contexts[context.host]
    entry['count'] += 1
    # The Set keeps predicates distinct on insert, so the collected list does
    # not have to be re-scanned with uniq! after every single statement.
    entry['props'] << statement.predicate.to_s.sub(schema_prefix, '')
  end
end

sorted_contexts = contexts.sort_by { |_host, data| data['count'] }.reverse

CSV.open("#{basename}.csv", 'wb') do |csv|
  csv << %w(host count properties)
  sorted_contexts.each do |host, data|
    csv << [host, data['count'], data['props'].sort.join(' ')]
  end
end
	#!/usr/bin/env bash
	# Downloads the Web Data Commons 2012-08 microdata NQuads, keeps only the lines
	# that mention schema.org/Book, and builds a per-host CSV report from them.
	# These steps will take a long time to download the data set.

	# First, get the list of available NQuad files to download.
	wget http://webdatacommons.org/2012-08/stats/files.list

	# We're only interested in the microdata set right now since that seems to be
	# where schema.org/Book is used more. So create a file list.
	grep html-microdata files.list > microdata_files.list

	# OK, this will take a while depending on your connection. Let it run overnight.
	wget -i microdata_files.list

	# Gunzip all the compressed files.
	gunzip *gz

	# Use grep to filter through all the NQuads and select only those that have
	# schema.org/Book on the line. This can be changed to get any type.
	#   -h  keeps file names out of the output when several files are searched
	#   -F  matches the text literally, so the "." is not a regex wildcard
	# The output file is overwritten (not appended to) so that running this step
	# again does not duplicate quads and inflate the counts in the report.
	grep -hF schema.org/Book html-microdata* > microdata_books_nquads.nq

	# Then run the script to create the report. It is invoked by bare name, so it
	# has to be executable and on your PATH.
	nquad_context_count_per_host.rb microdata_books_nquads.nq

	# Open the CSV file with the appropriate program. This works on Ubuntu.
	xdg-open microdata_books_nquads.csv
	#! /usr/bin/env ruby

	# Counts statements and lists properties for hosts in an NQuads file.
	#
	# Usage: pass the path of the NQuads file as the first argument.
	#
	# Writes "<basename of the input>.csv" into the current directory with one row
	# per context host: the host, the number of statements seen for it, and the
	# sorted, space-separated list of distinct predicates (with the first
	# 'http://schema.org/' in each predicate removed). Rows are ordered by
	# statement count, highest first.

	require 'rdf'
	require 'rdf/nquads'
	require 'csv'
	require 'set'

	filename = ARGV[0]
	# Without this guard a missing argument surfaces as a TypeError from
	# File.extname(nil), which says nothing about how to call the script.
	abort "Usage: #{File.basename($PROGRAM_NAME)} FILE.nq" if filename.nil?

	extension = File.extname(filename)
	basename = File.basename(filename, extension)
	schema_prefix = 'http://schema.org/'

	# host => { 'count' => number of statements, 'props' => distinct predicates }
	contexts = Hash.new do |hash, host|
	  hash[host] = { 'count' => 0, 'props' => Set.new }
	end

	RDF::NQuads::Reader.open(filename) do |reader|
	  reader.each_statement do |statement|
	    context = statement.context
	    # Statements without a context cannot be attributed to any host.
	    next if context.nil?

	    # NOTE(review): assumes every context responds to #host (i.e. is a URI);
	    # confirm the input never uses blank-node contexts.
	    entry = contexts[context.host]
	    entry['count'] += 1
	    # The Set keeps predicates distinct on insert, so the collected list does
	    # not have to be re-scanned with uniq! after every single statement.
	    entry['props'] << statement.predicate.to_s.sub(schema_prefix, '')
	  end
	end

	sorted_contexts = contexts.sort_by { |_host, data| data['count'] }.reverse

	CSV.open("#{basename}.csv", 'wb') do |csv|
	  csv << %w(host count properties)
	  sorted_contexts.each do |host, data|
	    csv << [host, data['count'], data['props'].sort.join(' ')]
	  end
	end